diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..1eac0577 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-08-10T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.05725v1","updated":"2023-08-10T17:41:19Z","published":"2023-08-10T17:41:19Z","title":"EXPRESSO: A Benchmark and Analysis of Discrete Expressive Speech\n Resynthesis","summary":" Recent work has shown that it is possible to resynthesize high-quality speech\nbased, not on text, but on low bitrate discrete units that have been learned in\na self-supervised fashion and can therefore capture expressive aspects of\nspeech that are hard to transcribe (prosody, voice styles, non-verbal\nvocalization). The adoption of these methods is still limited by the fact that\nmost speech synthesis datasets are read, severely limiting spontaneity and\nexpressivity. Here, we introduce Expresso, a high-quality expressive speech\ndataset for textless speech synthesis that includes both read speech and\nimprovised dialogues rendered in 26 spontaneous expressive styles. We\nillustrate the challenges and potentials of this dataset with an expressive\nresynthesis benchmark where the task is to encode the input in low-bitrate\nunits and resynthesize it in a target voice while preserving content and style.\nWe evaluate resynthesis quality with automatic metrics for different\nself-supervised discrete encoders, and explore tradeoffs between quality,\nbitrate and invariance to speaker and style. All the dataset, evaluation\nmetrics and baseline models are open source\n","authors":["Tu Anh Nguyen","Wei-Ning Hsu","Antony D'Avirro","Bowen Shi","Itai Gat","Maryam Fazel-Zarani","Tal Remez","Jade Copet","Gabriel Synnaeve","Michael Hassid","Felix Kreuk","Yossi Adi","Emmanuel Dupoux"],"pdf_url":"https://arxiv.org/pdf/2308.05725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05696v1","updated":"2023-08-10T16:58:51Z","published":"2023-08-10T16:58:51Z","title":"A Preliminary Study of the Intrinsic Relationship between Complexity and\n Alignment","summary":" Training large language models (LLMs) with open-domain instruction data has\nyielded remarkable success in aligning to end tasks and user preferences.\nExtensive research has highlighted that enhancing the quality and diversity of\ninstruction data consistently improves performance. However, the impact of data\ncomplexity, as a crucial metric, remains relatively unexplored in three\naspects: (1) scaling law, where the sustainability of performance improvements\nwith increasing complexity is uncertain, (2) additional tokens, whether the\nimprovement brought by complexity comes from introducing more training tokens,\nand (3) curriculum tuning, where the potential advantages of incorporating\ninstructions ranging from easy to difficult are not yet fully understood. In\nthis paper, we propose \\textit{tree-instruct} to systematically enhance the\ncomplexity of instruction data in a controllable manner. This approach adds a\nspecified number of nodes into the instruction semantic tree, yielding new\ninstruction data based on the modified tree. By adjusting the number of added\nnodes, we can control the difficulty level in the modified instruction data.\nOur preliminary experiments reveal the following insights: (1) Increasing\ncomplexity consistently leads to sustained performance improvements. 
For\ninstance, using 1,000 instruction data and 10 nodes resulted in a substantial\n24\\% increase in win rate. (2) Under the same token budget, a few complex\ninstructions outperform diverse yet simple instructions. (3) Curriculum\ninstruction tuning might not yield the anticipated results; focusing on\nincreasing complexity appears to be the key.\n","authors":["Yingxiu Zhao","Bowen Yu","Binyuan Hui","Haiyang Yu","Fei Huang","Yongbin Li","Nevin L. Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.05696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14679v2","updated":"2023-08-10T16:46:35Z","published":"2023-02-28T15:42:30Z","title":"Synthesizing Mixed-type Electronic Health Records using Diffusion Models","summary":" Electronic Health Records (EHRs) contain sensitive patient information, which\npresents privacy concerns when sharing such data. Synthetic data generation is\na promising solution to mitigate these risks, often relying on deep generative\nmodels such as Generative Adversarial Networks (GANs). However, recent studies\nhave shown that diffusion models offer several advantages over GANs, such as\ngeneration of more realistic synthetic data and stable training in generating\ndata modalities, including image, text, and sound. In this work, we investigate\nthe potential of diffusion models for generating realistic mixed-type tabular\nEHRs, comparing TabDDPM model with existing methods on four datasets in terms\nof data quality, utility, privacy, and augmentation. Our experiments\ndemonstrate that TabDDPM outperforms the state-of-the-art models across all\nevaluation metrics, except for privacy, which confirms the trade-off between\nprivacy and utility.\n","authors":["Taha Ceritli","Ghadeer O. Ghosheh","Vinod Kumar Chauhan","Tingting Zhu","Andrew P. Creagh","David A. Clifton"],"pdf_url":"https://arxiv.org/pdf/2302.14679v2.pdf","comment":"Page 2, Figure 1 is updated"},{"id":"http://arxiv.org/abs/2308.05680v1","updated":"2023-08-10T16:33:17Z","published":"2023-08-10T16:33:17Z","title":"Finding Already Debunked Narratives via Multistage Retrieval: Enabling\n Cross-Lingual, Cross-Dataset and Zero-Shot Learning","summary":" The task of retrieving already debunked narratives aims to detect stories\nthat have already been fact-checked. The successful detection of claims that\nhave already been debunked not only reduces the manual efforts of professional\nfact-checkers but can also contribute to slowing the spread of misinformation.\nMainly due to the lack of readily available data, this is an understudied\nproblem, particularly when considering the cross-lingual task, i.e. the\nretrieval of fact-checking articles in a language different from the language\nof the online post being checked. This paper fills this gap by (i) creating a\nnovel dataset to enable research on cross-lingual retrieval of already debunked\nnarratives, using tweets as queries to a database of fact-checking articles;\n(ii) presenting an extensive experiment to benchmark fine-tuned and\noff-the-shelf multilingual pre-trained Transformer models for this task; and\n(iii) proposing a novel multistage framework that divides this cross-lingual\ndebunk retrieval task into refinement and re-ranking stages. Results show that\nthe task of cross-lingual retrieval of already debunked narratives is\nchallenging and off-the-shelf Transformer models fail to outperform a strong\nlexical-based baseline (BM25). 
Nevertheless, our multistage retrieval framework\nis robust, outperforming BM25 in most scenarios and enabling cross-domain and\nzero-shot learning, without significantly harming the model's performance.\n","authors":["Iknoor Singh","Carolina Scarton","Xingyi Song","Kalina Bontcheva"],"pdf_url":"https://arxiv.org/pdf/2308.05680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05646v1","updated":"2023-08-10T15:43:46Z","published":"2023-08-10T15:43:46Z","title":"AST-MHSA : Code Summarization using Multi-Head Self-Attention","summary":" Code summarization aims to generate concise natural language descriptions for\nsource code. The prevailing approaches adopt transformer-based encoder-decoder\narchitectures, where the Abstract Syntax Tree (AST) of the source code is\nutilized for encoding structural information. However, ASTs are much longer\nthan the corresponding source code, and existing methods ignore this size\nconstraint by directly feeding the entire linearized AST into the encoders.\nThis simplistic approach makes it challenging to extract truly valuable\ndependency relations from the overlong input sequence and leads to significant\ncomputational overhead due to self-attention applied to all nodes in the AST.\n To address this issue effectively and efficiently, we present a model,\nAST-MHSA that uses multi-head attention to extract the important semantic\ninformation from the AST. The model consists of two main components: an encoder\nand a decoder. The encoder takes as input the abstract syntax tree (AST) of the\ncode and generates a sequence of hidden states. The decoder then takes these\nhidden states as input and generates a natural language summary of the code.\n The multi-head attention mechanism allows the model to learn different\nrepresentations of the input code, which can be combined to generate a more\ncomprehensive summary. The model is trained on a dataset of code and summaries,\nand the parameters of the model are optimized to minimize the loss between the\ngenerated summaries and the ground-truth summaries.\n","authors":["Yeshwanth Nagaraj","Ujjwal Gupta"],"pdf_url":"https://arxiv.org/pdf/2308.05646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.02399v3","updated":"2023-08-10T15:31:54Z","published":"2021-12-04T18:34:24Z","title":"VT-CLIP: Enhancing Vision-Language Models with Visual-guided Texts","summary":" Contrastive Language-Image Pre-training (CLIP) has drawn increasing attention\nrecently for its transferable visual representation learning. However, due to\nthe semantic gap within datasets, CLIP's pre-trained image-text alignment\nbecomes sub-optimal on downstream tasks, which severely harms its transferring\nperformance. To better adapt the cross-modality embedding space, we propose to\nenhance CLIP via Visual-guided Texts, named VT-CLIP. Specifically, we guide\ntextual features of different categories to adaptively explore informative\nregions on the image and aggregate visual features by attention mechanisms. In\nthis way, the texts become visual-guided, namely, more semantically correlated\nwith downstream images, which greatly benefits the category-wise matching\nprocess. 
In few-shot settings, we evaluate our VT-CLIP on 11 well-known\nclassification datasets to demonstrate its effectiveness.\n","authors":["Longtian Qiu","Renrui Zhang","Ziyu Guo","Ziyao Zeng","Zilu Guo","Yafeng Li","Guangnan Zhang"],"pdf_url":"https://arxiv.org/pdf/2112.02399v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05633v1","updated":"2023-08-10T15:22:11Z","published":"2023-08-10T15:22:11Z","title":"IIHT: Medical Report Generation with Image-to-Indicator Hierarchical\n Transformer","summary":" Automated medical report generation has become increasingly important in\nmedical analysis. It can produce computer-aided diagnosis descriptions and thus\nsignificantly alleviate the doctors' work. Inspired by the huge success of\nneural machine translation and image captioning, various deep learning methods\nhave been proposed for medical report generation. However, due to the inherent\nproperties of medical data, including data imbalance and the length and\ncorrelation between report sequences, the generated reports by existing methods\nmay exhibit linguistic fluency but lack adequate clinical accuracy. In this\nwork, we propose an image-to-indicator hierarchical transformer (IIHT)\nframework for medical report generation. It consists of three modules, i.e., a\nclassifier module, an indicator expansion module and a generator module. The\nclassifier module first extracts image features from the input medical images\nand produces disease-related indicators with their corresponding states. The\ndisease-related indicators are subsequently utilised as input for the indicator\nexpansion module, incorporating the \"data-text-data\" strategy. The\ntransformer-based generator then leverages these extracted features along with\nimage features as auxiliary information to generate final reports. Furthermore,\nthe proposed IIHT method is feasible for radiologists to modify disease\nindicators in real-world scenarios and integrate the operations into the\nindicator expansion module for fluent and accurate medical report generation.\nExtensive experiments and comparisons with state-of-the-art methods under\nvarious evaluation metrics demonstrate the great performance of the proposed\nmethod.\n","authors":["Keqiang Fan","Xiaohao Cai","Mahesan Niranjan"],"pdf_url":"https://arxiv.org/pdf/2308.05633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05609v1","updated":"2023-08-10T14:41:17Z","published":"2023-08-10T14:41:17Z","title":"LASIGE and UNICAGE solution to the NASA LitCoin NLP Competition","summary":" Biomedical Natural Language Processing (NLP) tends to become cumbersome for\nmost researchers, frequently due to the amount and heterogeneity of text to be\nprocessed. To address this challenge, the industry is continuously developing\nhighly efficient tools and creating more flexible engineering solutions. This\nwork presents the integration between industry data engineering solutions for\nefficient data processing and academic systems developed for Named Entity\nRecognition (LasigeUnicage\\_NER) and Relation Extraction (BiOnt). Our design\nreflects an integration of those components with external knowledge in the form\nof additional training data from other datasets and biomedical ontologies. We\nused this pipeline in the 2022 LitCoin NLP Challenge, where our team\nLasigeUnicage was awarded the 7th Prize out of approximately 200 participating\nteams, reflecting a successful collaboration between the academia (LASIGE) and\nthe industry (Unicage). 
The software supporting this work is available at\n\\url{https://github.com/lasigeBioTM/Litcoin-Lasige_Unicage}.\n","authors":["Pedro Ruas","Diana F. Sousa","André Neves","Carlos Cruz","Francisco M. Couto"],"pdf_url":"https://arxiv.org/pdf/2308.05609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03762v2","updated":"2023-08-10T14:24:37Z","published":"2023-07-21T17:04:25Z","title":"GPT-4 Can't Reason","summary":" GPT-4 was released in March 2023 to wide acclaim, marking a very substantial\nimprovement across the board over GPT-3.5 (OpenAI's previously best model,\nwhich had powered the initial release of ChatGPT). However, despite the\ngenuinely impressive improvement, there are good reasons to be highly skeptical\nof GPT-4's ability to reason. This position paper discusses the nature of\nreasoning; criticizes the current formulation of reasoning problems in the NLP\ncommunity, as well as the way in which LLM reasoning performance is currently\nevaluated; introduces a small collection of 21 diverse reasoning problems; and\nperforms a detailed qualitative evaluation of GPT-4's performance on those\nproblems. Based on this analysis, the paper concludes that, despite its\noccasional flashes of analytical brilliance, GPT-4 at present is utterly\nincapable of reasoning.\n","authors":["Konstantine Arkoudas"],"pdf_url":"https://arxiv.org/pdf/2308.03762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05596v1","updated":"2023-08-10T14:14:13Z","published":"2023-08-10T14:14:13Z","title":"You Only Prompt Once: On the Capabilities of Prompt Learning on Large\n Language Models to Tackle Toxic Content","summary":" The spread of toxic content online is an important problem that has adverse\neffects on user experience online and in our society at large. Motivated by the\nimportance and impact of the problem, research focuses on developing solutions\nto detect toxic content, usually leveraging machine learning (ML) models\ntrained on human-annotated datasets. While these efforts are important, these\nmodels usually do not generalize well and they can not cope with new trends\n(e.g., the emergence of new toxic terms). Currently, we are witnessing a shift\nin the approach to tackling societal issues online, particularly leveraging\nlarge language models (LLMs) like GPT-3 or T5 that are trained on vast corpora\nand have strong generalizability. In this work, we investigate how we can use\nLLMs and prompt learning to tackle the problem of toxic content, particularly\nfocusing on three tasks; 1) Toxicity Classification, 2) Toxic Span Detection,\nand 3) Detoxification. We perform an extensive evaluation over five model\narchitectures and eight datasets demonstrating that LLMs with prompt learning\ncan achieve similar or even better performance compared to models trained on\nthese specific tasks. We find that prompt learning achieves around 10\\%\nimprovement in the toxicity classification task compared to the baselines,\nwhile for the toxic span detection task we find better performance to the best\nbaseline (0.643 vs. 0.640 in terms of $F_1$-score). 
Finally, for the\ndetoxification task, we find that prompt learning can successfully reduce the\naverage toxicity score (from 0.775 to 0.213) while preserving semantic meaning.\n","authors":["Xinlei He","Savvas Zannettou","Yun Shen","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.05596v1.pdf","comment":"To Appear in the 45th IEEE Symposium on Security and Privacy, May\n 20-23, 2024"},{"id":"http://arxiv.org/abs/2308.05576v1","updated":"2023-08-10T13:39:40Z","published":"2023-08-10T13:39:40Z","title":"Do Language Models Refer?","summary":" What do language models (LMs) do with language? Everyone agrees that they\nproduce sequences of (mostly) coherent sentences. But are they saying anything\nwith those strings or simply babbling in a convincing simulacrum of language\nuse? This is a vague question, and there are many ways of making it precise.\nHere we will address one aspect of the question, namely, whether LMs' words\nrefer: that is, whether the outputs of LMs achieve \"word-to-world\" connections.\nThere is prima facie reason to think they do not since LMs do not interact with\nthe world in the way that ordinary language users do. Drawing on insights from\nthe externalist tradition in philosophy of language, we argue that appearances\nare misleading and that there is good reason to think that LMs can refer.\n","authors":["Matthew Mandelkern","Tal Linzen"],"pdf_url":"https://arxiv.org/pdf/2308.05576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05574v1","updated":"2023-08-10T13:38:09Z","published":"2023-08-10T13:38:09Z","title":"Exploring Linguistic Similarity and Zero-Shot Learning for Multilingual\n Translation of Dravidian Languages","summary":" Current research in zero-shot translation is plagued by several issues such\nas high compute requirements, increased training time and off target\ntranslations. Proposed remedies often come at the cost of additional data or\ncompute requirements. Pivot based neural machine translation is preferred over\na single-encoder model for most settings despite the increased training and\nevaluation time. In this work, we overcome the shortcomings of zero-shot\ntranslation by taking advantage of transliteration and linguistic similarity.\nWe build a single encoder-decoder neural machine translation system for\nDravidian-Dravidian multilingual translation and perform zero-shot translation.\nWe compare the data vs zero-shot accuracy tradeoff and evaluate the performance\nof our vanilla method against the current state of the art pivot based method.\nWe also test the theory that morphologically rich languages require large\nvocabularies by restricting the vocabulary using an optimal transport based\ntechnique. Our model manages to achieves scores within 3 BLEU of large-scale\npivot-based models when it is trained on 50\\% of the language directions.\n","authors":["Danish Ebadulla","Rahul Raman","S. Natarajan","Hridhay Kiran Shetty","Ashish Harish Shenoy"],"pdf_url":"https://arxiv.org/pdf/2308.05574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05502v1","updated":"2023-08-10T11:14:22Z","published":"2023-08-10T11:14:22Z","title":"Bringing order into the realm of Transformer-based language models for\n artificial intelligence and law","summary":" Transformer-based language models (TLMs) have widely been recognized to be a\ncutting-edge technology for the successful development of deep-learning-based\nsolutions to problems and applications that require natural language processing\nand understanding. 
Like for other textual domains, TLMs have indeed pushed the\nstate-of-the-art of AI approaches for many tasks of interest in the legal\ndomain. Despite the first Transformer model being proposed about six years ago,\nthere has been a rapid progress of this technology at an unprecedented rate,\nwhereby BERT and related models represent a major reference, also in the legal\ndomain. This article provides the first systematic overview of TLM-based\nmethods for AI-driven problems and tasks in the legal sphere. A major goal is\nto highlight research advances in this field so as to understand, on the one\nhand, how the Transformers have contributed to the success of AI in supporting\nlegal processes, and on the other hand, what are the current limitations and\nopportunities for further research development.\n","authors":["Candida M. Greco","Andrea Tagarelli"],"pdf_url":"https://arxiv.org/pdf/2308.05502v1.pdf","comment":"Accepted for publication with Artificial Intelligence and Law,\n Springer Nature"},{"id":"http://arxiv.org/abs/2306.02130v2","updated":"2023-08-10T11:13:02Z","published":"2023-06-03T14:57:47Z","title":"Extending an Event-type Ontology: Adding Verbs and Classes Using\n Fine-tuned LLMs Suggestions","summary":" In this project, we have investigated the use of advanced machine learning\nmethods, specifically fine-tuned large language models, for pre-annotating data\nfor a lexical extension task, namely adding descriptive words (verbs) to an\nexisting (but incomplete, as of yet) ontology of event types. Several research\nquestions have been focused on, from the investigation of a possible heuristics\nto provide at least hints to annotators which verbs to include and which are\noutside the current version of the ontology, to the possible use of the\nautomatic scores to help the annotators to be more efficient in finding a\nthreshold for identifying verbs that cannot be assigned to any existing class\nand therefore they are to be used as seeds for a new class. We have also\ncarefully examined the correlation of the automatic scores with the human\nannotation. While the correlation turned out to be strong, its influence on the\nannotation proper is modest due to its near linearity, even though the mere\nfact of such pre-annotation leads to relatively short annotation times.\n","authors":["Jana Straková","Eva Fučíková","Jan Hajič","Zdeňka Urešová"],"pdf_url":"https://arxiv.org/pdf/2306.02130v2.pdf","comment":"Published at LAW-XVII @ ACL 2023"},{"id":"http://arxiv.org/abs/2304.03531v2","updated":"2023-08-10T10:52:39Z","published":"2023-04-07T08:09:50Z","title":"From Retrieval to Generation: Efficient and Effective Entity Set\n Expansion","summary":" Entity Set Expansion (ESE) is a critical task aiming to expand entities of\nthe target semantic class described by a small seed entity set. Most existing\nESE methods are retrieval-based frameworks that need to extract the contextual\nfeatures of entities and calculate the similarity between seed entities and\ncandidate entities. To achieve the two purposes, they should iteratively\ntraverse the corpus and the entity vocabulary provided in the datasets,\nresulting in poor efficiency and scalability. The experimental results indicate\nthat the time consumed by the retrieval-based ESE methods increases linearly\nwith entity vocabulary and corpus size. 
In this paper, we firstly propose a\ngenerative ESE framework, Generative Entity Set Expansion (GenExpan), which\nutilizes a generative pre-trained language model to accomplish ESE task.\nSpecifically, a prefix tree is employed to guarantee the validity of entity\ngeneration, and automatically generated class names are adopted to guide the\nmodel to generate target entities. Moreover, we propose Knowledge Calibration\nand Generative Ranking to further bridge the gap between generic knowledge of\nthe language model and the goal of ESE task. Experiments on publicly available\ndatasets show that GenExpan is efficient and effective. For efficiency,\nexpansion time consumed by GenExpan is independent of entity vocabulary and\ncorpus size, and GenExpan achieves an average 600% speedup compared to strong\nbaselines. For expansion performance, our framework outperforms previous\nstate-of-the-art ESE methods.\n","authors":["Shulin Huang","Shirong Ma","Yangning Li","Yinghui Li","Hai-Tao Zheng","Yong Jiang","Hong-Gee Kim"],"pdf_url":"https://arxiv.org/pdf/2304.03531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05481v1","updated":"2023-08-10T10:12:43Z","published":"2023-08-10T10:12:43Z","title":"LLM As DBA","summary":" Database administrators (DBAs) play a crucial role in managing, maintaining\nand optimizing a database system to ensure data availability, performance, and\nreliability. However, it is hard and tedious for DBAs to manage a large number\nof database instances (e.g., millions of instances on the cloud databases).\nRecently large language models (LLMs) have shown great potential to understand\nvaluable documents and accordingly generate reasonable answers. Thus, we\npropose D-Bot, a LLM-based database administrator that can continuously acquire\ndatabase maintenance experience from textual sources, and provide reasonable,\nwell-founded, in-time diagnosis and optimization advice for target databases.\nThis paper presents a revolutionary LLM-centric framework for database\nmaintenance, including (i) database maintenance knowledge detection from\ndocuments and tools, (ii) tree of thought reasoning for root cause analysis,\nand (iii) collaborative diagnosis among multiple LLMs. Our preliminary\nexperimental results that D-Bot can efficiently and effectively diagnose the\nroot causes and our code is available at\ngithub.com/TsinghuaDatabaseGroup/DB-GPT.\n","authors":["Xuanhe Zhou","Guoliang Li","Zhiyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.05481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05476v1","updated":"2023-08-10T10:07:00Z","published":"2023-08-10T10:07:00Z","title":"Exploring Machine Learning and Transformer-based Approaches for\n Deceptive Text Classification: A Comparative Analysis","summary":" Deceptive text classification is a critical task in natural language\nprocessing that aims to identify deceptive or fraudulent content. This study\npresents a comparative analysis of machine learning and transformer-based\napproaches for deceptive text classification. We investigate the effectiveness\nof traditional machine learning algorithms and state-of-the-art transformer\nmodels, such as BERT, XLNET, DistilBERT, and RoBERTa, in detecting deceptive\ntext. A labeled dataset consisting of deceptive and non-deceptive texts is used\nfor training and evaluation purposes. Through extensive experimentation, we\ncompare the performance metrics, including accuracy, precision, recall, and F1\nscore, of the different approaches. 
The results of this study shed light on the\nstrengths and limitations of machine learning and transformer-based methods for\ndeceptive text classification, enabling researchers and practitioners to make\ninformed decisions when dealing with deceptive content\n","authors":["Anusuya Krishnan"],"pdf_url":"https://arxiv.org/pdf/2308.05476v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.02697v2","updated":"2023-08-10T07:42:22Z","published":"2023-07-06T00:06:14Z","title":"Strahler Number of Natural Language Sentences in Comparison with Random\n Trees","summary":" The Strahler number was originally proposed to characterize the complexity of\nriver bifurcation and has found various applications. This article proposes\ncomputation of the Strahler number's upper and lower limits for natural\nlanguage sentence tree structures. Through empirical measurements across\ngrammatically annotated data, the Strahler number of natural language sentences\nis shown to be almost 3 or 4, similarly to the case of river bifurcation as\nreported by Strahler (1957). From the theory behind the number, we show that it\nis one kind of lower limit on the amount of memory required to process\nsentences. We consider the Strahler number to provide reasoning that explains\nreports showing that the number of required memory areas to process sentences\nis 3 to 4 for parsing (Abney and Johnson, 1991; Schuler et al., 2010), and\nreports indicating a psychological \"magical number\" of 3 to 5 (Cowan, 2001). An\nanalytical and empirical analysis shows that the Strahler number is not\nconstant but grows logarithmically; therefore, the Strahler number of sentences\nderives from the range of sentence lengths. Furthermore, the Strahler number is\nnot different for random trees, which could suggest that its origin is not\nspecific to natural language.\n","authors":["Kumiko Tanaka-Ishii","Akira Tanaka"],"pdf_url":"https://arxiv.org/pdf/2307.02697v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05361v1","updated":"2023-08-10T06:08:20Z","published":"2023-08-10T06:08:20Z","title":"WeaverBird: Empowering Financial Decision-Making with Large Language\n Model, Knowledge Base, and Search Engine","summary":" We present WeaverBird, an intelligent dialogue system designed specifically\nfor the finance domain. Our system harnesses a large language model of GPT\narchitecture that has been tuned using extensive corpora of finance-related\ntext. As a result, our system possesses the capability to understand complex\nfinancial queries, such as \"How should I manage my investments during\ninflation?\", and provide informed responses. Furthermore, our system\nincorporates a local knowledge base and a search engine to retrieve relevant\ninformation. The final responses are conditioned on the search results and\ninclude proper citations to the sources, thus enjoying an enhanced credibility.\nThrough a range of finance-related questions, we have demonstrated the superior\nperformance of our system compared to other models. 
To experience our system\nfirsthand, users can interact with our live demo at\nhttps://weaverbird.ttic.edu, as well as watch our 2-min video illustration at\nhttps://www.youtube.com/watch?v=yofgeqnlrMc.\n","authors":["Siqiao Xue","Fan Zhou","Yi Xu","Hongyu Zhao","Shuo Xie","Caigao Jiang","James Zhang","Jun Zhou","Peng Xu","Dacheng Xiu","Hongyuan Mei"],"pdf_url":"https://arxiv.org/pdf/2308.05361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11679v2","updated":"2023-08-10T05:27:58Z","published":"2023-04-23T15:11:49Z","title":"Domain Mastery Benchmark: An Ever-Updating Benchmark for Evaluating\n Holistic Domain Knowledge of Large Language Model--A Preliminary Release","summary":" Domain knowledge refers to the in-depth understanding, expertise, and\nfamiliarity with a specific subject, industry, field, or area of special\ninterest. The existing benchmarks are all lack of an overall design for domain\nknowledge evaluation. Holding the belief that the real ability of domain\nlanguage understanding can only be fairly evaluated by an comprehensive and\nin-depth benchmark, we introduces the Domma, a Domain Mastery Benchmark. DomMa\ntargets at testing Large Language Models (LLMs) on their domain knowledge\nunderstanding, it features extensive domain coverage, large data volume, and a\ncontinually updated data set based on Chinese 112 first-level subject\nclassifications. DomMa consist of 100,000 questions in both Chinese and English\nsourced from graduate entrance examinations and undergraduate exams in Chinese\ncollege. We have also propose designs to make benchmark and evaluation process\nmore suitable to LLMs.\n","authors":["Zhouhong Gu","Xiaoxuan Zhu","Haoning Ye","Lin Zhang","Zhuozhi Xiong","Zihan Li","Qianyu He","Sihang Jiang","Hongwei Feng","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2304.11679v2.pdf","comment":"The paper is updated, but we make a mistake that submit a new arxiv\n paper but not replace this one, the new version is in arXiv:2306.05783"},{"id":"http://arxiv.org/abs/2308.05342v1","updated":"2023-08-10T05:10:17Z","published":"2023-08-10T05:10:17Z","title":"Metacognitive Prompting Improves Understanding in Large Language Models","summary":" In Large Language Models (LLMs), there have been consistent advancements in\ntask-specific performance, largely influenced by effective prompt design. While\nrecent research on prompting has enhanced the reasoning capabilities of LLMs, a\ngap remains in further improving their understanding abilities. In this study,\nwe introduce metacognitive prompting (MP), a strategy inspired by human\nintrospective reasoning processes. Using MP, LLMs undergo a systematic series\nof structured, self-aware evaluations, drawing on both their vast inherent\nknowledge and new insights. Our experiments involve five prevalent LLMs:\nLlama2, Vicuna, PaLM, GPT-3.5, and GPT-4, all of which span various general\nnatural language understanding (NLU) tasks from the GLUE and SuperGLUE\nbenchmarks. Results indicate that, although GPT-4 consistently excels in most\ntasks, PaLM, when equipped with MP, approaches its performance level.\nFurthermore, across models and datasets, MP consistently outperforms existing\nprompting methods, including standard and chain-of-thought prompting. 
This\nstudy underscores the potential to amplify the understanding abilities of LLMs\nand highlights the benefits of mirroring human introspective reasoning in NLU\ntasks.\n","authors":["Yuqing Wang","Yun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.05342v1.pdf","comment":"9 pages, in submission"},{"id":"http://arxiv.org/abs/2308.05341v1","updated":"2023-08-10T05:09:42Z","published":"2023-08-10T05:09:42Z","title":"Classification of Human- and AI-Generated Texts: Investigating Features\n for ChatGPT","summary":" Recently, generative AIs like ChatGPT have become available to the wide\npublic. These tools can for instance be used by students to generate essays or\nwhole theses. But how does a teacher know whether a text is written by a\nstudent or an AI? In our work, we explore traditional and new features to (1)\ndetect text generated by AI from scratch and (2) text rephrased by AI. Since we\nfound that classification is more difficult when the AI has been instructed to\ncreate the text in a way that a human would not recognize that it was generated\nby an AI, we also investigate this more advanced case. For our experiments, we\nproduced a new text corpus covering 10 school topics. Our best systems to\nclassify basic and advanced human-generated/AI-generated texts have F1-scores\nof over 96%. Our best systems for classifying basic and advanced\nhuman-generated/AI-rephrased texts have F1-scores of more than 78%. The systems\nuse a combination of perplexity, semantic, list lookup, error-based,\nreadability, AI feedback, and text vector features. Our results show that the\nnew features substantially help to improve the performance of many classifiers.\nOur best basic text rephrasing detection system even outperforms GPTZero by\n183.8% relative in F1-score.\n","authors":["Lorenz Mindner","Tim Schlippe","Kristina Schaaff"],"pdf_url":"https://arxiv.org/pdf/2308.05341v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05336v1","updated":"2023-08-10T04:57:34Z","published":"2023-08-10T04:57:34Z","title":"Developing an Informal-Formal Persian Corpus","summary":" Informal language is a style of spoken or written language frequently used in\ncasual conversations, social media, weblogs, emails and text messages. In\ninformal writing, the language faces some lexical and/or syntactic changes\nvarying among different languages. Persian is one of the languages with many\ndifferences between its formal and informal styles of writing, thus developing\ninformal language processing tools for this language seems necessary. Such a\nconverter needs a large aligned parallel corpus of colloquial-formal sentences\nwhich can be useful for linguists to extract a regulated grammar and\northography for colloquial Persian as is done for the formal language. In this\npaper we explain our methodology in building a parallel corpus of 50,000\nsentence pairs with alignments in the word/phrase level. The sentences were\nattempted to cover almost all kinds of lexical and syntactic changes between\ninformal and formal Persian, therefore both methods of exploring and collecting\nfrom the different resources of informal scripts and following the phonological\nand morphological patterns of changes were applied to find as much instances as\npossible. 
The resulting corpus has about 530,000 alignments and a dictionary\ncontaining 49,397 word and phrase pairs.\n","authors":["Vahide Tajalli","Fateme Kalantari","Mehrnoush Shamsfard"],"pdf_url":"https://arxiv.org/pdf/2308.05336v1.pdf","comment":"16 pages, 1 Figure and 3 tables"},{"id":"http://arxiv.org/abs/2304.09797v5","updated":"2023-08-10T03:41:04Z","published":"2023-04-19T16:29:48Z","title":"Progressive-Hint Prompting Improves Reasoning in Large Language Models","summary":" The performance of Large Language Models (LLMs) in reasoning tasks depends\nheavily on prompt design, with Chain-of-Thought (CoT) and self-consistency\nbeing critical methods that enhance this ability. However, these methods do not\nfully exploit the answers generated by the LLM to guide subsequent responses.\nThis paper proposes a new prompting method, named Progressive-Hint Prompting\n(PHP), that enables automatic multiple interactions between users and LLMs by\nusing previously generated answers as hints to progressively guide toward the\ncorrect answers. PHP is orthogonal to CoT and self-consistency, making it easy\nto combine with state-of-the-art techniques to further improve performance. We\nconducted extensive and comprehensive experiments on seven benchmarks. The\nresults show that PHP significantly improves accuracy while remaining highly\nefficient. For instance, with text-davinci-003, we observed a 4.2% improvement\non GSM8K with greedy decoding compared to Complex CoT, and a 46.17% reduction\nin sample paths with self-consistency. With GPT-4 and PHP, we achieve\nstate-of-the-art performances on SVAMP (89.1% -> 91.9%), GSM8K (92% -> 95.5%),\nAQuA (76.4% -> 79.9%) and MATH (50.3% -> 53.9%).\n","authors":["Chuanyang Zheng","Zhengying Liu","Enze Xie","Zhenguo Li","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2304.09797v5.pdf","comment":"Tech Report"},{"id":"http://arxiv.org/abs/2308.05317v1","updated":"2023-08-10T03:09:12Z","published":"2023-08-10T03:09:12Z","title":"Few-Shot Data-to-Text Generation via Unified Representation and\n Multi-Source Learning","summary":" We present a novel approach for structured data-to-text generation that\naddresses the limitations of existing methods that primarily focus on specific\ntypes of structured data. Our proposed method aims to improve performance in\nmulti-task training, zero-shot and few-shot scenarios by providing a unified\nrepresentation that can handle various forms of structured data such as tables,\nknowledge graph triples, and meaning representations. We demonstrate that our\nproposed approach can effectively adapt to new structured forms, and can\nimprove performance in comparison to current methods. For example, our method\nresulted in a 66% improvement in zero-shot BLEU scores when transferring models\ntrained on table inputs to a knowledge graph dataset. 
Our proposed method is an\nimportant step towards a more general data-to-text generation framework.\n","authors":["Alexander Hanbo Li","Mingyue Shang","Evangelia Spiliopoulou","Jie Ma","Patrick Ng","Zhiguo Wang","Bonan Min","William Wang","Kathleen McKeown","Vittorio Castelli","Dan Roth","Bing Xiang"],"pdf_url":"https://arxiv.org/pdf/2308.05317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03131v4","updated":"2023-08-10T02:08:04Z","published":"2023-08-06T14:49:26Z","title":"Towards Multiple References Era -- Addressing Data Leakage and Limited\n Reference Diversity in NLG Evaluation","summary":" N-gram matching-based evaluation metrics, such as BLEU and chrF, are widely\nutilized across a range of natural language generation (NLG) tasks. However,\nrecent studies have revealed a weak correlation between these matching-based\nmetrics and human evaluations, especially when compared with neural-based\nmetrics like BLEURT. In this paper, we conjecture that the performance\nbottleneck in matching-based metrics may be caused by the limited diversity of\nreferences. To address this issue, we propose to utilize \\textit{multiple\nreferences} to enhance the consistency between these metrics and human\nevaluations. Within the WMT Metrics benchmarks, we observe that the\nmulti-references F200spBLEU surpasses the conventional single-reference one by\nan accuracy improvement of 7.2\\%. Remarkably, it also exceeds the neural-based\nBERTscore by an accuracy enhancement of 3.9\\%. Moreover, we observe that the\ndata leakage issue in large language models (LLMs) can be mitigated to a large\nextent by our multi-reference metric. We release the code and data at\n\\url{https://github.com/SefaZeng/LLM-Ref}\n","authors":["Xianfeng Zeng","Yijin Liu","Fandong Meng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.03131v4.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.05281v1","updated":"2023-08-10T01:51:33Z","published":"2023-08-10T01:51:33Z","title":"Investigating disaster response through social media data and the\n Susceptible-Infected-Recovered (SIR) model: A case study of 2020 Western U.S.\n wildfire season","summary":" Effective disaster response is critical for affected communities. Responders\nand decision-makers would benefit from reliable, timely measures of the issues\nimpacting their communities during a disaster, and social media offers a\npotentially rich data source. Social media can reflect public concerns and\ndemands during a disaster, offering valuable insights for decision-makers to\nunderstand evolving situations and optimize resource allocation. We used\nBidirectional Encoder Representations from Transformers (BERT) topic modeling\nto cluster topics from Twitter data. Then, we conducted a temporal-spatial\nanalysis to examine the distribution of these topics across different regions\nduring the 2020 western U.S. wildfire season. Our results show that Twitter\nusers mainly focused on three topics:\"health impact,\" \"damage,\" and\n\"evacuation.\" We used the Susceptible-Infected-Recovered (SIR) theory to\nexplore the magnitude and velocity of topic diffusion on Twitter. The results\ndisplayed a clear relationship between topic trends and wildfire propagation\npatterns. The estimated parameters obtained from the SIR model in selected\ncities revealed that residents exhibited a high level of several concerns\nduring the wildfire. 
Our study details how the SIR model and topic modeling\nusing social media data can provide decision-makers with a quantitative\napproach to measure disaster response and support their decision-making\nprocesses.\n","authors":["Zihui Ma","Lingyao Li","Libby Hemphill","Gregory B. Baecher"],"pdf_url":"https://arxiv.org/pdf/2308.05281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05269v1","updated":"2023-08-10T01:02:45Z","published":"2023-08-10T01:02:45Z","title":"A Novel Self-training Approach for Low-resource Speech Recognition","summary":" In this paper, we propose a self-training approach for automatic speech\nrecognition (ASR) for low-resource settings. While self-training approaches\nhave been extensively developed and evaluated for high-resource languages such\nas English, their applications to low-resource languages like Punjabi have been\nlimited, despite the language being spoken by millions globally. The scarcity\nof annotated data has hindered the development of accurate ASR systems,\nespecially for low-resource languages (e.g., Punjabi and M\\=aori languages). To\naddress this issue, we propose an effective self-training approach that\ngenerates highly accurate pseudo-labels for unlabeled low-resource speech. Our\nexperimental analysis demonstrates that our approach significantly improves\nword error rate, achieving a relative improvement of 14.94% compared to a\nbaseline model across four real speech datasets. Further, our proposed approach\nreports the best results on the Common Voice Punjabi dataset.\n","authors":["Satwinder Singh","Feng Hou","Ruili Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05269v1.pdf","comment":"Accepted to Interspeech 2023"},{"id":"http://arxiv.org/abs/2210.15781v2","updated":"2023-08-10T23:34:35Z","published":"2022-10-27T21:47:30Z","title":"A Compact End-to-End Model with Local and Global Context for Spoken\n Language Identification","summary":" We introduce TitaNet-LID, a compact end-to-end neural network for Spoken\nLanguage Identification (LID) that is based on the ContextNet architecture.\nTitaNet-LID employs 1D depth-wise separable convolutions and\nSqueeze-and-Excitation layers to effectively capture local and global context\nwithin an utterance. Despite its small size, TitaNet-LID achieves performance\nsimilar to state-of-the-art models on the VoxLingua107 dataset while being 10\ntimes smaller. Furthermore, it can be easily adapted to new acoustic conditions\nand unseen languages through simple fine-tuning, achieving a state-of-the-art\naccuracy of 88.2% on the FLEURS benchmark. Our model is scalable and can\nachieve a better trade-off between accuracy and speed. TitaNet-LID performs\nwell even on short utterances less than 5s in length, indicating its robustness\nto input length.\n","authors":["Fei Jia","Nithin Rao Koluguri","Jagadeesh Balam","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2210.15781v2.pdf","comment":"Accepted to INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2308.00158v2","updated":"2023-08-10T23:20:03Z","published":"2023-07-31T21:13:30Z","title":"Predicting Perfect Quality Segments in MT Output with Fine-Tuned OpenAI\n LLM: Is it possible to capture editing distance patterns from historical\n data?","summary":" Translation Quality Estimation (TQE) is an important step before deploying\nthe output translation into usage. TQE is also critical in assessing machine\ntranslation (MT) and human translation (HT) quality without seeing the\nreference translations. 
In this work, we examine if the state-of-the-art large\nlanguage models (LLMs) can be fine-tuned for the TQE task and their capability.\nWe take ChatGPT as one example and approach TQE as a binary classification\ntask. Using English to Italian, German, French, Japanese, Dutch, Portuguese,\nTurkish, and Chinese training corpora, our experimental results show that\nfine-tuned ChatGPT via its API can achieve a relatively high score on\npredicting translation quality, i.e. if the translation needs to be edited, but\nthere is definitely much space to improve the accuracy. English-Italiano\nbilingual Abstract is available in the paper.\n","authors":["Serge Gladkoff","Gleb Erofeev","Lifeng Han","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.00158v2.pdf","comment":"7 pages, 11 figures, under-review to ItalianNLP-2023"},{"id":"http://arxiv.org/abs/2301.08427v2","updated":"2023-08-10T20:37:20Z","published":"2023-01-20T05:39:26Z","title":"Which Features are Learned by CodeBert: An Empirical Study of the\n BERT-based Source Code Representation Learning","summary":" The Bidirectional Encoder Representations from Transformers (BERT) were\nproposed in the natural language process (NLP) and shows promising results.\nRecently researchers applied the BERT to source-code representation learning\nand reported some good news on several downstream tasks. However, in this\npaper, we illustrated that current methods cannot effectively understand the\nlogic of source codes. The representation of source code heavily relies on the\nprogrammer-defined variable and function names. We design and implement a set\nof experiments to demonstrate our conjecture and provide some insights for\nfuture works.\n","authors":["Lan Zhang","Chen Cao","Zhilong Wang","Peng Liu"],"pdf_url":"https://arxiv.org/pdf/2301.08427v2.pdf","comment":"1 table, 2 figures"},{"id":"http://arxiv.org/abs/2308.02080v2","updated":"2023-08-10T18:32:56Z","published":"2023-08-03T23:39:03Z","title":"Causality Guided Disentanglement for Cross-Platform Hate Speech\n Detection","summary":" Social media platforms, despite their value in promoting open discourse, are\noften exploited to spread harmful content. Current deep learning and natural\nlanguage processing models used for detecting this harmful content overly rely\non domain-specific terms affecting their capabilities to adapt to generalizable\nhate speech detection. This is because they tend to focus too narrowly on\nparticular linguistic signals or the use of certain categories of words.\nAnother significant challenge arises when platforms lack high-quality annotated\ndata for training, leading to a need for cross-platform models that can adapt\nto different distribution shifts. Our research introduces a cross-platform hate\nspeech detection model capable of being trained on one platform's data and\ngeneralizing to multiple unseen platforms. To achieve good generalizability\nacross platforms, one way is to disentangle the input representations into\ninvariant and platform-dependent features. We also argue that learning causal\nrelationships, which remain constant across diverse environments, can\nsignificantly aid in understanding invariant representations in hate speech. 
By\ndisentangling input into platform-dependent features (useful for predicting\nhate targets) and platform-independent features (used to predict the presence\nof hate), we learn invariant representations resistant to distribution shifts.\nThese features are then used to predict hate speech across unseen platforms.\nOur extensive experiments across four platforms highlight our model's enhanced\nefficacy compared to existing state-of-the-art methods in detecting generalized\nhate speech.\n","authors":["Paras Sheth","Tharindu Kumarage","Raha Moraffah","Aman Chadha","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.17316v2","updated":"2023-08-10T18:32:07Z","published":"2022-10-26T21:03:17Z","title":"There is more than one kind of robustness: Fooling Whisper with\n adversarial examples","summary":" Whisper is a recent Automatic Speech Recognition (ASR) model displaying\nimpressive robustness to both out-of-distribution inputs and random noise. In\nthis work, we show that this robustness does not carry over to adversarial\nnoise. We show that we can degrade Whisper performance dramatically, or even\ntranscribe a target sentence of our choice, by generating very small input\nperturbations with Signal Noise Ratio of 35-45dB. We also show that by fooling\nthe Whisper language detector we can very easily degrade the performance of\nmultilingual models. These vulnerabilities of a widely popular open-source\nmodel have practical security implications and emphasize the need for\nadversarially robust ASR.\n","authors":["Raphael Olivier","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2210.17316v2.pdf","comment":"Accepted at InterSpeech 2023"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.05745v1","updated":"2023-08-10T17:59:46Z","published":"2023-08-10T17:59:46Z","title":"Iterative Reweighted Least Squares Networks With Convergence Guarantees\n for Solving Inverse Imaging Problems","summary":" In this work we present a novel optimization strategy for image\nreconstruction tasks under analysis-based image regularization, which promotes\nsparse and/or low-rank solutions in some learned transform domain. We\nparameterize such regularizers using potential functions that correspond to\nweighted extensions of the $\\ell_p^p$-vector and $\\mathcal{S}_p^p$\nSchatten-matrix quasi-norms with $0 < p \\le 1$. Our proposed minimization\nstrategy extends the Iteratively Reweighted Least Squares (IRLS) method,\ntypically used for synthesis-based $\\ell_p$ and $\\mathcal{S}_p$ norm and\nanalysis-based $\\ell_1$ and nuclear norm regularization. We prove that under\nmild conditions our minimization algorithm converges linearly to a stationary\npoint, and we provide an upper bound for its convergence rate. Further, to\nselect the parameters of the regularizers that deliver the best results for the\nproblem at hand, we propose to learn them from training data by formulating the\nsupervised learning process as a stochastic bilevel optimization problem. We\nshow that thanks to the convergence guarantees of our proposed minimization\nstrategy, such optimization can be successfully performed with a\nmemory-efficient implicit back-propagation scheme. We implement our learned\nIRLS variants as recurrent networks and assess their performance on the\nchallenging image reconstruction tasks of non-blind deblurring,\nsuper-resolution and demosaicking. 
The comparisons against other existing\nlearned reconstruction approaches demonstrate that our overall method is very\ncompetitive and in many cases outperforms existing unrolled networks, whose\nnumber of parameters is orders of magnitude higher than in our case.\n","authors":["Iaroslav Koshelev","Stamatios Lefkimmiatis"],"pdf_url":"https://arxiv.org/pdf/2308.05745v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.10536"},{"id":"http://arxiv.org/abs/2308.05744v1","updated":"2023-08-10T17:59:34Z","published":"2023-08-10T17:59:34Z","title":"PlankAssembly: Robust 3D Reconstruction from Three Orthographic Views\n with Learnt Shape Programs","summary":" In this paper, we develop a new method to automatically convert 2D line\ndrawings from three orthographic views into 3D CAD models. Existing methods for\nthis problem reconstruct 3D models by back-projecting the 2D observations into\n3D space while maintaining explicit correspondence between the input and\noutput. Such methods are sensitive to errors and noises in the input, thus\noften fail in practice where the input drawings created by human designers are\nimperfect. To overcome this difficulty, we leverage the attention mechanism in\na Transformer-based sequence generation model to learn flexible mappings\nbetween the input and output. Further, we design shape programs which are\nsuitable for generating the objects of interest to boost the reconstruction\naccuracy and facilitate CAD modeling applications. Experiments on a new\nbenchmark dataset show that our method significantly outperforms existing ones\nwhen the inputs are noisy or incomplete.\n","authors":["Wentao Hu","Jia Zheng","Zixin Zhang","Xiaojun Yuan","Jian Yin","Zihan Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.05744v1.pdf","comment":"To Appear in ICCV 2023. The first three authors contributed equally\n to this work. The project page is at\n https://manycore-research.github.io/PlankAssembly"},{"id":"http://arxiv.org/abs/2308.05741v1","updated":"2023-08-10T17:58:02Z","published":"2023-08-10T17:58:02Z","title":"Neural Progressive Meshes","summary":" The recent proliferation of 3D content that can be consumed on hand-held\ndevices necessitates efficient tools for transmitting large geometric data,\ne.g., 3D meshes, over the Internet. Detailed high-resolution assets can pose a\nchallenge to storage as well as transmission bandwidth, and level-of-detail\ntechniques are often used to transmit an asset using an appropriate bandwidth\nbudget. It is especially desirable for these methods to transmit data\nprogressively, improving the quality of the geometry with more data. Our key\ninsight is that the geometric details of 3D meshes often exhibit similar local\npatterns even across different shapes, and thus can be effectively represented\nwith a shared learned generative space. We learn this space using a\nsubdivision-based encoder-decoder architecture trained in advance on a large\ncollection of surfaces. We further observe that additional residual features\ncan be transmitted progressively between intermediate levels of subdivision\nthat enable the client to control the tradeoff between bandwidth cost and\nquality of reconstruction, providing a neural progressive mesh representation.\nWe evaluate our method on a diverse set of complex 3D shapes and demonstrate\nthat it outperforms baselines in terms of compression ratio and reconstruction\nquality.\n","authors":["Yun-Chun Chen","Vladimir G. 
Kim","Noam Aigerman","Alec Jacobson"],"pdf_url":"https://arxiv.org/pdf/2308.05741v1.pdf","comment":"SIGGRAPH 2023"},{"id":"http://arxiv.org/abs/2308.05739v1","updated":"2023-08-10T17:57:22Z","published":"2023-08-10T17:57:22Z","title":"Zero Grads Ever Given: Learning Local Surrogate Losses for\n Non-Differentiable Graphics","summary":" Gradient-based optimization is now ubiquitous across graphics, but\nunfortunately can not be applied to problems with undefined or zero gradients.\nTo circumvent this issue, the loss function can be manually replaced by a\n\"surrogate\" that has similar minima but is differentiable. Our proposed\nframework, ZeroGrads, automates this process by learning a neural approximation\nof the objective function, the surrogate, which in turn can be used to\ndifferentiate through arbitrary black-box graphics pipelines. We train the\nsurrogate on an actively smoothed version of the objective and encourage\nlocality, focusing the surrogate's capacity on what matters at the current\ntraining episode. The fitting is performed online, alongside the parameter\noptimization, and self-supervised, without pre-computed data or pre-trained\nmodels. As sampling the objective is expensive (it requires a full rendering or\nsimulator run), we devise an efficient sampling scheme that allows for\ntractable run-times and competitive performance at little overhead. We\ndemonstrate optimizing diverse non-convex, non-differentiable black-box\nproblems in graphics, such as visibility in rendering, discrete parameter\nspaces in procedural modelling or optimal control in physics-driven animation.\nIn contrast to more traditional algorithms, our approach scales well to higher\ndimensions, which we demonstrate on problems with up to 35k interlinked\nvariables.\n","authors":["Michael Fischer","Tobias Ritschel"],"pdf_url":"https://arxiv.org/pdf/2308.05739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05737v1","updated":"2023-08-10T17:57:06Z","published":"2023-08-10T17:57:06Z","title":"Follow Anything: Open-set detection, tracking, and following in\n real-time","summary":" Tracking and following objects of interest is critical to several robotics\nuse cases, ranging from industrial automation to logistics and warehousing, to\nhealthcare and security. In this paper, we present a robotic system to detect,\ntrack, and follow any object in real-time. Our approach, dubbed ``follow\nanything'' (FAn), is an open-vocabulary and multimodal model -- it is not\nrestricted to concepts seen at training time and can be applied to novel\nclasses at inference time using text, images, or click queries. Leveraging rich\nvisual descriptors from large-scale pre-trained models (foundation models), FAn\ncan detect and segment objects by matching multimodal queries (text, images,\nclicks) against an input image sequence. These detected and segmented objects\nare tracked across image frames, all while accounting for occlusion and object\nre-emergence. We demonstrate FAn on a real-world robotic system (a micro aerial\nvehicle) and report its ability to seamlessly follow the objects of interest in\na real-time control loop. FAn can be deployed on a laptop with a lightweight\n(6-8 GB) graphics card, achieving a throughput of 6-20 frames per second. 
To\nenable rapid adoption, deployment, and extensibility, we open-source all our\ncode on our project webpage at https://github.com/alaamaalouf/FollowAnything .\nWe also encourage the reader the watch our 5-minutes explainer video in this\nhttps://www.youtube.com/watch?v=6Mgt3EPytrw .\n","authors":["Alaa Maalouf","Ninad Jadhav","Krishna Murthy Jatavallabhula","Makram Chahine","Daniel M. Vogt","Robert J. Wood","Antonio Torralba","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2308.05737v1.pdf","comment":"Project webpage: https://github.com/alaamaalouf/FollowAnything\n Explainer video: https://www.youtube.com/watch?v=6Mgt3EPytrw"},{"id":"http://arxiv.org/abs/2308.05736v1","updated":"2023-08-10T17:56:53Z","published":"2023-08-10T17:56:53Z","title":"MapTRv2: An End-to-End Framework for Online Vectorized HD Map\n Construction","summary":" High-definition (HD) map provides abundant and precise static environmental\ninformation of the driving scene, serving as a fundamental and indispensable\ncomponent for planning in autonomous driving system. In this paper, we present\n\\textbf{Map} \\textbf{TR}ansformer, an end-to-end framework for online\nvectorized HD map construction. We propose a unified permutation-equivalent\nmodeling approach, \\ie, modeling map element as a point set with a group of\nequivalent permutations, which accurately describes the shape of map element\nand stabilizes the learning process. We design a hierarchical query embedding\nscheme to flexibly encode structured map information and perform hierarchical\nbipartite matching for map element learning. To speed up convergence, we\nfurther introduce auxiliary one-to-many matching and dense supervision. The\nproposed method well copes with various map elements with arbitrary shapes. It\nruns at real-time inference speed and achieves state-of-the-art performance on\nboth nuScenes and Argoverse2 datasets. Abundant qualitative results show stable\nand robust map construction quality in complex and various driving scenes. Code\nand more demos are available at \\url{https://github.com/hustvl/MapTR} for\nfacilitating further studies and applications.\n","authors":["Bencheng Liao","Shaoyu Chen","Yunchi Zhang","Bo Jiang","Qian Zhang","Wenyu Liu","Chang Huang","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05736v1.pdf","comment":"Code available at https://github.com/hustvl/MapTR . arXiv admin note:\n substantial text overlap with arXiv:2208.14437"},{"id":"http://arxiv.org/abs/2308.05733v1","updated":"2023-08-10T17:55:02Z","published":"2023-08-10T17:55:02Z","title":"FrozenRecon: Pose-free 3D Scene Reconstruction with Frozen Depth Models","summary":" 3D scene reconstruction is a long-standing vision task. Existing approaches\ncan be categorized into geometry-based and learning-based methods. The former\nleverages multi-view geometry but can face catastrophic failures due to the\nreliance on accurate pixel correspondence across views. The latter was\nproffered to mitigate these issues by learning 2D or 3D representation\ndirectly. However, without a large-scale video or 3D training data, it can\nhardly generalize to diverse real-world scenarios due to the presence of tens\nof millions or even billions of optimization parameters in the deep network.\nRecently, robust monocular depth estimation models trained with large-scale\ndatasets have been proven to possess weak 3D geometry prior, but they are\ninsufficient for reconstruction due to the unknown camera parameters, the\naffine-invariant property, and inter-frame inconsistency. 
Here, we propose a\nnovel test-time optimization approach that can transfer the robustness of\naffine-invariant depth models such as LeReS to challenging diverse scenes while\nensuring inter-frame consistency, with only dozens of parameters to optimize\nper video frame. Specifically, our approach involves freezing the pre-trained\naffine-invariant depth model's depth predictions, rectifying them by optimizing\nthe unknown scale-shift values with a geometric consistency alignment module,\nand employing the resulting scale-consistent depth maps to robustly obtain\ncamera poses and achieve dense scene reconstruction, even in low-texture\nregions. Experiments show that our method achieves state-of-the-art\ncross-dataset reconstruction on five zero-shot testing datasets.\n","authors":["Guangkai Xu","Wei Yin","Hao Chen","Chunhua Shen","Kai Cheng","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.05733v1.pdf","comment":"Accepted to ICCV 2023. Project webpage is at:\n https://aim-uofa.github.io/FrozenRecon/"},{"id":"http://arxiv.org/abs/2308.05731v1","updated":"2023-08-10T17:53:03Z","published":"2023-08-10T17:53:03Z","title":"Rethinking Integration of Prediction and Planning in Deep Learning-Based\n Automated Driving Systems: A Review","summary":" Automated driving has the potential to revolutionize personal, public, and\nfreight mobility. Besides the enormous challenge of perception, i.e. accurately\nperceiving the environment using available sensor data, automated driving\ncomprises planning a safe, comfortable, and efficient motion trajectory. To\npromote safety and progress, many works rely on modules that predict the future\nmotion of surrounding traffic. Modular automated driving systems commonly\nhandle prediction and planning as sequential separate tasks. While this\naccounts for the influence of surrounding traffic on the ego-vehicle, it fails\nto anticipate the reactions of traffic participants to the ego-vehicle's\nbehavior. Recent works suggest that integrating prediction and planning in an\ninterdependent joint step is necessary to achieve safe, efficient, and\ncomfortable driving. While various models implement such integrated systems, a\ncomprehensive overview and theoretical understanding of different principles\nare lacking. We systematically review state-of-the-art deep learning-based\nprediction, planning, and integrated prediction and planning models. Different\nfacets of the integration ranging from model architecture and model design to\nbehavioral aspects are considered and related to each other. Moreover, we\ndiscuss the implications, strengths, and limitations of different integration\nmethods. By pointing out research gaps, describing relevant future challenges,\nand highlighting trends in the research field, we identify promising directions\nfor future research.\n","authors":["Steffen Hagedorn","Marcel Hallgarten","Martin Stoll","Alexandru Condurache"],"pdf_url":"https://arxiv.org/pdf/2308.05731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00135v3","updated":"2023-08-10T17:50:49Z","published":"2023-07-22T17:05:47Z","title":"InFusion: Inject and Attention Fusion for Multi Concept Zero-Shot\n Text-based Video Editing","summary":" Large text-to-image diffusion models have achieved remarkable success in\ngenerating diverse, high-quality images. 
Additionally, these models have been\nsuccessfully leveraged to edit input images by just changing the text prompt.\nBut when these models are applied to videos, the main challenge is to ensure\ntemporal consistency and coherence across frames. In this paper, we propose\nInFusion, a framework for zero-shot text-based video editing leveraging large\npre-trained image diffusion models. Our framework specifically supports editing\nof multiple concepts with pixel-level control over diverse concepts mentioned\nin the editing prompt. Specifically, we inject the difference in features\nobtained with source and edit prompts from U-Net residual blocks of decoder\nlayers. When these are combined with injected attention features, it becomes\nfeasible to query the source contents and scale edited concepts along with the\ninjection of unedited parts. The editing is further controlled in a\nfine-grained manner with mask extraction and attention fusion, which cut the\nedited part from the source and paste it into the denoising pipeline for the\nediting prompt. Our framework is a low-cost alternative to one-shot tuned\nmodels for editing since it does not require training. We demonstrated complex\nconcept editing with a generalised image model (Stable Diffusion v1.5) using\nLoRA. Adaptation is compatible with all the existing image diffusion\ntechniques. Extensive experimental results demonstrate the effectiveness of\nexisting methods in rendering high-quality and temporally consistent videos.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2308.00135v3.pdf","comment":"10 pages, 8 figures, 1 Table, accepted at ICCVW 2023 (ICCV 2023\n Workshop on AI for Creative Video Editing and Understanding)"},{"id":"http://arxiv.org/abs/2308.05721v1","updated":"2023-08-10T17:37:49Z","published":"2023-08-10T17:37:49Z","title":"Deformable Mixer Transformer with Gating for Multi-Task Learning of\n Dense Prediction","summary":" CNNs and Transformers have their own advantages and both have been widely\nused for dense prediction in multi-task learning (MTL). Most of the current\nstudies on MTL solely rely on CNN or Transformer. In this work, we present a\nnovel MTL model by combining both merits of deformable CNN and query-based\nTransformer with shared gating for multi-task learning of dense prediction.\nThis combination may offer a simple and efficient solution owing to its\npowerful and flexible task-specific learning and advantages of lower cost, less\ncomplexity and smaller parameters than the traditional MTL methods. We\nintroduce deformable mixer Transformer with gating (DeMTG), a simple and\neffective encoder-decoder architecture up-to-date that incorporates the\nconvolution and attention mechanism in a unified network for MTL. It is\nexquisitely designed to use advantages of each block, and provide deformable\nand comprehensive features for all tasks from local and global perspective.\nFirst, the deformable mixer encoder contains two types of operators: the\nchannel-aware mixing operator leveraged to allow communication among different\nchannels, and the spatial-aware deformable operator with deformable convolution\napplied to efficiently sample more informative spatial locations. 
Second, the\ntask-aware gating transformer decoder is used to perform the task-specific\npredictions, in which task interaction block integrated with self-attention is\napplied to capture task interaction features, and the task query block\nintegrated with gating attention is leveraged to select corresponding\ntask-specific features. Further, the experiment results demonstrate that the\nproposed DeMTG uses fewer GFLOPs and significantly outperforms current\nTransformer-based and CNN-based competitive models on a variety of metrics on\nthree dense prediction datasets. Our code and models are available at\nhttps://github.com/yangyangxu0/DeMTG.\n","authors":["Yangyang Xu","Yibo Yang","Bernard Ghanemm","Lefei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.05721v1.pdf","comment":"Comments: submitted to IJCV; an extension to our previous AAAI 2023\n paper arXiv:2301.03461"},{"id":"http://arxiv.org/abs/2303.14961v3","updated":"2023-08-10T17:34:48Z","published":"2023-03-27T07:52:58Z","title":"Diffusion Denoised Smoothing for Certified and Adversarial Robust\n Out-Of-Distribution Detection","summary":" As the use of machine learning continues to expand, the importance of\nensuring its safety cannot be overstated. A key concern in this regard is the\nability to identify whether a given sample is from the training distribution,\nor is an \"Out-Of-Distribution\" (OOD) sample. In addition, adversaries can\nmanipulate OOD samples in ways that lead a classifier to make a confident\nprediction. In this study, we present a novel approach for certifying the\nrobustness of OOD detection within a $\\ell_2$-norm around the input, regardless\nof network architecture and without the need for specific components or\nadditional training. Further, we improve current techniques for detecting\nadversarial attacks on OOD samples, while providing high levels of certified\nand adversarial robustness on in-distribution samples. The average of all OOD\ndetection metrics on CIFAR10/100 shows an increase of $\\sim 13 \\% / 5\\%$\nrelative to previous approaches.\n","authors":["Nicola Franco","Daniel Korth","Jeanette Miriam Lorenz","Karsten Roscher","Stephan Guennemann"],"pdf_url":"https://arxiv.org/pdf/2303.14961v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10608v2","updated":"2023-08-10T17:17:06Z","published":"2023-06-18T17:55:02Z","title":"STHG: Spatial-Temporal Heterogeneous Graph Learning for Advanced\n Audio-Visual Diarization","summary":" This report introduces our novel method named STHG for the Audio-Visual\nDiarization task of the Ego4D Challenge 2023. Our key innovation is that we\nmodel all the speakers in a video using a single, unified heterogeneous graph\nlearning framework. Unlike previous approaches that require a separate\ncomponent solely for the camera wearer, STHG can jointly detect the speech\nactivities of all people including the camera wearer. Our final method obtains\n61.1% DER on the test set of Ego4D, which significantly outperforms all the\nbaselines as well as last year's winner. Our submission achieved 1st place in\nthe Ego4D Challenge 2023. 
We additionally demonstrate that applying the\noff-the-shelf speech recognition system to the diarized speech segments by STHG\nproduces a competitive performance on the Speech Transcription task of this\nchallenge.\n","authors":["Kyle Min"],"pdf_url":"https://arxiv.org/pdf/2306.10608v2.pdf","comment":"Validation report for the Ego4D challenge at CVPR 2023"},{"id":"http://arxiv.org/abs/2210.07764v2","updated":"2023-08-10T17:14:47Z","published":"2022-10-14T12:54:03Z","title":"Intel Labs at Ego4D Challenge 2022: A Better Baseline for Audio-Visual\n Diarization","summary":" This report describes our approach for the Audio-Visual Diarization (AVD)\ntask of the Ego4D Challenge 2022. Specifically, we present multiple technical\nimprovements over the official baselines. First, we improve the detection\nperformance of the camera wearer's voice activity by modifying the training\nscheme of its model. Second, we discover that an off-the-shelf voice activity\ndetection model can effectively remove false positives when it is applied\nsolely to the camera wearer's voice activities. Lastly, we show that better\nactive speaker detection leads to a better AVD outcome. Our final method\nobtains 65.9% DER on the test set of Ego4D, which significantly outperforms all\nthe baselines. Our submission achieved 1st place in the Ego4D Challenge 2022.\n","authors":["Kyle Min"],"pdf_url":"https://arxiv.org/pdf/2210.07764v2.pdf","comment":"Validation report for the Ego4D challenge at ECCV 2022"},{"id":"http://arxiv.org/abs/2308.05707v1","updated":"2023-08-10T17:14:07Z","published":"2023-08-10T17:14:07Z","title":"Shadow Datasets, New challenging datasets for Causal Representation\n Learning","summary":" Discovering causal relations among semantic factors is an emergent topic in\nrepresentation learning. Most causal representation learning (CRL) methods are\nfully supervised, which is impractical due to costly labeling. To resolve this\nrestriction, weakly supervised CRL methods were introduced. To evaluate CRL\nperformance, four existing datasets, Pendulum, Flow, CelebA(BEARD) and\nCelebA(SMILE), are utilized. However, existing CRL datasets are limited to\nsimple graphs with few generative factors. Thus we propose two new datasets\nwith a larger number of diverse generative factors and more sophisticated\ncausal graphs. In addition, current real datasets, CelebA(BEARD) and\nCelebA(SMILE), the originally proposed causal graphs are not aligned with the\ndataset distributions. Thus, we propose modifications to them.\n","authors":["Jiageng Zhu","Hanchen Xie","Jianhua Wu","Jiazhi Li","Mahyar Khayatkhoei","Mohamed E. Hussein","Wael AbdAlmageed"],"pdf_url":"https://arxiv.org/pdf/2308.05707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05695v1","updated":"2023-08-10T16:57:14Z","published":"2023-08-10T16:57:14Z","title":"Masked Diffusion as Self-supervised Representation Learner","summary":" Denoising diffusion probabilistic models have recently demonstrated\nstate-of-the-art generative performance and been used as strong pixel-level\nrepresentation learners. This paper decomposes the interrelation between the\ngenerative capability and representation learning ability inherent in diffusion\nmodels. We present masked diffusion model (MDM), a scalable self-supervised\nrepresentation learner that substitutes the conventional additive Gaussian\nnoise of traditional diffusion with a masking mechanism. 
Our proposed approach\nconvincingly surpasses prior benchmarks, demonstrating remarkable advancements\nin both medical and natural image semantic segmentation tasks, particularly\nwithin the context of few-shot scenario.\n","authors":["Zixuan Pan","Jianxu Chen","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2308.05695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05681v1","updated":"2023-08-10T16:34:20Z","published":"2023-08-10T16:34:20Z","title":"Hard No-Box Adversarial Attack on Skeleton-Based Human Action\n Recognition with Skeleton-Motion-Informed Gradient","summary":" Recently, methods for skeleton-based human activity recognition have been\nshown to be vulnerable to adversarial attacks. However, these attack methods\nrequire either the full knowledge of the victim (i.e. white-box attacks),\naccess to training data (i.e. transfer-based attacks) or frequent model queries\n(i.e. black-box attacks). All their requirements are highly restrictive,\nraising the question of how detrimental the vulnerability is. In this paper, we\nshow that the vulnerability indeed exists. To this end, we consider a new\nattack task: the attacker has no access to the victim model or the training\ndata or labels, where we coin the term hard no-box attack. Specifically, we\nfirst learn a motion manifold where we define an adversarial loss to compute a\nnew gradient for the attack, named skeleton-motion-informed (SMI) gradient. Our\ngradient contains information of the motion dynamics, which is different from\nexisting gradient-based attack methods that compute the loss gradient assuming\neach dimension in the data is independent. The SMI gradient can augment many\ngradient-based attack methods, leading to a new family of no-box attack\nmethods. Extensive evaluation and comparison show that our method imposes a\nreal threat to existing classifiers. They also show that the SMI gradient\nimproves the transferability and imperceptibility of adversarial samples in\nboth no-box and transfer-based black-box settings.\n","authors":["Zhengzhi Lu","He Wang","Ziyi Chang","Guoan Yang","Hubert P. H. Shum"],"pdf_url":"https://arxiv.org/pdf/2308.05681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03712v2","updated":"2023-08-10T16:23:03Z","published":"2023-08-07T16:31:38Z","title":"Scaling may be all you need for achieving human-level object recognition\n capacity with human-like visual experience","summary":" This paper asks whether current self-supervised learning methods, if\nsufficiently scaled up, would be able to reach human-level visual object\nrecognition capabilities with the same type and amount of visual experience\nhumans learn from. Previous work on this question only considered the scaling\nof data size. Here, we consider the simultaneous scaling of data size, model\nsize, and image resolution. We perform a scaling experiment with vision\ntransformers up to 633M parameters in size (ViT-H/14) trained with up to 5K\nhours of human-like video data (long, continuous, mostly egocentric videos)\nwith image resolutions of up to 476x476 pixels. The efficiency of masked\nautoencoders (MAEs) as a self-supervised learning algorithm makes it possible\nto run this scaling experiment on an unassuming academic budget. We find that\nit is feasible to reach human-level object recognition capacity at sub-human\nscales of model size, data size, and image size, if these factors are scaled up\nsimultaneously. 
To give a concrete example, we estimate that a 2.5B parameter\nViT model trained with 20K hours (2.3 years) of human-like video data with a\nspatial resolution of 952x952 pixels should be able to reach roughly\nhuman-level accuracy on ImageNet. Human-level competence is thus achievable for\na fundamental perceptual capability from human-like perceptual experience\n(human-like in both amount and type) with extremely generic learning algorithms\nand architectures and without any substantive inductive biases.\n","authors":["A. Emin Orhan"],"pdf_url":"https://arxiv.org/pdf/2308.03712v2.pdf","comment":"v2 adds an Appendix containing results with alternative scaling\n functions; code & models available from\n https://github.com/eminorhan/humanlike-vits"},{"id":"http://arxiv.org/abs/2308.05667v1","updated":"2023-08-10T16:10:54Z","published":"2023-08-10T16:10:54Z","title":"2D3D-MATR: 2D-3D Matching Transformer for Detection-free Registration\n between Images and Point Clouds","summary":" The commonly adopted detect-then-match approach to registration finds\ndifficulties in the cross-modality cases due to the incompatible keypoint\ndetection and inconsistent feature description. We propose, 2D3D-MATR, a\ndetection-free method for accurate and robust registration between images and\npoint clouds. Our method adopts a coarse-to-fine pipeline where it first\ncomputes coarse correspondences between downsampled patches of the input image\nand the point cloud and then extends them to form dense correspondences between\npixels and points within the patch region. The coarse-level patch matching is\nbased on transformer which jointly learns global contextual constraints with\nself-attention and cross-modality correlations with cross-attention. To resolve\nthe scale ambiguity in patch matching, we construct a multi-scale pyramid for\neach image patch and learn to find for each point patch the best matching image\npatch at a proper resolution level. Extensive experiments on two public\nbenchmarks demonstrate that 2D3D-MATR outperforms the previous state-of-the-art\nP2-Net by around $20$ percentage points on inlier ratio and over $10$ points on\nregistration recall. Our code and models are available at\n\\url{https://github.com/minhaolee/2D3DMATR}.\n","authors":["Minhao Li","Zheng Qin","Zhirui Gao","Renjiao Yi","Chengyang Zhu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2308.05667v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2301.05221v2","updated":"2023-08-10T16:05:14Z","published":"2023-01-12T18:59:08Z","title":"Open-vocabulary Object Segmentation with Diffusion Models","summary":" The goal of this paper is to extract the visual-language correspondence from\na pre-trained text-to-image diffusion model, in the form of segmentation map,\ni.e., simultaneously generating images and segmentation masks for the\ncorresponding visual entities described in the text prompt. 
We make the\nfollowing contributions: (i) we pair the existing Stable Diffusion model with a\nnovel grounding module, that can be trained to align the visual and textual\nembedding space of the diffusion model with only a small number of object\ncategories; (ii) we establish an automatic pipeline for constructing a dataset,\nthat consists of {image, segmentation mask, text prompt} triplets, to train the\nproposed grounding module; (iii) we evaluate the performance of open-vocabulary\ngrounding on images generated from the text-to-image diffusion model and show\nthat the module can well segment the objects of categories beyond seen ones at\ntraining time; (iv) we adopt the augmented diffusion model to build a synthetic\nsemantic segmentation dataset, and show that, training a standard segmentation\nmodel on such dataset demonstrates competitive performance on the zero-shot\nsegmentation(ZS3) benchmark, which opens up new opportunities for adopting the\npowerful diffusion model for discriminative tasks.\n","authors":["Ziyi Li","Qinye Zhou","Xiaoyun Zhang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2301.05221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.06193v3","updated":"2023-08-10T16:03:31Z","published":"2021-12-12T09:57:59Z","title":"GUNNEL: Guided Mixup Augmentation and Multi-View Fusion for Aquatic\n Animal Segmentation","summary":" Recent years have witnessed great advances in object segmentation research.\nIn addition to generic objects, aquatic animals have attracted research\nattention. Deep learning-based methods are widely used for aquatic animal\nsegmentation and have achieved promising performance. However, there is a lack\nof challenging datasets for benchmarking. In this work, we build a new dataset\ndubbed Aquatic Animal Species. We also devise a novel GUided mixup augmeNtatioN\nand multi-modEl fusion for aquatic animaL segmentation (GUNNEL) that leverages\nthe advantages of multiple segmentation models to effectively segment aquatic\nanimals and improves the training performance by synthesizing hard samples.\nExtensive experiments demonstrated the superiority of our proposed framework\nover existing state-of-the-art instance segmentation methods. The code is\navailable at https://github.com/lmquan2000/mask-mixup. The dataset is available\nat https://doi.org/10.5281/zenodo.8208877 .\n","authors":["Minh-Quan Le","Trung-Nghia Le","Tam V. Nguyen","Isao Echizen","Minh-Triet Tran"],"pdf_url":"https://arxiv.org/pdf/2112.06193v3.pdf","comment":"The code is available at https://github.com/lmquan2000/mask-mixup .\n The dataset is available at https://doi.org/10.5281/zenodo.8208877"},{"id":"http://arxiv.org/abs/2308.05659v1","updated":"2023-08-10T15:58:28Z","published":"2023-08-10T15:58:28Z","title":"AD-CLIP: Adapting Domains in Prompt Space Using CLIP","summary":" Although deep learning models have shown impressive performance on supervised\nlearning tasks, they often struggle to generalize well when the training\n(source) and test (target) domains differ. Unsupervised domain adaptation (DA)\nhas emerged as a popular solution to this problem. However, current DA\ntechniques rely on visual backbones, which may lack semantic richness. Despite\nthe potential of large-scale vision-language foundation models like CLIP, their\neffectiveness for DA has yet to be fully explored. To address this gap, we\nintroduce AD-CLIP, a domain-agnostic prompt learning strategy for CLIP that\naims to solve the DA problem in the prompt space. 
We leverage the frozen vision\nbackbone of CLIP to extract both image style (domain) and content information,\nwhich we apply to learn prompt tokens. Our prompts are designed to be\ndomain-invariant and class-generalizable, by conditioning prompt learning on\nimage style and content features simultaneously. We use standard supervised\ncontrastive learning in the source domain, while proposing an entropy\nminimization strategy to align domains in the embedding space given the target\ndomain data. We also consider a scenario where only target domain samples are\navailable during testing, without any source domain data, and propose a\ncross-domain style mapping network to hallucinate domain-agnostic tokens. Our\nextensive experiments on three benchmark DA datasets demonstrate the\neffectiveness of AD-CLIP compared to existing literature.\n","authors":["Mainak Singha","Harsh Pal","Ankit Jha","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2308.05659v1.pdf","comment":"10 pages, 8 figures, 4 tables. Accepted at OOD-CV, ICCV Workshop,\n 2023"},{"id":"http://arxiv.org/abs/2308.05655v1","updated":"2023-08-10T15:53:35Z","published":"2023-08-10T15:53:35Z","title":"Attention-based 3D CNN with Multi-layer Features for Alzheimer's Disease\n Diagnosis using Brain Images","summary":" Structural MRI and PET imaging play an important role in the diagnosis of\nAlzheimer's disease (AD), showing the morphological changes and glucose\nmetabolism changes in the brain respectively. The manifestations in the brain\nimage of some cognitive impairment patients are relatively inconspicuous, for\nexample, it still has difficulties in achieving accurate diagnosis through sMRI\nin clinical practice. With the emergence of deep learning, convolutional neural\nnetwork (CNN) has become a valuable method in AD-aided diagnosis, but some CNN\nmethods cannot effectively learn the features of brain image, making the\ndiagnosis of AD still presents some challenges. In this work, we propose an\nend-to-end 3D CNN framework for AD diagnosis based on ResNet, which integrates\nmulti-layer features obtained under the effect of the attention mechanism to\nbetter capture subtle differences in brain images. The attention maps showed\nour model can focus on key brain regions related to the disease diagnosis. Our\nmethod was verified in ablation experiments with two modality images on 792\nsubjects from the ADNI database, where AD diagnostic accuracies of 89.71% and\n91.18% were achieved based on sMRI and PET respectively, and also outperformed\nsome state-of-the-art methods.\n","authors":["Yanteng Zhang","Qizhi Teng","Xiaohai He","Tong Niu","Lipei Zhang","Yan Liu","Chao Ren"],"pdf_url":"https://arxiv.org/pdf/2308.05655v1.pdf","comment":"4 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.05648v1","updated":"2023-08-10T15:45:45Z","published":"2023-08-10T15:45:45Z","title":"Counterfactual Cross-modality Reasoning for Weakly Supervised Video\n Moment Localization","summary":" Video moment localization aims to retrieve the target segment of an untrimmed\nvideo according to the natural language query. Weakly supervised methods gains\nattention recently, as the precise temporal location of the target segment is\nnot always available. However, one of the greatest challenges encountered by\nthe weakly supervised method is implied in the mismatch between the video and\nlanguage induced by the coarse temporal annotations. 
To refine the\nvision-language alignment, recent works contrast the cross-modality\nsimilarities driven by reconstructing masked queries between positive and\nnegative video proposals. However, the reconstruction may be influenced by the\nlatent spurious correlation between the unmasked and the masked parts, which\ndistorts the restoring process and further degrades the efficacy of contrastive\nlearning since the masked words are not completely reconstructed from the\ncross-modality knowledge. In this paper, we discover and mitigate this spurious\ncorrelation through a novel proposed counterfactual cross-modality reasoning\nmethod. Specifically, we first formulate query reconstruction as an aggregated\ncausal effect of cross-modality and query knowledge. Then by introducing\ncounterfactual cross-modality knowledge into this aggregation, the spurious\nimpact of the unmasked part contributing to the reconstruction is explicitly\nmodeled. Finally, by suppressing the unimodal effect of masked query, we can\nrectify the reconstructions of video proposals to perform reasonable\ncontrastive learning. Extensive experimental evaluations demonstrate the\neffectiveness of our proposed method. The code is available at\n\\href{https://github.com/sLdZ0306/CCR}{https://github.com/sLdZ0306/CCR}.\n","authors":["Zezhong Lv","Bing Su","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.05648v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2112.02399v3","updated":"2023-08-10T15:31:54Z","published":"2021-12-04T18:34:24Z","title":"VT-CLIP: Enhancing Vision-Language Models with Visual-guided Texts","summary":" Contrastive Language-Image Pre-training (CLIP) has drawn increasing attention\nrecently for its transferable visual representation learning. However, due to\nthe semantic gap within datasets, CLIP's pre-trained image-text alignment\nbecomes sub-optimal on downstream tasks, which severely harms its transferring\nperformance. To better adapt the cross-modality embedding space, we propose to\nenhance CLIP via Visual-guided Texts, named VT-CLIP. Specifically, we guide\ntextual features of different categories to adaptively explore informative\nregions on the image and aggregate visual features by attention mechanisms. In\nthis way, the texts become visual-guided, namely, more semantically correlated\nwith downstream images, which greatly benefits the category-wise matching\nprocess. In few-shot settings, we evaluate our VT-CLIP on 11 well-known\nclassification datasets to demonstrate its effectiveness.\n","authors":["Longtian Qiu","Renrui Zhang","Ziyu Guo","Ziyao Zeng","Zilu Guo","Yafeng Li","Guangnan Zhang"],"pdf_url":"https://arxiv.org/pdf/2112.02399v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05633v1","updated":"2023-08-10T15:22:11Z","published":"2023-08-10T15:22:11Z","title":"IIHT: Medical Report Generation with Image-to-Indicator Hierarchical\n Transformer","summary":" Automated medical report generation has become increasingly important in\nmedical analysis. It can produce computer-aided diagnosis descriptions and thus\nsignificantly alleviate the doctors' work. Inspired by the huge success of\nneural machine translation and image captioning, various deep learning methods\nhave been proposed for medical report generation. 
However, due to the inherent\nproperties of medical data, including data imbalance and the length and\ncorrelation between report sequences, the generated reports by existing methods\nmay exhibit linguistic fluency but lack adequate clinical accuracy. In this\nwork, we propose an image-to-indicator hierarchical transformer (IIHT)\nframework for medical report generation. It consists of three modules, i.e., a\nclassifier module, an indicator expansion module and a generator module. The\nclassifier module first extracts image features from the input medical images\nand produces disease-related indicators with their corresponding states. The\ndisease-related indicators are subsequently utilised as input for the indicator\nexpansion module, incorporating the \"data-text-data\" strategy. The\ntransformer-based generator then leverages these extracted features along with\nimage features as auxiliary information to generate final reports. Furthermore,\nthe proposed IIHT method is feasible for radiologists to modify disease\nindicators in real-world scenarios and integrate the operations into the\nindicator expansion module for fluent and accurate medical report generation.\nExtensive experiments and comparisons with state-of-the-art methods under\nvarious evaluation metrics demonstrate the great performance of the proposed\nmethod.\n","authors":["Keqiang Fan","Xiaohao Cai","Mahesan Niranjan"],"pdf_url":"https://arxiv.org/pdf/2308.05633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.04278v2","updated":"2023-08-10T15:19:34Z","published":"2022-09-09T12:47:24Z","title":"Deep learning-based Crop Row Detection for Infield Navigation of\n Agri-Robots","summary":" Autonomous navigation in agricultural environments is challenged by varying\nfield conditions that arise in arable fields. State-of-the-art solutions for\nautonomous navigation in such environments require expensive hardware such as\nRTK-GNSS. This paper presents a robust crop row detection algorithm that\nwithstands such field variations using inexpensive cameras. Existing datasets\nfor crop row detection does not represent all the possible field variations. A\ndataset of sugar beet images was created representing 11 field variations\ncomprised of multiple grow stages, light levels, varying weed densities, curved\ncrop rows and discontinuous crop rows. The proposed pipeline segments the crop\nrows using a deep learning-based method and employs the predicted segmentation\nmask for extraction of the central crop using a novel central crop row\nselection algorithm. The novel crop row detection algorithm was tested for crop\nrow detection performance and the capability of visual servoing along a crop\nrow. The visual servoing-based navigation was tested on a realistic simulation\nscenario with the real ground and plant textures. Our algorithm demonstrated\nrobust vision-based crop row detection in challenging field conditions\noutperforming the baseline.\n","authors":["Rajitha de Silva","Grzegorz Cielniak","Gang Wang","Junfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2209.04278v2.pdf","comment":"Published in Journal of Field Robotics:\n https://onlinelibrary.wiley.com/doi/epdf/10.1002/rob.22238"},{"id":"http://arxiv.org/abs/2304.10769v4","updated":"2023-08-10T14:46:15Z","published":"2023-04-21T06:35:54Z","title":"Deep Multiview Clustering by Contrasting Cluster Assignments","summary":" Multiview clustering (MVC) aims to reveal the underlying structure of\nmultiview data by categorizing data samples into clusters. 
Deep learning-based\nmethods exhibit strong feature learning capabilities on large-scale datasets.\nFor most existing deep MVC methods, exploring the invariant representations of\nmultiple views is still an intractable problem. In this paper, we propose a\ncross-view contrastive learning (CVCL) method that learns view-invariant\nrepresentations and produces clustering results by contrasting the cluster\nassignments among multiple views. Specifically, we first employ deep\nautoencoders to extract view-dependent features in the pretraining stage. Then,\na cluster-level CVCL strategy is presented to explore consistent semantic label\ninformation among the multiple views in the fine-tuning stage. Thus, the\nproposed CVCL method is able to produce more discriminative cluster assignments\nby virtue of this learning strategy. Moreover, we provide a theoretical\nanalysis of soft cluster assignment alignment. Extensive experimental results\nobtained on several datasets demonstrate that the proposed CVCL method\noutperforms several state-of-the-art approaches.\n","authors":["Jie Chen","Hua Mao","Wai Lok Woo","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2304.10769v4.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.05605v1","updated":"2023-08-10T14:32:18Z","published":"2023-08-10T14:32:18Z","title":"Self-Supervised Monocular Depth Estimation by Direction-aware Cumulative\n Convolution Network","summary":" Monocular depth estimation is known as an ill-posed task in which objects in\na 2D image usually do not contain sufficient information to predict their\ndepth. Thus, it acts differently from other tasks (e.g., classification and\nsegmentation) in many ways. In this paper, we find that self-supervised\nmonocular depth estimation shows a direction sensitivity and environmental\ndependency in the feature representation. But the current backbones borrowed\nfrom other tasks pay less attention to handling different types of\nenvironmental information, limiting the overall depth accuracy. To bridge this\ngap, we propose a new Direction-aware Cumulative Convolution Network (DaCCN),\nwhich improves the depth feature representation in two aspects. First, we\npropose a direction-aware module, which can learn to adjust the feature\nextraction in each direction, facilitating the encoding of different types of\ninformation. Secondly, we design a new cumulative convolution to improve the\nefficiency for aggregating important environmental information. Experiments\nshow that our method achieves significant improvements on three widely used\nbenchmarks, KITTI, Cityscapes, and Make3D, setting a new state-of-the-art\nperformance on the popular benchmarks with all three types of self-supervision.\n","authors":["Wencheng Han","Junbo Yin","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2308.05605v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.05602v1","updated":"2023-08-10T14:21:33Z","published":"2023-08-10T14:21:33Z","title":"Object Goal Navigation with Recursive Implicit Maps","summary":" Object goal navigation aims to navigate an agent to locations of a given\nobject category in unseen environments. Classical methods explicitly build maps\nof environments and require extensive engineering while lacking semantic\ninformation for object-oriented exploration. On the other hand, end-to-end\nlearning methods alleviate manual map design and predict actions using implicit\nrepresentations. 
Such methods, however, lack an explicit notion of geometry and\nmay have limited ability to encode navigation history. In this work, we propose\nan implicit spatial map for object goal navigation. Our implicit map is\nrecursively updated with new observations at each step using a transformer. To\nencourage spatial reasoning, we introduce auxiliary tasks and train our model\nto reconstruct explicit maps as well as to predict visual features, semantic\nlabels and actions. Our method significantly outperforms the state of the art\non the challenging MP3D dataset and generalizes well to the HM3D dataset. We\nsuccessfully deploy our model on a real robot and achieve encouraging object\ngoal navigation results in real scenes using only a few real-world\ndemonstrations. Code, trained models and videos are available at\n\\url{https://www.di.ens.fr/willow/research/onav_rim/}.\n","authors":["Shizhe Chen","Thomas Chabal","Ivan Laptev","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2308.05602v1.pdf","comment":"Accepted to IROS 2023"},{"id":"http://arxiv.org/abs/2308.05600v1","updated":"2023-08-10T14:19:58Z","published":"2023-08-10T14:19:58Z","title":"NUPES : Non-Uniform Post-Training Quantization via Power Exponent Search","summary":" Deep neural network (DNN) deployment has been confined to larger hardware\ndevices due to their expensive computational requirements. This challenge has\nrecently reached another scale with the emergence of large language models\n(LLMs). In order to reduce both their memory footprint and latency, a promising\ntechnique is quantization. It consists in converting floating point\nrepresentations to low bit-width fixed point representations, usually by\nassuming a uniform mapping onto a regular grid. This process, referred to in\nthe literature as uniform quantization, may however be ill-suited as most DNN\nweights and activations follow a bell-shaped distribution. This is even worse\non LLMs whose weight distributions are known to exhibit large, high impact,\noutlier values. In this work, we propose an improvement over the most commonly\nadopted way to tackle this limitation in deep learning models quantization,\nnamely, non-uniform quantization. NUPES leverages automorphisms to preserve the\nscalar multiplications. Such transformations are derived from power functions.\nHowever, the optimization of the exponent parameter and weight values remains a\nchallenging and novel problem which could not be solved with previous post\ntraining optimization techniques which only learn to round up or down weight\nvalues in order to preserve the predictive function. We circumvent this\nlimitation with a new paradigm: learning new quantized weights over the entire\nquantized space. Similarly, we enable the optimization of the power exponent,\ni.e. the optimization of the quantization operator itself during training by\nalleviating all the numerical instabilities. The resulting predictive function\nis compatible with integer-only low-bit inference. 
We show the ability of the\nmethod to achieve state-of-the-art compression rates in both, data-free and\ndata-driven configurations.\n","authors":["Edouard Yvinec","Arnaud Dapogny","Kevin Bailly"],"pdf_url":"https://arxiv.org/pdf/2308.05600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05595v1","updated":"2023-08-10T14:08:50Z","published":"2023-08-10T14:08:50Z","title":"Test-Time Selection for Robust Skin Lesion Analysis","summary":" Skin lesion analysis models are biased by artifacts placed during image\nacquisition, which influence model predictions despite carrying no clinical\ninformation. Solutions that address this problem by regularizing models to\nprevent learning those spurious features achieve only partial success, and\nexisting test-time debiasing techniques are inappropriate for skin lesion\nanalysis due to either making unrealistic assumptions on the distribution of\ntest data or requiring laborious annotation from medical practitioners. We\npropose TTS (Test-Time Selection), a human-in-the-loop method that leverages\npositive (e.g., lesion area) and negative (e.g., artifacts) keypoints in test\nsamples. TTS effectively steers models away from exploiting spurious\nartifact-related correlations without retraining, and with less annotation\nrequirements. Our solution is robust to a varying availability of annotations,\nand different levels of bias. We showcase on the ISIC2019 dataset (for which we\nrelease a subset of annotated images) how our model could be deployed in the\nreal-world for mitigating bias.\n","authors":["Alceu Bissoto","Catarina Barata","Eduardo Valle","Sandra Avila"],"pdf_url":"https://arxiv.org/pdf/2308.05595v1.pdf","comment":"Accepted at ISIC Workshop @ MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.05581v1","updated":"2023-08-10T13:44:54Z","published":"2023-08-10T13:44:54Z","title":"Category Feature Transformer for Semantic Segmentation","summary":" Aggregation of multi-stage features has been revealed to play a significant\nrole in semantic segmentation. Unlike previous methods employing point-wise\nsummation or concatenation for feature aggregation, this study proposes the\nCategory Feature Transformer (CFT) that explores the flow of category embedding\nand transformation among multi-stage features through the prevalent multi-head\nattention mechanism. CFT learns unified feature embeddings for individual\nsemantic categories from high-level features during each aggregation process\nand dynamically broadcasts them to high-resolution features. Integrating the\nproposed CFT into a typical feature pyramid structure exhibits superior\nperformance over a broad range of backbone networks. We conduct extensive\nexperiments on popular semantic segmentation benchmarks. Specifically, the\nproposed CFT obtains a compelling 55.1% mIoU with greatly reduced model\nparameters and computations on the challenging ADE20K dataset.\n","authors":["Quan Tang","Chuanjian Liu","Fagui Liu","Yifan Liu","Jun Jiang","Bowen Zhang","Kai Han","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05574v1","updated":"2023-08-10T13:38:09Z","published":"2023-08-10T13:38:09Z","title":"Exploring Linguistic Similarity and Zero-Shot Learning for Multilingual\n Translation of Dravidian Languages","summary":" Current research in zero-shot translation is plagued by several issues such\nas high compute requirements, increased training time and off target\ntranslations. 
Proposed remedies often come at the cost of additional data or\ncompute requirements. Pivot based neural machine translation is preferred over\na single-encoder model for most settings despite the increased training and\nevaluation time. In this work, we overcome the shortcomings of zero-shot\ntranslation by taking advantage of transliteration and linguistic similarity.\nWe build a single encoder-decoder neural machine translation system for\nDravidian-Dravidian multilingual translation and perform zero-shot translation.\nWe compare the data vs zero-shot accuracy tradeoff and evaluate the performance\nof our vanilla method against the current state of the art pivot based method.\nWe also test the theory that morphologically rich languages require large\nvocabularies by restricting the vocabulary using an optimal transport based\ntechnique. Our model manages to achieves scores within 3 BLEU of large-scale\npivot-based models when it is trained on 50\\% of the language directions.\n","authors":["Danish Ebadulla","Rahul Raman","S. Natarajan","Hridhay Kiran Shetty","Ashish Harish Shenoy"],"pdf_url":"https://arxiv.org/pdf/2308.05574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05550v1","updated":"2023-08-10T13:06:05Z","published":"2023-08-10T13:06:05Z","title":"Cross-Domain Product Representation Learning for Rich-Content E-Commerce","summary":" The proliferation of short video and live-streaming platforms has\nrevolutionized how consumers engage in online shopping. Instead of browsing\nproduct pages, consumers are now turning to rich-content e-commerce, where they\ncan purchase products through dynamic and interactive media like short videos\nand live streams. This emerging form of online shopping has introduced\ntechnical challenges, as products may be presented differently across various\nmedia domains. Therefore, a unified product representation is essential for\nachieving cross-domain product recognition to ensure an optimal user search\nexperience and effective product recommendations. Despite the urgent industrial\nneed for a unified cross-domain product representation, previous studies have\npredominantly focused only on product pages without taking into account short\nvideos and live streams. To fill the gap in the rich-content e-commerce area,\nin this paper, we introduce a large-scale cRoss-dOmain Product Ecognition\ndataset, called ROPE. ROPE covers a wide range of product categories and\ncontains over 180,000 products, corresponding to millions of short videos and\nlive streams. It is the first dataset to cover product pages, short videos, and\nlive streams simultaneously, providing the basis for establishing a unified\nproduct representation across different media domains. Furthermore, we propose\na Cross-dOmain Product rEpresentation framework, namely COPE, which unifies\nproduct representations in different domains through multimodal learning\nincluding text and vision. Extensive experiments on downstream tasks\ndemonstrate the effectiveness of COPE in learning a joint feature space for all\nproduct domains.\n","authors":["Xuehan Bai","Yan Li","Yanhua Cheng","Wenjie Yang","Quan Chen","Han Li"],"pdf_url":"https://arxiv.org/pdf/2308.05550v1.pdf","comment":"ICCV23"},{"id":"http://arxiv.org/abs/2308.05543v1","updated":"2023-08-10T12:53:30Z","published":"2023-08-10T12:53:30Z","title":"Deep Richardson-Lucy Deconvolution for Low-Light Image Deblurring","summary":" Images taken under the low-light condition often contain blur and saturated\npixels at the same time. 
Deblurring images with saturated pixels is quite\nchallenging. Because of the limited dynamic range, the saturated pixels are\nusually clipped in the imaging process and thus cannot be modeled by the linear\nblur model. Previous methods use manually designed smooth functions to\napproximate the clipping procedure. Their deblurring processes often require\nempirically defined parameters, which may not be the optimal choices for\ndifferent images. In this paper, we develop a data-driven approach to model the\nsaturated pixels by a learned latent map. Based on the new model, the non-blind\ndeblurring task can be formulated into a maximum a posterior (MAP) problem,\nwhich can be effectively solved by iteratively computing the latent map and the\nlatent image. Specifically, the latent map is computed by learning from a map\nestimation network (MEN), and the latent image estimation process is\nimplemented by a Richardson-Lucy (RL)-based updating scheme. To estimate\nhigh-quality deblurred images without amplified artifacts, we develop a prior\nestimation network (PEN) to obtain prior information, which is further\nintegrated into the RL scheme. Experimental results demonstrate that the\nproposed method performs favorably against state-of-the-art algorithms both\nquantitatively and qualitatively on synthetic and real-world images.\n","authors":["Liang Chen","Jiawei Zhang","Zhenhua Li","Yunxuan Wei","Faming Fang","Jimmy Ren","Jinshan Pan"],"pdf_url":"https://arxiv.org/pdf/2308.05543v1.pdf","comment":"Accepted by IJCV"},{"id":"http://arxiv.org/abs/2210.04087v3","updated":"2023-08-10T12:42:06Z","published":"2022-10-08T18:49:58Z","title":"Symmetry Defense Against CNN Adversarial Perturbation Attacks","summary":" This paper uses symmetry to make Convolutional Neural Network classifiers\n(CNNs) robust against adversarial perturbation attacks. Such attacks add\nperturbation to original images to generate adversarial images that fool\nclassifiers such as road sign classifiers of autonomous vehicles. Although\nsymmetry is a pervasive aspect of the natural world, CNNs are unable to handle\nsymmetry well. For example, a CNN can classify an image differently from its\nmirror image. For an adversarial image that misclassifies with a wrong label\n$l_w$, CNN inability to handle symmetry means that a symmetric adversarial\nimage can classify differently from the wrong label $l_w$. Further than that,\nwe find that the classification of a symmetric adversarial image reverts to the\ncorrect label. To classify an image when adversaries are unaware of the\ndefense, we apply symmetry to the image and use the classification label of the\nsymmetric image. To classify an image when adversaries are aware of the\ndefense, we use mirror symmetry and pixel inversion symmetry to form a symmetry\ngroup. We apply all the group symmetries to the image and decide on the output\nlabel based on the agreement of any two of the classification labels of the\nsymmetry images. Adaptive attacks fail because they need to rely on loss\nfunctions that use conflicting CNN output values for symmetric images. Without\nattack knowledge, the proposed symmetry defense succeeds against both\ngradient-based and random-search attacks, with up to near-default accuracies\nfor ImageNet. 
The defense even improves the classification accuracy of original\nimages.\n","authors":["Blerta Lindqvist"],"pdf_url":"https://arxiv.org/pdf/2210.04087v3.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2308.05542v1","updated":"2023-08-10T12:41:08Z","published":"2023-08-10T12:41:08Z","title":"Robust Asymmetric Loss for Multi-Label Long-Tailed Learning","summary":" In real medical data, training samples typically show long-tailed\ndistributions with multiple labels. Class distribution of the medical data has\na long-tailed shape, in which the incidence of different diseases is quite\nvaried, and at the same time, it is not unusual for images taken from\nsymptomatic patients to be multi-label diseases. Therefore, in this paper, we\nconcurrently address these two issues by putting forth a robust asymmetric loss\non the polynomial function. Since our loss tackles both long-tailed and\nmulti-label classification problems simultaneously, it leads to a complex\ndesign of the loss function with a large number of hyper-parameters. Although a\nmodel can be highly fine-tuned due to a large number of hyper-parameters, it is\ndifficult to optimize all hyper-parameters at the same time, and there might be\na risk of overfitting a model. Therefore, we regularize the loss function using\nthe Hill loss approach, which is beneficial to be less sensitive against the\nnumerous hyper-parameters so that it reduces the risk of overfitting the model.\nFor this reason, the proposed loss is a generic method that can be applied to\nmost medical image classification tasks and does not make the training process\nmore time-consuming. We demonstrate that the proposed robust asymmetric loss\nperforms favorably against the long-tailed with multi-label medical image\nclassification in addition to the various long-tailed single-label datasets.\nNotably, our method achieves Top-5 results on the CXR-LT dataset of the ICCV\nCVAMD 2023 competition. We opensource our implementation of the robust\nasymmetric loss in the public repository: https://github.com/kalelpark/RAL.\n","authors":["Wongi Park","Inhyuk Park","Sungeun Kim","Jongbin Ryu"],"pdf_url":"https://arxiv.org/pdf/2308.05542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05533v1","updated":"2023-08-10T12:23:47Z","published":"2023-08-10T12:23:47Z","title":"Is there progress in activity progress prediction?","summary":" Activity progress prediction aims to estimate what percentage of an activity\nhas been completed. Currently this is done with machine learning approaches,\ntrained and evaluated on complicated and realistic video datasets. The videos\nin these datasets vary drastically in length and appearance. And some of the\nactivities have unanticipated developments, making activity progression\ndifficult to estimate. In this work, we examine the results obtained by\nexisting progress prediction methods on these datasets. We find that current\nprogress prediction methods seem not to extract useful visual information for\nthe progress prediction task. Therefore, these methods fail to exceed simple\nframe-counting baselines. We design a precisely controlled dataset for activity\nprogress prediction and on this synthetic dataset we show that the considered\nmethods can make use of the visual information, when this directly relates to\nthe progress prediction. We conclude that the progress prediction task is\nill-posed on the currently used real-world datasets. 
Moreover, to fairly\nmeasure activity progression we advise to consider a, simple but effective,\nframe-counting baseline.\n","authors":["Frans de Boer","Jan C. van Gemert","Jouke Dijkstra","Silvia L. Pintea"],"pdf_url":"https://arxiv.org/pdf/2308.05533v1.pdf","comment":"Accepted at ICCVw-2023 (AI for Creative Video Editing and\n Understanding, ICCV workshop 2023)"},{"id":"http://arxiv.org/abs/2308.05525v1","updated":"2023-08-10T12:06:03Z","published":"2023-08-10T12:06:03Z","title":"Critical Points ++: An Agile Point Cloud Importance Measure for Robust\n Classification, Adversarial Defense and Explainable AI","summary":" The ability to cope accurately and fast with Out-Of-Distribution (OOD)\nsamples is crucial in real-world safety demanding applications. In this work we\nfirst study the interplay between critical points of 3D point clouds and OOD\nsamples. Our findings are that common corruptions and outliers are often\ninterpreted as critical points. We generalize the notion of critical points\ninto importance measures. We show that training a classification network based\nonly on less important points dramatically improves robustness, at a cost of\nminor performance loss on the clean set. We observe that normalized entropy is\nhighly informative for corruption analysis. An adaptive threshold based on\nnormalized entropy is suggested for selecting the set of uncritical points. Our\nproposed importance measure is extremely fast to compute. We show it can be\nused for a variety of applications, such as Explainable AI (XAI), Outlier\nRemoval, Uncertainty Estimation, Robust Classification and Adversarial Defense.\nWe reach SOTA results on the two latter tasks.\n","authors":["Meir Yossef Levi","Guy Gilboa"],"pdf_url":"https://arxiv.org/pdf/2308.05525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13720v4","updated":"2023-08-10T11:54:53Z","published":"2023-06-23T18:08:00Z","title":"Decoupled Diffusion Models with Explicit Transition Probability","summary":" Recent diffusion probabilistic models (DPMs) have shown remarkable abilities\nof generated content, however, they often suffer from complex forward\nprocesses, resulting in inefficient solutions for the reversed process and\nprolonged sampling times. In this paper, we aim to address the aforementioned\nchallenges by focusing on the diffusion process itself that we propose to\ndecouple the intricate diffusion process into two comparatively simpler process\nto improve the generative efficacy and speed. In particular, we present a novel\ndiffusion paradigm named DDM (Decoupled Diffusion Models) based on the Ito\ndiffusion process, in which the image distribution is approximated by an\nexplicit transition probability while the noise path is controlled by the\nstandard Wiener process. We find that decoupling the diffusion process reduces\nthe learning difficulty and the explicit transition probability improves the\ngenerative speed significantly. We prove a new training objective for DPM,\nwhich enables the model to learn to predict the noise and image components\nseparately. Moreover, given the novel forward diffusion equation, we derive the\nreverse denoising formula of DDM that naturally supports fewer steps of\ngeneration without ordinary differential equation (ODE) based accelerators. Our\nexperiments demonstrate that DDM outperforms previous DPMs by a large margin in\nfewer function evaluations setting and gets comparable performances in long\nfunction evaluations setting. 
We also show that our framework can be applied to\nimage-conditioned generation and high-resolution image synthesis, and that it\ncan generate high-quality images with only 10 function evaluations.\n","authors":["Yuhang Huang","Zheng Qin","Xinwang Liu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2306.13720v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10816v3","updated":"2023-08-10T11:54:46Z","published":"2023-07-20T12:25:06Z","title":"BoxDiff: Text-to-Image Synthesis with Training-Free Box-Constrained\n Diffusion","summary":" Recent text-to-image diffusion models have demonstrated an astonishing\ncapacity to generate high-quality images. However, researchers mainly studied\nthe way of synthesizing images with only text prompts. While some works have\nexplored using other modalities as conditions, considerable paired data, e.g.,\nbox/mask-image pairs, and fine-tuning time are required for nurturing models.\nAs such paired data is time-consuming and labor-intensive to acquire and\nrestricted to a closed set, this potentially becomes the bottleneck for\napplications in an open world. This paper focuses on the simplest form of\nuser-provided conditions, e.g., box or scribble. To mitigate the aforementioned\nproblem, we propose a training-free method to control objects and contexts in\nthe synthesized images adhering to the given spatial conditions. Specifically,\nthree spatial constraints, i.e., Inner-Box, Outer-Box, and Corner Constraints,\nare designed and seamlessly integrated into the denoising step of diffusion\nmodels, requiring no additional training and massive annotated layout data.\nExtensive results show that the proposed constraints can control what and where\nto present in the images while retaining the ability of the Stable Diffusion\nmodel to synthesize with high fidelity and diverse concept coverage. The code\nis publicly available at https://github.com/Sierkinhane/BoxDiff.\n","authors":["Jinheng Xie","Yuexiang Li","Yawen Huang","Haozhe Liu","Wentian Zhang","Yefeng Zheng","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2307.10816v3.pdf","comment":"Accepted by ICCV 2023. Code is available at:\n https://github.com/Sierkinhane/BoxDiff"},{"id":"http://arxiv.org/abs/2308.04868v2","updated":"2023-08-10T11:27:16Z","published":"2023-08-09T11:02:00Z","title":"InstantAvatar: Efficient 3D Head Reconstruction via Surface Rendering","summary":" Recent advances in full-head reconstruction have been obtained by optimizing\na neural field through differentiable surface or volume rendering to represent\na single scene. While these techniques achieve an unprecedented accuracy, they\ntake several minutes, or even hours, due to the expensive optimization process\nrequired. In this work, we introduce InstantAvatar, a method that recovers\nfull-head avatars from few images (down to just one) in a few seconds on\ncommodity hardware. In order to speed up the reconstruction process, we propose\na system that combines, for the first time, a voxel-grid neural field\nrepresentation with a surface renderer. Notably, a naive combination of these\ntwo techniques leads to unstable optimizations that do not converge to valid\nsolutions. In order to overcome this limitation, we present a novel statistical\nmodel that learns a prior distribution over 3D head signed distance functions\nusing a voxel-grid based architecture. 
The use of this prior model, in\ncombination with other design choices, results into a system that achieves 3D\nhead reconstructions with comparable accuracy as the state-of-the-art with a\n100x speed-up.\n","authors":["Antonio Canela","Pol Caselles","Ibrar Malik","Eduard Ramon","Jaime García","Jordi Sánchez-Riera","Gil Triginer","Francesc Moreno-Noguer"],"pdf_url":"https://arxiv.org/pdf/2308.04868v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07944v2","updated":"2023-08-10T11:05:23Z","published":"2023-07-16T04:34:11Z","title":"Revisiting Domain-Adaptive 3D Object Detection by Reliable, Diverse and\n Class-balanced Pseudo-Labeling","summary":" Unsupervised domain adaptation (DA) with the aid of pseudo labeling\ntechniques has emerged as a crucial approach for domain-adaptive 3D object\ndetection. While effective, existing DA methods suffer from a substantial drop\nin performance when applied to a multi-class training setting, due to the\nco-existence of low-quality pseudo labels and class imbalance issues. In this\npaper, we address this challenge by proposing a novel ReDB framework tailored\nfor learning to detect all classes at once. Our approach produces Reliable,\nDiverse, and class-Balanced pseudo 3D boxes to iteratively guide the\nself-training on a distributionally different target domain. To alleviate\ndisruptions caused by the environmental discrepancy (e.g., beam numbers), the\nproposed cross-domain examination (CDE) assesses the correctness of pseudo\nlabels by copy-pasting target instances into a source environment and measuring\nthe prediction consistency. To reduce computational overhead and mitigate the\nobject shift (e.g., scales and point densities), we design an overlapped boxes\ncounting (OBC) metric that allows to uniformly downsample pseudo-labeled\nobjects across different geometric characteristics. To confront the issue of\ninter-class imbalance, we progressively augment the target point clouds with a\nclass-balanced set of pseudo-labeled target instances and source objects, which\nboosts recognition accuracies on both frequently appearing and rare classes.\nExperimental results on three benchmark datasets using both voxel-based (i.e.,\nSECOND) and point-based 3D detectors (i.e., PointRCNN) demonstrate that our\nproposed ReDB approach outperforms existing 3D domain adaptation methods by a\nlarge margin, improving 23.15% mAP on the nuScenes $\\rightarrow$ KITTI task.\nThe code is available at https://github.com/zhuoxiao-chen/ReDB-DA-3Ddet.\n","authors":["Zhuoxiao Chen","Yadan Luo","Zheng Wang","Mahsa Baktashmotlagh","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2307.07944v2.pdf","comment":"Accepted by ICCV 2023, camera-ready"},{"id":"http://arxiv.org/abs/2202.03026v3","updated":"2023-08-10T11:01:14Z","published":"2022-02-07T09:33:45Z","title":"Context Autoencoder for Self-Supervised Representation Learning","summary":" We present a novel masked image modeling (MIM) approach, context autoencoder\n(CAE), for self-supervised representation pretraining. We pretrain an encoder\nby making predictions in the encoded representation space. The pretraining\ntasks include two tasks: masked representation prediction - predict the\nrepresentations for the masked patches, and masked patch reconstruction -\nreconstruct the masked patches. 
The network is an encoder-regressor-decoder\narchitecture: the encoder takes the visible patches as input; the regressor\npredicts the representations of the masked patches, which are expected to be\naligned with the representations computed from the encoder, using the\nrepresentations of visible patches and the positions of visible and masked\npatches; the decoder reconstructs the masked patches from the predicted encoded\nrepresentations. The CAE design encourages the separation of learning the\nencoder (representation) from completing the pertaining tasks: masked\nrepresentation prediction and masked patch reconstruction tasks, and making\npredictions in the encoded representation space empirically shows the benefit\nto representation learning. We demonstrate the effectiveness of our CAE through\nsuperior transfer performance in downstream tasks: semantic segmentation,\nobject detection and instance segmentation, and classification. The code will\nbe available at https://github.com/Atten4Vis/CAE.\n","authors":["Xiaokang Chen","Mingyu Ding","Xiaodi Wang","Ying Xin","Shentong Mo","Yunhao Wang","Shumin Han","Ping Luo","Gang Zeng","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2202.03026v3.pdf","comment":"Accepted by International Journal of Computer Vision (IJCV)"},{"id":"http://arxiv.org/abs/2308.05493v1","updated":"2023-08-10T10:47:12Z","published":"2023-08-10T10:47:12Z","title":"Look at the Neighbor: Distortion-aware Unsupervised Domain Adaptation\n for Panoramic Semantic Segmentation","summary":" Endeavors have been recently made to transfer knowledge from the labeled\npinhole image domain to the unlabeled panoramic image domain via Unsupervised\nDomain Adaptation (UDA). The aim is to tackle the domain gaps caused by the\nstyle disparities and distortion problem from the non-uniformly distributed\npixels of equirectangular projection (ERP). Previous works typically focus on\ntransferring knowledge based on geometric priors with specially designed\nmulti-branch network architectures. As a result, considerable computational\ncosts are induced, and meanwhile, their generalization abilities are profoundly\nhindered by the variation of distortion among pixels. In this paper, we find\nthat the pixels' neighborhood regions of the ERP indeed introduce less\ndistortion. Intuitively, we propose a novel UDA framework that can effectively\naddress the distortion problems for panoramic semantic segmentation. In\ncomparison, our method is simpler, easier to implement, and more\ncomputationally efficient. Specifically, we propose distortion-aware attention\n(DA) capturing the neighboring pixel distribution without using any geometric\nconstraints. Moreover, we propose a class-wise feature aggregation (CFA) module\nto iteratively update the feature representations with a memory bank. 
As such,\nthe feature similarity between two domains can be consistently optimized.\nExtensive experiments show that our method achieves new state-of-the-art\nperformance while remarkably reducing 80% parameters.\n","authors":["Xu Zheng","Tianbo Pan","Yunhao Luo","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05493v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.04995v2","updated":"2023-08-10T10:43:53Z","published":"2023-08-09T14:48:31Z","title":"IDiff-Face: Synthetic-based Face Recognition through Fizzy\n Identity-Conditioned Diffusion Models","summary":" The availability of large-scale authentic face databases has been crucial to\nthe significant advances made in face recognition research over the past\ndecade. However, legal and ethical concerns led to the recent retraction of\nmany of these databases by their creators, raising questions about the\ncontinuity of future face recognition research without one of its key\nresources. Synthetic datasets have emerged as a promising alternative to\nprivacy-sensitive authentic data for face recognition development. However,\nrecent synthetic datasets that are used to train face recognition models suffer\neither from limitations in intra-class diversity or cross-class (identity)\ndiscrimination, leading to less optimal accuracies, far away from the\naccuracies achieved by models trained on authentic data. This paper targets\nthis issue by proposing IDiff-Face, a novel approach based on conditional\nlatent diffusion models for synthetic identity generation with realistic\nidentity variations for face recognition training. Through extensive\nevaluations, our proposed synthetic-based face recognition approach pushed the\nlimits of state-of-the-art performances, achieving, for example, 98.00%\naccuracy on the Labeled Faces in the Wild (LFW) benchmark, far ahead from the\nrecent synthetic-based face recognition solutions with 95.40% and bridging the\ngap to authentic-based face recognition with 99.82% accuracy.\n","authors":["Fadi Boutros","Jonas Henry Grebe","Arjan Kuijper","Naser Damer"],"pdf_url":"https://arxiv.org/pdf/2308.04995v2.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2308.05480v1","updated":"2023-08-10T10:12:27Z","published":"2023-08-10T10:12:27Z","title":"YOLO-MS: Rethinking Multi-Scale Representation Learning for Real-time\n Object Detection","summary":" We aim at providing the object detection community with an efficient and\nperformant object detector, termed YOLO-MS. The core design is based on a\nseries of investigations on how convolutions with different kernel sizes affect\nthe detection performance of objects at different scales. The outcome is a new\nstrategy that can strongly enhance multi-scale feature representations of\nreal-time object detectors. To verify the effectiveness of our strategy, we\nbuild a network architecture, termed YOLO-MS. We train our YOLO-MS on the MS\nCOCO dataset from scratch without relying on any other large-scale datasets,\nlike ImageNet, or pre-trained weights. Without bells and whistles, our YOLO-MS\noutperforms the recent state-of-the-art real-time object detectors, including\nYOLO-v7 and RTMDet, when using a comparable number of parameters and FLOPs.\nTaking the XS version of YOLO-MS as an example, with only 4.5M learnable\nparameters and 8.7G FLOPs, it can achieve an AP score of 43%+ on MS COCO, which\nis about 2%+ higher than RTMDet with the same model size. Moreover, our work\ncan also be used as a plug-and-play module for other YOLO models. 
Typically,\nour method significantly improves the AP of YOLOv8 from 37%+ to 40%+ with even\nfewer parameters and FLOPs. Code is available at\nhttps://github.com/FishAndWasabi/YOLO-MS.\n","authors":["Yuming Chen","Xinbin Yuan","Ruiqi Wu","Jiabao Wang","Qibin Hou","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.05480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05478v1","updated":"2023-08-10T10:10:43Z","published":"2023-08-10T10:10:43Z","title":"Reviewing 3D Object Detectors in the Context of High-Resolution 3+1D\n Radar","summary":" Recent developments and the beginning market introduction of high-resolution\nimaging 4D (3+1D) radar sensors have initialized deep learning-based radar\nperception research. We investigate deep learning-based models operating on\nradar point clouds for 3D object detection. 3D object detection on lidar point\ncloud data is a mature area of 3D vision. Many different architectures have\nbeen proposed, each with strengths and weaknesses. Due to similarities between\n3D lidar point clouds and 3+1D radar point clouds, those existing 3D object\ndetectors are a natural basis to start deep learning-based 3D object detection\non radar data. Thus, the first step is to analyze the detection performance of\nthe existing models on the new data modality and evaluate them in depth. In\norder to apply existing 3D point cloud object detectors developed for lidar\npoint clouds to the radar domain, they need to be adapted first. While some\ndetectors, such as PointPillars, have already been adapted to be applicable to\nradar data, we have adapted others, e.g., Voxel R-CNN, SECOND, PointRCNN, and\nPV-RCNN. To this end, we conduct a cross-model validation (evaluating a set of\nmodels on one particular data set) as well as a cross-data set validation\n(evaluating all models in the model set on several data sets). The\nhigh-resolution radar data used are the View-of-Delft and Astyx data sets.\nFinally, we evaluate several adaptations of the models and their training\nprocedures. We also discuss major factors influencing the detection performance\non radar data and propose possible solutions indicating potential future\nresearch avenues.\n","authors":["Patrick Palmer","Martin Krueger","Richard Altendorfer","Ganesh Adam","Torsten Bertram"],"pdf_url":"https://arxiv.org/pdf/2308.05478v1.pdf","comment":"Published at CVPR 2023 Workshop on 3D Vision and Robotics\n (https://drive.google.com/file/d/1xj4R5ucH3PaR7QdRDJbbkjS-3iBUsruR/view)"},{"id":"http://arxiv.org/abs/2308.05474v1","updated":"2023-08-10T10:01:56Z","published":"2023-08-10T10:01:56Z","title":"Surface Masked AutoEncoder: Self-Supervision for Cortical Imaging Data","summary":" Self-supervision has been widely explored as a means of addressing the lack\nof inductive biases in vision transformer architectures, which limits\ngeneralisation when networks are trained on small datasets. This is crucial in\nthe context of cortical imaging, where phenotypes are complex and\nheterogeneous, but the available datasets are limited in size. This paper\nbuilds upon recent advancements in translating vision transformers to surface\nmeshes and investigates the potential of Masked AutoEncoder (MAE)\nself-supervision for cortical surface learning. By reconstructing surface data\nfrom a masked version of the input, the proposed method effectively models\ncortical structure to learn strong representations that translate to improved\nperformance in downstream tasks. 
We evaluate our approach on cortical phenotype\nregression using the developing Human Connectome Project (dHCP) and demonstrate\nthat pre-training leads to a 26\\% improvement in performance, with an 80\\%\nfaster convergence, compared to models trained from scratch. Furthermore, we\nestablish that pre-training vision transformer models on large datasets, such\nas the UK Biobank (UKB), enables the acquisition of robust representations for\nfinetuning in low-data scenarios. Our code and pre-trained models are publicly\navailable at \\url{https://github.com/metrics-lab/surface-vision-transformers}.\n","authors":["Simon Dahan","Mariana da Silva","Daniel Rueckert","Emma C Robinson"],"pdf_url":"https://arxiv.org/pdf/2308.05474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08695v2","updated":"2023-08-10T09:36:06Z","published":"2023-07-17T17:57:01Z","title":"Neural Video Depth Stabilizer","summary":" Video depth estimation aims to infer temporally consistent depth. Some\nmethods achieve temporal consistency by finetuning a single-image depth model\nduring test time using geometry and re-projection constraints, which is\ninefficient and not robust. An alternative approach is to learn how to enforce\ntemporal consistency from data, but this requires well-designed models and\nsufficient video depth data. To address these challenges, we propose a\nplug-and-play framework called Neural Video Depth Stabilizer (NVDS) that\nstabilizes inconsistent depth estimations and can be applied to different\nsingle-image depth models without extra effort. We also introduce a large-scale\ndataset, Video Depth in the Wild (VDW), which consists of 14,203 videos with\nover two million frames, making it the largest natural-scene video depth\ndataset to our knowledge. We evaluate our method on the VDW dataset as well as\ntwo public benchmarks and demonstrate significant improvements in consistency,\naccuracy, and efficiency compared to previous approaches. Our work serves as a\nsolid baseline and provides a data foundation for learning-based video depth\nmodels. We will release our dataset and code for future research.\n","authors":["Yiran Wang","Min Shi","Jiaqi Li","Zihao Huang","Zhiguo Cao","Jianming Zhang","Ke Xian","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2307.08695v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.05459v1","updated":"2023-08-10T09:32:20Z","published":"2023-08-10T09:32:20Z","title":"KS-APR: Keyframe Selection for Robust Absolute Pose Regression","summary":" Markerless Mobile Augmented Reality (AR) aims to anchor digital content in\nthe physical world without using specific 2D or 3D objects. Absolute Pose\nRegressors (APR) are end-to-end machine learning solutions that infer the\ndevice's pose from a single monocular image. Thanks to their low computation\ncost, they can be directly executed on the constrained hardware of mobile AR\ndevices. However, APR methods tend to yield significant inaccuracies for input\nimages that are too distant from the training set. This paper introduces\nKS-APR, a pipeline that assesses the reliability of an estimated pose with\nminimal overhead by combining the inference results of the APR and the prior\nimages in the training set. Mobile AR systems tend to rely upon visual-inertial\nodometry to track the relative pose of the device during the experience. 
As\nsuch, KS-APR favours reliability over frequency, discarding unreliable poses.\nThis pipeline can integrate most existing APR methods to improve accuracy by\nfiltering unreliable images with their pose estimates. We implement the\npipeline on three types of APR models on indoor and outdoor datasets. The\nmedian error on position and orientation is reduced for all models, and the\nproportion of large errors is minimized across datasets. Our method enables\nstate-of-the-art APRs such as DFNetdm to outperform single-image and sequential\nAPR methods. These results demonstrate the scalability and effectiveness of\nKS-APR for visual localization tasks that do not require one-shot decisions.\n","authors":["Changkun Liu","Yukun Zhao","Tristan Braud"],"pdf_url":"https://arxiv.org/pdf/2308.05459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.10705v5","updated":"2023-08-10T09:27:44Z","published":"2022-11-19T14:06:58Z","title":"TORE: Token Reduction for Efficient Human Mesh Recovery with Transformer","summary":" In this paper, we introduce a set of simple yet effective TOken REduction\n(TORE) strategies for Transformer-based Human Mesh Recovery from monocular\nimages. Current SOTA performance is achieved by Transformer-based structures.\nHowever, they suffer from high model complexity and computation cost caused by\nredundant tokens. We propose token reduction strategies based on two important\naspects, i.e., the 3D geometry structure and 2D image feature, where we\nhierarchically recover the mesh geometry with priors from body structure and\nconduct token clustering to pass fewer but more discriminative image feature\ntokens to the Transformer. Our method massively reduces the number of tokens\ninvolved in high-complexity interactions in the Transformer. This leads to a\nsignificantly reduced computational cost while still achieving competitive or\neven higher accuracy in shape recovery. Extensive experiments across a wide\nrange of benchmarks validate the superior effectiveness of the proposed method.\nWe further demonstrate the generalizability of our method on hand mesh\nrecovery. Visit our project page at\nhttps://frank-zy-dou.github.io/projects/Tore/index.html.\n","authors":["Zhiyang Dou","Qingxuan Wu","Cheng Lin","Zeyu Cao","Qiangqiang Wu","Weilin Wan","Taku Komura","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2211.10705v5.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.05449v1","updated":"2023-08-10T09:15:15Z","published":"2023-08-10T09:15:15Z","title":"Transforming Breast Cancer Diagnosis: Towards Real-Time Ultrasound to\n Mammogram Conversion for Cost-Effective Diagnosis","summary":" Ultrasound (US) imaging is better suited for intraoperative settings because\nit is real-time and more portable than other imaging techniques, such as\nmammography. However, US images are characterized by lower spatial resolution\nnoise-like artifacts. This research aims to address these limitations by\nproviding surgeons with mammogram-like image quality in real-time from noisy US\nimages. Unlike previous approaches for improving US image quality that aim to\nreduce artifacts by treating them as (speckle noise), we recognize their value\nas informative wave interference pattern (WIP). To achieve this, we utilize the\nStride software to numerically solve the forward model, generating ultrasound\nimages from mammograms images by solving wave-equations. Additionally, we\nleverage the power of domain adaptation to enhance the realism of the simulated\nultrasound images. 
Then, we utilize generative adversarial networks (GANs) to\ntackle the inverse problem of generating mammogram-quality images from\nultrasound images. The resultant images have considerably more discernible\ndetails than the original US images.\n","authors":["Sahar Almahfouz Nasser","Ashutosh Sharma","Anmol Saraf","Amruta Mahendra Parulekar","Purvi Haria","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2308.05449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01097v4","updated":"2023-08-10T09:10:35Z","published":"2023-07-03T15:19:17Z","title":"MVDiffusion: Enabling Holistic Multi-view Image Generation with\n Correspondence-Aware Diffusion","summary":" This paper introduces MVDiffusion, a simple yet effective method for\ngenerating consistent multi-view images from text prompts given pixel-to-pixel\ncorrespondences (e.g., perspective crops from a panorama or multi-view images\ngiven depth maps and poses). Unlike prior methods that rely on iterative image\nwarping and inpainting, MVDiffusion simultaneously generates all images with a\nglobal awareness, effectively addressing the prevalent error accumulation\nissue. At its core, MVDiffusion processes perspective images in parallel with a\npre-trained text-to-image diffusion model, while integrating novel\ncorrespondence-aware attention layers to facilitate cross-view interactions.\nFor panorama generation, while only trained with 10k panoramas, MVDiffusion is\nable to generate high-resolution photorealistic images for arbitrary texts or\nextrapolate one perspective image to a 360-degree view. For multi-view\ndepth-to-image generation, MVDiffusion demonstrates state-of-the-art\nperformance for texturing a scene mesh. The project page is at\nhttps://mvdiffusion.github.io/.\n","authors":["Shitao Tang","Fuyang Zhang","Jiacheng Chen","Peng Wang","Yasutaka Furukawa"],"pdf_url":"https://arxiv.org/pdf/2307.01097v4.pdf","comment":"Project page, https://mvdiffusion.github.io, new functionality,\n improved results, better writing"},{"id":"http://arxiv.org/abs/2308.05447v1","updated":"2023-08-10T09:09:15Z","published":"2023-08-10T09:09:15Z","title":"A Generalized Physical-knowledge-guided Dynamic Model for Underwater\n Image Enhancement","summary":" Underwater images often suffer from color distortion and low contrast\nresulting in various image types, due to the scattering and absorption of light\nby water. While it is difficult to obtain high-quality paired training samples\nwith a generalized model. To tackle these challenges, we design a Generalized\nUnderwater image enhancement method via a Physical-knowledge-guided Dynamic\nModel (short for GUPDM), consisting of three parts: Atmosphere-based Dynamic\nStructure (ADS), Transmission-guided Dynamic Structure (TDS), and Prior-based\nMulti-scale Structure (PMS). In particular, to cover complex underwater scenes,\nthis study changes the global atmosphere light and the transmission to simulate\nvarious underwater image types (e.g., the underwater image color ranging from\nyellow to blue) through the formation model. We then design ADS and TDS that\nuse dynamic convolutions to adaptively extract prior information from\nunderwater images and generate parameters for PMS. 
These two modules enable the\nnetwork to select appropriate parameters for various water types adaptively.\nBesides, the multi-scale feature extraction module in PMS uses convolution\nblocks with different kernel sizes and obtains weights for each feature map via\nchannel attention block and fuses them to boost the receptive field of the\nnetwork. The source code will be available at\n\\href{https://github.com/shiningZZ/GUPDM}{https://github.com/shiningZZ/GUPDM}.\n","authors":["Pan Mu","Hanning Xu","Zheyuan Liu","Zheng Wang","Sixian Chan","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2308.05447v1.pdf","comment":"Accepted by ACMMM 2023"},{"id":"http://arxiv.org/abs/2307.11074v2","updated":"2023-08-10T09:03:48Z","published":"2023-07-20T17:53:57Z","title":"Learning Dense UV Completion for Human Mesh Recovery","summary":" Human mesh reconstruction from a single image is challenging in the presence\nof occlusion, which can be caused by self, objects, or other humans. Existing\nmethods either fail to separate human features accurately or lack proper\nsupervision for feature completion. In this paper, we propose Dense Inpainting\nHuman Mesh Recovery (DIMR), a two-stage method that leverages dense\ncorrespondence maps to handle occlusion. Our method utilizes a dense\ncorrespondence map to separate visible human features and completes human\nfeatures on a structured UV map dense human with an attention-based feature\ncompletion module. We also design a feature inpainting training procedure that\nguides the network to learn from unoccluded features. We evaluate our method on\nseveral datasets and demonstrate its superior performance under heavily\noccluded scenarios compared to other methods. Extensive experiments show that\nour method obviously outperforms prior SOTA methods on heavily occluded images\nand achieves comparable results on the standard benchmarks (3DPW).\n","authors":["Yanjun Wang","Qingping Sun","Wenjia Wang","Jun Ling","Zhongang Cai","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2307.11074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05441v1","updated":"2023-08-10T08:57:31Z","published":"2023-08-10T08:57:31Z","title":"Benchmarking Algorithmic Bias in Face Recognition: An Experimental\n Approach Using Synthetic Faces and Human Evaluation","summary":" We propose an experimental method for measuring bias in face recognition\nsystems. Existing methods to measure bias depend on benchmark datasets that are\ncollected in the wild and annotated for protected (e.g., race, gender) and\nnon-protected (e.g., pose, lighting) attributes. Such observational datasets\nonly permit correlational conclusions, e.g., \"Algorithm A's accuracy is\ndifferent on female and male faces in dataset X.\". By contrast, experimental\nmethods manipulate attributes individually and thus permit causal conclusions,\ne.g., \"Algorithm A's accuracy is affected by gender and skin color.\"\n Our method is based on generating synthetic faces using a neural face\ngenerator, where each attribute of interest is modified independently while\nleaving all other attributes constant. Human observers crucially provide the\nground truth on perceptual identity similarity between synthetic image pairs.\nWe validate our method quantitatively by evaluating race and gender biases of\nthree research-grade face recognition models. Our synthetic pipeline reveals\nthat for these algorithms, accuracy is lower for Black and East Asian\npopulation subgroups. 
Our method can also quantify how perceptual changes in\nattributes affect face identity distances reported by these models. Our large\nsynthetic dataset, consisting of 48,000 synthetic face image pairs (10,200\nunique synthetic faces) and 555,000 human annotations (individual attributes\nand pairwise identity comparisons) is available to researchers in this\nimportant area.\n","authors":["Hao Liang","Pietro Perona","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2308.05441v1.pdf","comment":"accepted to iccv2023; 18 figures"},{"id":"http://arxiv.org/abs/2308.05438v1","updated":"2023-08-10T08:52:08Z","published":"2023-08-10T08:52:08Z","title":"Deep Fusion Transformer Network with Weighted Vector-Wise Keypoints\n Voting for Robust 6D Object Pose Estimation","summary":" One critical challenge in 6D object pose estimation from a single RGBD image\nis efficient integration of two different modalities, i.e., color and depth. In\nthis work, we tackle this problem by a novel Deep Fusion Transformer~(DFTr)\nblock that can aggregate cross-modality features for improving pose estimation.\nUnlike existing fusion methods, the proposed DFTr can better model\ncross-modality semantic correlation by leveraging their semantic similarity,\nsuch that globally enhanced features from different modalities can be better\nintegrated for improved information extraction. Moreover, to further improve\nrobustness and efficiency, we introduce a novel weighted vector-wise voting\nalgorithm that employs a non-iterative global optimization strategy for precise\n3D keypoint localization while achieving near real-time inference. Extensive\nexperiments show the effectiveness and strong generalization capability of our\nproposed 3D keypoint voting algorithm. Results on four widely used benchmarks\nalso demonstrate that our method outperforms the state-of-the-art methods by\nlarge margins.\n","authors":["Jun Zhou","Kai Chen","Linlin Xu","Qi Dou","Jing Qin"],"pdf_url":"https://arxiv.org/pdf/2308.05438v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.05430v1","updated":"2023-08-10T08:43:20Z","published":"2023-08-10T08:43:20Z","title":"Ensemble Modeling for Multimodal Visual Action Recognition","summary":" In this work, we propose an ensemble modeling approach for multimodal action\nrecognition. We independently train individual modality models using a variant\nof focal loss tailored to handle the long-tailed distribution of the MECCANO\n[21] dataset. Based on the underlying principle of focal loss, which captures\nthe relationship between tail (scarce) classes and their prediction\ndifficulties, we propose an exponentially decaying variant of focal loss for\nour current task. It initially emphasizes learning from the hard misclassified\nexamples and gradually adapts to the entire range of examples in the dataset.\nThis annealing process encourages the model to strike a balance between\nfocusing on the sparse set of hard samples, while still leveraging the\ninformation provided by the easier ones. Additionally, we opt for the late\nfusion strategy to combine the resultant probability distributions from RGB and\nDepth modalities for final action prediction. 
Experimental evaluations on the\nMECCANO dataset demonstrate the effectiveness of our approach.\n","authors":["Jyoti Kini","Sarah Fleischer","Ishan Dave","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2308.05430v1.pdf","comment":"Technical Report accepted at the Multimodal Action Recognition\n Challenge on the MECCANO Dataset - ICIAP 2023"},{"id":"http://arxiv.org/abs/2308.05428v1","updated":"2023-08-10T08:42:20Z","published":"2023-08-10T08:42:20Z","title":"Speech-Driven 3D Face Animation with Composite and Regional Facial\n Movements","summary":" Speech-driven 3D face animation poses significant challenges due to the\nintricacy and variability inherent in human facial movements. This paper\nemphasizes the importance of considering both the composite and regional\nnatures of facial movements in speech-driven 3D face animation. The composite\nnature pertains to how speech-independent factors globally modulate\nspeech-driven facial movements along the temporal dimension. Meanwhile, the\nregional nature alludes to the notion that facial movements are not globally\ncorrelated but are actuated by local musculature along the spatial dimension.\nIt is thus indispensable to incorporate both natures for engendering vivid\nanimation. To address the composite nature, we introduce an adaptive modulation\nmodule that employs arbitrary facial movements to dynamically adjust\nspeech-driven facial movements across frames on a global scale. To accommodate\nthe regional nature, our approach ensures that each constituent of the facial\nfeatures for every frame focuses on the local spatial movements of 3D faces.\nMoreover, we present a non-autoregressive backbone for translating audio to 3D\nfacial movements, which maintains high-frequency nuances of facial movements\nand facilitates efficient inference. Comprehensive experiments and user studies\ndemonstrate that our method surpasses contemporary state-of-the-art approaches\nboth qualitatively and quantitatively.\n","authors":["Haozhe Wu","Songtao Zhou","Jia Jia","Junliang Xing","Qi Wen","Xiang Wen"],"pdf_url":"https://arxiv.org/pdf/2308.05428v1.pdf","comment":"Accepted by MM 2023, 9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.05426v1","updated":"2023-08-10T08:39:59Z","published":"2023-08-10T08:39:59Z","title":"Adaptive Low Rank Adaptation of Segment Anything to Salient Object\n Detection","summary":" Foundation models, such as OpenAI's GPT-3 and GPT-4, Meta's LLaMA, and\nGoogle's PaLM2, have revolutionized the field of artificial intelligence. A\nnotable paradigm shift has been the advent of the Segment Anything Model (SAM),\nwhich has exhibited a remarkable capability to segment real-world objects,\ntrained on 1 billion masks and 11 million images. Although SAM excels in\ngeneral object segmentation, it lacks the intrinsic ability to detect salient\nobjects, resulting in suboptimal performance in this domain. To address this\nchallenge, we present the Segment Salient Object Model (SSOM), an innovative\napproach that adaptively fine-tunes SAM for salient object detection by\nharnessing the low-rank structure inherent in deep learning. 
Comprehensive\nqualitative and quantitative evaluations across five challenging RGB benchmark\ndatasets demonstrate the superior performance of our approach, surpassing\nstate-of-the-art methods.\n","authors":["Ruikai Cui","Siyuan He","Shi Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.05426v1.pdf","comment":"13 pages, 0 figures"},{"id":"http://arxiv.org/abs/2308.05421v1","updated":"2023-08-10T08:29:36Z","published":"2023-08-10T08:29:36Z","title":"Progressive Spatio-temporal Perception for Audio-Visual Question\n Answering","summary":" Audio-Visual Question Answering (AVQA) task aims to answer questions about\ndifferent visual objects, sounds, and their associations in videos. Such\nnaturally multi-modal videos are composed of rich and complex dynamic\naudio-visual components, where most of which could be unrelated to the given\nquestions, or even play as interference in answering the content of interest.\nOppositely, only focusing on the question-aware audio-visual content could get\nrid of influence, meanwhile enabling the model to answer more efficiently. In\nthis paper, we propose a Progressive Spatio-Temporal Perception Network\n(PSTP-Net), which contains three modules that progressively identify key\nspatio-temporal regions w.r.t. questions. Specifically, a temporal segment\nselection module is first introduced to select the most relevant audio-visual\nsegments related to the given question. Then, a spatial region selection module\nis utilized to choose the most relevant regions associated with the question\nfrom the selected temporal segments. To further refine the selection of\nfeatures, an audio-guided visual attention module is employed to perceive the\nassociation between auido and selected spatial regions. Finally, the\nspatio-temporal features from these modules are integrated for answering the\nquestion. Extensive experimental results on the public MUSIC-AVQA and AVQA\ndatasets provide compelling evidence of the effectiveness and efficiency of\nPSTP-Net. Code is available at:\n\\href{https://github.com/GeWu-Lab/PSTP-Net}{https://github.com/GeWu-Lab/PSTP-Net}\n","authors":["Guangyao Li","Wenxuan Hou","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2308.05421v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.05410v1","updated":"2023-08-10T08:10:01Z","published":"2023-08-10T08:10:01Z","title":"SC3K: Self-supervised and Coherent 3D Keypoints Estimation from Rotated,\n Noisy, and Decimated Point Cloud Data","summary":" This paper proposes a new method to infer keypoints from arbitrary object\ncategories in practical scenarios where point cloud data (PCD) are noisy,\ndown-sampled and arbitrarily rotated. Our proposed model adheres to the\nfollowing principles: i) keypoints inference is fully unsupervised (no\nannotation given), ii) keypoints position error should be low and resilient to\nPCD perturbations (robustness), iii) keypoints should not change their indexes\nfor the intra-class objects (semantic coherence), iv) keypoints should be close\nto or proximal to PCD surface (compactness). We achieve these desiderata by\nproposing a new self-supervised training strategy for keypoints estimation that\ndoes not assume any a priori knowledge of the object class, and a model\narchitecture with coupled auxiliary losses that promotes the desired keypoints\nproperties. We compare the keypoints estimated by the proposed approach with\nthose of the state-of-the-art unsupervised approaches. 
The experiments show\nthat our approach outperforms by estimating keypoints with improved coverage\n(+9.41%) while being semantically consistent (+4.66%) that best characterizes\nthe object's 3D shape for downstream tasks. Code and data are available at:\nhttps://github.com/IITPAVIS/SC3K\n","authors":["Mohammad Zohaib","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2308.05410v1.pdf","comment":"This paper has been accepted in International Conference on Computer\n Vision (ICCV) 2023. For code and data, please refer to the following GitHub\n page: https://github.com/IITPAVIS/SC3K"},{"id":"http://arxiv.org/abs/2207.03190v2","updated":"2023-08-10T08:06:05Z","published":"2022-07-07T09:44:44Z","title":"Learning Music-Dance Representations through Explicit-Implicit Rhythm\n Synchronization","summary":" Although audio-visual representation has been proved to be applicable in many\ndownstream tasks, the representation of dancing videos, which is more specific\nand always accompanied by music with complex auditory contents, remains\nchallenging and uninvestigated. Considering the intrinsic alignment between the\ncadent movement of dancer and music rhythm, we introduce MuDaR, a novel\nMusic-Dance Representation learning framework to perform the synchronization of\nmusic and dance rhythms both in explicit and implicit ways. Specifically, we\nderive the dance rhythms based on visual appearance and motion cues inspired by\nthe music rhythm analysis. Then the visual rhythms are temporally aligned with\nthe music counterparts, which are extracted by the amplitude of sound\nintensity. Meanwhile, we exploit the implicit coherence of rhythms implied in\naudio and visual streams by contrastive learning. The model learns the joint\nembedding by predicting the temporal consistency between audio-visual pairs.\nThe music-dance representation, together with the capability of detecting audio\nand visual rhythms, can further be applied to three downstream tasks: (a) dance\nclassification, (b) music-dance retrieval, and (c) music-dance retargeting.\nExtensive experiments demonstrate that our proposed framework outperforms other\nself-supervised methods by a large margin.\n","authors":["Jiashuo Yu","Junfu Pu","Ying Cheng","Rui Feng","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2207.03190v2.pdf","comment":"Accepted for publication in IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2308.05407v1","updated":"2023-08-10T08:03:58Z","published":"2023-08-10T08:03:58Z","title":"A Comparative Assessment of Multi-view fusion learning for Crop\n Classification","summary":" With a rapidly increasing amount and diversity of remote sensing (RS) data\nsources, there is a strong need for multi-view learning modeling. This is a\ncomplex task when considering the differences in resolution, magnitude, and\nnoise of RS data. The typical approach for merging multiple RS sources has been\ninput-level fusion, but other - more advanced - fusion strategies may\noutperform this traditional approach. This work assesses different fusion\nstrategies for crop classification in the CropHarvest dataset. The fusion\nmethods proposed in this work outperform models based on individual views and\nprevious fusion methods. We do not find one single fusion method that\nconsistently outperforms all other approaches. 
Instead, we present a comparison\nof multi-view fusion methods for three different datasets and show that,\ndepending on the test region, different methods obtain the best performance.\nDespite this, we suggest a preliminary criterion for the selection of fusion\nmethods.\n","authors":["Francisco Mena","Diego Arenas","Marlon Nuske","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2308.05407v1.pdf","comment":"Accepted at IEEE International Geoscience and Remote Sensing\n Symposium 2023"},{"id":"http://arxiv.org/abs/2308.05404v1","updated":"2023-08-10T07:53:06Z","published":"2023-08-10T07:53:06Z","title":"Enhancing Low-light Light Field Images with A Deep Compensation\n Unfolding Network","summary":" This paper presents a novel and interpretable end-to-end learning framework,\ncalled the deep compensation unfolding network (DCUNet), for restoring light\nfield (LF) images captured under low-light conditions. DCUNet is designed with\na multi-stage architecture that mimics the optimization process of solving an\ninverse imaging problem in a data-driven fashion. The framework uses the\nintermediate enhanced result to estimate the illumination map, which is then\nemployed in the unfolding process to produce a new enhanced result.\nAdditionally, DCUNet includes a content-associated deep compensation module at\neach optimization stage to suppress noise and illumination map estimation\nerrors. To properly mine and leverage the unique characteristics of LF images,\nthis paper proposes a pseudo-explicit feature interaction module that\ncomprehensively exploits redundant information in LF images. The experimental\nresults on both simulated and real datasets demonstrate the superiority of our\nDCUNet over state-of-the-art methods, both qualitatively and quantitatively.\nMoreover, DCUNet preserves the essential geometric structure of enhanced LF\nimages much better. The code will be publicly available at\nhttps://github.com/lyuxianqiang/LFLL-DCU.\n","authors":["Xianqiang Lyu","Junhui Hou"],"pdf_url":"https://arxiv.org/pdf/2308.05404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03382v2","updated":"2023-08-10T07:38:35Z","published":"2023-08-07T08:03:20Z","title":"Enhancing Nucleus Segmentation with HARU-Net: A Hybrid Attention Based\n Residual U-Blocks Network","summary":" Nucleus image segmentation is a crucial step in the analysis, pathological\ndiagnosis, and classification, which heavily relies on the quality of nucleus\nsegmentation. However, the complexity of issues such as variations in nucleus\nsize, blurred nucleus contours, uneven staining, cell clustering, and\noverlapping cells poses significant challenges. Current methods for nucleus\nsegmentation primarily rely on nuclear morphology or contour-based approaches.\nNuclear morphology-based methods exhibit limited generalization ability and\nstruggle to effectively predict irregular-shaped nuclei, while contour-based\nextraction methods face challenges in accurately segmenting overlapping nuclei.\nTo address the aforementioned issues, we propose a dual-branch network using\nhybrid attention based residual U-blocks for nucleus instance segmentation. The\nnetwork simultaneously predicts target information and target contours.\nAdditionally, we introduce a post-processing method that combines the target\ninformation and target contours to distinguish overlapping nuclei and generate\nan instance segmentation image. 
Within the network, we propose a context fusion\nblock (CF-block) that effectively extracts and merges contextual information\nfrom the network. Extensive quantitative evaluations are conducted to assess\nthe performance of our method. Experimental results demonstrate the superior\nperformance of the proposed method compared to state-of-the-art approaches on\nthe BNS, MoNuSeg, CoNSeg, and CPM-17 datasets.\n","authors":["Junzhou Chen","Qian Huang","Yulin Chen","Linyi Qian","Chengyuan Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03382v2.pdf","comment":"Nucleus segmentation, Deep learning, Instance segmentation, Medical\n imaging, Dual-Branch network"},{"id":"http://arxiv.org/abs/2303.05194v2","updated":"2023-08-10T07:30:14Z","published":"2023-03-09T11:48:29Z","title":"Contrastive Model Adaptation for Cross-Condition Robustness in Semantic\n Segmentation","summary":" Standard unsupervised domain adaptation methods adapt models from a source to\na target domain using labeled source data and unlabeled target data jointly. In\nmodel adaptation, on the other hand, access to the labeled source data is\nprohibited, i.e., only the source-trained model and unlabeled target data are\navailable. We investigate normal-to-adverse condition model adaptation for\nsemantic segmentation, whereby image-level correspondences are available in the\ntarget domain. The target set consists of unlabeled pairs of adverse- and\nnormal-condition street images taken at GPS-matched locations. Our method --\nCMA -- leverages such image pairs to learn condition-invariant features via\ncontrastive learning. In particular, CMA encourages features in the embedding\nspace to be grouped according to their condition-invariant semantic content and\nnot according to the condition under which respective inputs are captured. To\nobtain accurate cross-domain semantic correspondences, we warp the normal image\nto the viewpoint of the adverse image and leverage warp-confidence scores to\ncreate robust, aggregated features. With this approach, we achieve\nstate-of-the-art semantic segmentation performance for model adaptation on\nseveral normal-to-adverse adaptation benchmarks, such as ACDC and Dark Zurich.\nWe also evaluate CMA on a newly procured adverse-condition generalization\nbenchmark and report favorable results compared to standard unsupervised domain\nadaptation methods, despite the comparative handicap of CMA due to source data\ninaccessibility. Code is available at https://github.com/brdav/cma.\n","authors":["David Bruggemann","Christos Sakaridis","Tim Brödermann","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.05194v2.pdf","comment":"International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2308.05396v1","updated":"2023-08-10T07:28:22Z","published":"2023-08-10T07:28:22Z","title":"Learning Gabor Texture Features for Fine-Grained Recognition","summary":" Extracting and using class-discriminative features is critical for\nfine-grained recognition. Existing works have demonstrated the possibility of\napplying deep CNNs to exploit features that distinguish similar classes.\nHowever, CNNs suffer from problems including frequency bias and loss of\ndetailed local information, which restricts the performance of recognizing\nfine-grained categories. To address the challenge, we propose a novel texture\nbranch as complimentary to the CNN branch for feature extraction. 
We\ninnovatively utilize Gabor filters as a powerful extractor to exploit texture\nfeatures, motivated by the capability of Gabor filters in effectively capturing\nmulti-frequency features and detailed local information. We implement several\ndesigns to enhance the effectiveness of Gabor filters, including imposing\nconstraints on parameter values and developing a learning method to determine\nthe optimal parameters. Moreover, we introduce a statistical feature extractor\nto utilize informative statistical information from the signals captured by\nGabor filters, and a gate selection mechanism to enable efficient computation\nby only considering qualified regions as input for texture extraction. Through\nthe integration of features from the Gabor-filter-based texture branch and\nCNN-based semantic branch, we achieve comprehensive information extraction. We\ndemonstrate the efficacy of our method on multiple datasets, including\nCUB-200-2011, NA-bird, Stanford Dogs, and GTOS-mobile. State-of-the-art\nperformance is achieved using our approach.\n","authors":["Lanyun Zhu","Tianrun Chen","Jianxiong Yin","Simon See","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.05396v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2308.05394v1","updated":"2023-08-10T07:21:35Z","published":"2023-08-10T07:21:35Z","title":"Robust Localization with Visual-Inertial Odometry Constraints for\n Markerless Mobile AR","summary":" Visual Inertial Odometry (VIO) is an essential component of modern Augmented\nReality (AR) applications. However, VIO only tracks the relative pose of the\ndevice, leading to drift over time. Absolute pose estimation methods infer the\ndevice's absolute pose, but their accuracy depends on the input quality. This\npaper introduces VIO-APR, a new framework for markerless mobile AR that\ncombines an absolute pose regressor (APR) with a local VIO tracking system.\nVIO-APR uses VIO to assess the reliability of the APR and the APR to identify\nand compensate for VIO drift. This feedback loop results in more accurate\npositioning and more stable AR experiences. To evaluate VIO-APR, we created a\ndataset that combines camera images with ARKit's VIO system output for six\nindoor and outdoor scenes of various scales. Over this dataset, VIO-APR\nimproves the median accuracy of popular APR by up to 36\\% in position and 29\\%\nin orientation, increases the percentage of frames in the high ($0.25 m,\n2^{\\circ}$) accuracy level by up to 112\\% and reduces the percentage of frames\npredicted below the low ($5 m, 10^\\circ$) accuracy greatly. We implement\nVIO-APR into a mobile AR application using Unity to demonstrate its\ncapabilities. VIO-APR results in noticeably more accurate localization and a\nmore stable overall experience.\n","authors":["Changkun Liu","Yukun Zhao","Tristan Braud"],"pdf_url":"https://arxiv.org/pdf/2308.05394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05390v1","updated":"2023-08-10T07:09:13Z","published":"2023-08-10T07:09:13Z","title":"Product Review Image Ranking for Fashion E-commerce","summary":" In a fashion e-commerce platform where customers can't physically examine the\nproducts on their own, being able to see other customers' text and image\nreviews of the product is critical while making purchase decisions. Given the\nhigh reliance on these reviews, over the years we have observed customers\nproactively sharing their reviews. 
With an increase in the coverage of User\nGenerated Content (UGC), there has been a corresponding increase in the number\nof customer images. It is thus imperative to display the most relevant images\non top as it may influence users' online shopping choices and behavior. In this\npaper, we propose a simple yet effective training procedure for ranking\ncustomer images. We created a dataset consisting of Myntra (a major Indian\nfashion e-commerce company) studio posts and highly engaged (upvotes/downvotes)\nUGC images as our starting point and used selected distortion techniques on the\nimages of the above dataset to bring their quality on par with that of bad UGC\nimages. We train our network to rank bad-quality images lower than high-quality\nones. Our proposed method outperforms the baseline models on two metrics,\nnamely correlation coefficient and accuracy, by substantial margins.\n","authors":["Sangeet Jaiswal","Dhruv Patel","Sreekanth Vempati","Konduru Saiswaroop"],"pdf_url":"https://arxiv.org/pdf/2308.05390v1.pdf","comment":"Accepted in Proceedings of ACM SIGIR Workshop on eCommerce (SIGIR\n eCom'22)"},{"id":"http://arxiv.org/abs/2308.05387v1","updated":"2023-08-10T07:03:32Z","published":"2023-08-10T07:03:32Z","title":"HGDNet: A Height-Hierarchy Guided Dual-Decoder Network for Single View\n Building Extraction and Height Estimation","summary":" Unifying the correlative single-view satellite image building extraction and\nheight estimation tasks indicates a promising way to share representations and\nacquire a generalist model for large-scale urban 3D reconstruction. However, the\ncommon spatial misalignment between building footprints and\nstereo-reconstructed nDSM height labels incurs degraded performance on both\ntasks. To address this issue, we propose a Height-hierarchy Guided Dual-decoder\nNetwork (HGDNet) to estimate building height. Under the guidance of synthesized\ndiscrete height-hierarchy nDSM, an auxiliary height-hierarchical building\nextraction branch enhances the height estimation branch with implicit\nconstraints, yielding an accuracy improvement of more than 6% on the DFC 2023\nTrack 2 dataset. An additional two-stage cascade architecture is adopted to achieve\nmore accurate building extraction. Experiments on the DFC 2023 Track 2 dataset\nshow the superiority of the proposed method in building height estimation\n({\\delta}1:0.8012), instance extraction (AP50:0.7730), and the final average\nscore (0.7871), which ranks first in the test phase.\n","authors":["Chaoran Lu","Ningning Cao","Pan Zhang","Ting Liu","Baochai Peng","Guozhang Liu","Mengke Yuan","Sen Zhang","Simin Huang","Tao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04152v2","updated":"2023-08-10T07:02:13Z","published":"2023-08-08T09:32:43Z","title":"Empowering Vision-Language Models to Follow Interleaved Vision-Language\n Instructions","summary":" Multimodal Large Language Models (MLLMs) have recently sparked significant\ninterest, demonstrating emergent capabilities to serve as a\ngeneral-purpose model for various vision-language tasks. However, existing\nmethods mainly focus on limited types of instructions with a single image as\nvisual context, which hinders the widespread availability of MLLMs. 
In this\npaper, we introduce the I4 benchmark to comprehensively evaluate the\ninstruction following ability on complicated interleaved vision-language\ninstructions, which involve intricate image-text sequential context, covering a\ndiverse range of scenarios (e.g., visually-rich webpages/textbooks, lecture\nslides, embodied dialogue). Systematic evaluation on our I4 benchmark reveals a\ncommon defect of existing methods: the Visual Prompt Generator (VPG) trained on\nimage-captioning alignment objective tends to attend to common foreground\ninformation for captioning but struggles to extract specific information\nrequired by particular tasks. To address this issue, we propose a generic and\nlightweight controllable knowledge re-injection module, which utilizes the\nsophisticated reasoning ability of LLMs to control the VPG to conditionally\nextract instruction-specific visual information and re-inject it into the LLM.\nFurther, we introduce an annotation-free cross-attention guided counterfactual\nimage training strategy to methodically learn the proposed module by\ncollaborating a cascade of foundation models. Enhanced by the proposed module\nand training strategy, we present Cheetor, a Transformer-based MLLM that can\neffectively handle a wide variety of interleaved vision-language instructions\nand achieves state-of-the-art zero-shot performance across all tasks of I4,\nwithout high-quality multimodal instruction tuning data. Cheetor also exhibits\ncompetitive performance compared with state-of-the-art instruction tuned models\non MME benchmark.\n","authors":["Juncheng Li","Kaihang Pan","Zhiqi Ge","Minghe Gao","Hanwang Zhang","Wei Ji","Wenqiao Zhang","Tat-Seng Chua","Siliang Tang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2308.04152v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05382v1","updated":"2023-08-10T06:55:51Z","published":"2023-08-10T06:55:51Z","title":"Interaction-aware Joint Attention Estimation Using People Attributes","summary":" This paper proposes joint attention estimation in a single image. Different\nfrom related work in which only the gaze-related attributes of people are\nindependently employed, (I) their locations and actions are also employed as\ncontextual cues for weighting their attributes, and (ii) interactions among all\nof these attributes are explicitly modeled in our method. For the interaction\nmodeling, we propose a novel Transformer-based attention network to encode\njoint attention as low-dimensional features. We introduce a specialized MLP\nhead with positional embedding to the Transformer so that it predicts pixelwise\nconfidence of joint attention for generating the confidence heatmap. This\npixelwise prediction improves the heatmap accuracy by avoiding the ill-posed\nproblem in which the high-dimensional heatmap is predicted from the\nlow-dimensional features. The estimated joint attention is further improved by\nbeing integrated with general image-based attention estimation. Our method\noutperforms SOTA methods quantitatively in comparative experiments. 
Code:\nhttps://anonymous.4open.science/r/anonymized_codes-ECA4.\n","authors":["Chihiro Nakatani","Hiroaki Kawashima","Norimichi Ukita"],"pdf_url":"https://arxiv.org/pdf/2308.05382v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2303.06601v2","updated":"2023-08-10T06:53:11Z","published":"2023-03-12T08:05:30Z","title":"Multi-metrics adaptively identifies backdoors in Federated learning","summary":" The decentralized and privacy-preserving nature of federated learning (FL)\nmakes it vulnerable to backdoor attacks aiming to manipulate the behavior of\nthe resulting model on specific adversary-chosen inputs. However, most existing\ndefenses based on statistical differences take effect only against specific\nattacks, especially when the malicious gradients are similar to benign ones or\nthe data are highly non-independent and identically distributed (non-IID). In\nthis paper, we revisit the distance-based defense methods and discover that i)\nEuclidean distance becomes meaningless in high dimensions and ii) malicious\ngradients with diverse characteristics cannot be identified by a single metric.\nTo this end, we present a simple yet effective defense strategy with\nmulti-metrics and dynamic weighting to identify backdoors adaptively.\nFurthermore, our novel defense has no reliance on predefined assumptions over\nattack settings or data distributions and little impact on benign performance.\nTo evaluate the effectiveness of our approach, we conduct comprehensive\nexperiments on different datasets under various attack settings, where our\nmethod achieves the best defensive performance. For instance, we achieve the\nlowest backdoor accuracy of 3.06% under the difficult Edge-case PGD, showing\nsignificant superiority over previous defenses. The results also demonstrate\nthat our method can be well-adapted to a wide range of non-IID degrees without\nsacrificing the benign performance.\n","authors":["Siquan Huang","Yijiang Li","Chong Chen","Leyu Shi","Ying Gao"],"pdf_url":"https://arxiv.org/pdf/2303.06601v2.pdf","comment":"14 pages, 8 figures and 7 tables; 2023 IEEE/CVF International\n Conference on Computer Vision (ICCV)"},{"id":"http://arxiv.org/abs/2308.05371v1","updated":"2023-08-10T06:40:19Z","published":"2023-08-10T06:40:19Z","title":"Flexible Isosurface Extraction for Gradient-Based Mesh Optimization","summary":" This work considers gradient-based mesh optimization, where we iteratively\noptimize for a 3D surface mesh by representing it as the isosurface of a scalar\nfield, an increasingly common paradigm in applications including\nphotogrammetry, generative modeling, and inverse physics. Existing\nimplementations adapt classic isosurface extraction algorithms like Marching\nCubes or Dual Contouring; these techniques were designed to extract meshes from\nfixed, known fields, and in the optimization setting they lack the degrees of\nfreedom to represent high-quality feature-preserving meshes, or suffer from\nnumerical instabilities. We introduce FlexiCubes, an isosurface representation\nspecifically designed for optimizing an unknown mesh with respect to geometric,\nvisual, or even physical objectives. Our main insight is to introduce\nadditional carefully-chosen parameters into the representation, which allow\nlocal flexible adjustments to the extracted mesh geometry and connectivity.\nThese parameters are updated along with the underlying scalar field via\nautomatic differentiation when optimizing for a downstream task. 
We base our\nextraction scheme on Dual Marching Cubes for improved topological properties,\nand present extensions to optionally generate tetrahedral and\nhierarchically-adaptive meshes. Extensive experiments validate FlexiCubes on\nboth synthetic benchmarks and real-world applications, showing that it offers\nsignificant improvements in mesh quality and geometric fidelity.\n","authors":["Tianchang Shen","Jacob Munkberg","Jon Hasselgren","Kangxue Yin","Zian Wang","Wenzheng Chen","Zan Gojcic","Sanja Fidler","Nicholas Sharp","Jun Gao"],"pdf_url":"https://arxiv.org/pdf/2308.05371v1.pdf","comment":"SIGGRAPH 2023. Project page:\n https://research.nvidia.com/labs/toronto-ai/flexicubes/"},{"id":"http://arxiv.org/abs/2308.05365v1","updated":"2023-08-10T06:20:00Z","published":"2023-08-10T06:20:00Z","title":"TriDo-Former: A Triple-Domain Transformer for Direct PET Reconstruction\n from Low-Dose Sinograms","summary":" To obtain high-quality positron emission tomography (PET) images while\nminimizing radiation exposure, various methods have been proposed for\nreconstructing standard-dose PET (SPET) images from low-dose PET (LPET)\nsinograms directly. However, current methods often neglect boundaries during\nsinogram-to-image reconstruction, resulting in high-frequency distortion in the\nfrequency domain and diminished or fuzzy edges in the reconstructed images.\nFurthermore, the convolutional architectures, which are commonly used, lack the\nability to model long-range non-local interactions, potentially leading to\ninaccurate representations of global structures. To alleviate these problems,\nwe propose a transformer-based model that unites triple domains of sinogram,\nimage, and frequency for direct PET reconstruction, namely TriDo-Former.\nSpecifically, the TriDo-Former consists of two cascaded networks, i.e., a\nsinogram enhancement transformer (SE-Former) for denoising the input LPET\nsinograms and a spatial-spectral reconstruction transformer (SSR-Former) for\nreconstructing SPET images from the denoised sinograms. Different from the\nvanilla transformer that splits an image into 2D patches, based specifically on\nthe PET imaging mechanism, our SE-Former divides the sinogram into 1D\nprojection view angles to maintain its inner-structure while denoising,\npreventing the noise in the sinogram from propagating into the image domain.\nMoreover, to mitigate high-frequency distortion and improve reconstruction\ndetails, we integrate global frequency parsers (GFPs) into SSR-Former. The GFP\nserves as a learnable frequency filter that globally adjusts the frequency\ncomponents in the frequency domain, enforcing the network to restore\nhigh-frequency details resembling real SPET images. Validations on a clinical\ndataset demonstrate that our TriDo-Former outperforms the state-of-the-art\nmethods qualitatively and quantitatively.\n","authors":["Jiaqi Cui","Pinxian Zeng","Xinyi Zeng","Peng Wang","Xi Wu","Jiliu Zhou","Yan Wang","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2308.05365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05359v1","updated":"2023-08-10T05:56:53Z","published":"2023-08-10T05:56:53Z","title":"Pseudo-label Alignment for Semi-supervised Instance Segmentation","summary":" Pseudo-labeling is significant for semi-supervised instance segmentation,\nwhich generates instance masks and classes from unannotated images for\nsubsequent training. 
However, in existing pipelines, pseudo-labels that contain\nvaluable information may be directly filtered out due to mismatches in class\nand mask quality. To address this issue, we propose a novel framework, called\npseudo-label aligning instance segmentation (PAIS), in this paper. In PAIS, we\ndevise a dynamic aligning loss (DALoss) that adjusts the weights of\nsemi-supervised loss terms with varying class and mask score pairs. Through\nextensive experiments conducted on the COCO and Cityscapes datasets, we\ndemonstrate that PAIS is a promising framework for semi-supervised instance\nsegmentation, particularly in cases where labeled data is severely limited.\nNotably, with just 1\\% labeled data, PAIS achieves 21.2 mAP (based on\nMask-RCNN) and 19.9 mAP (based on K-Net) on the COCO dataset, outperforming the\ncurrent state-of-the-art model, \\ie, NoisyBoundary with 7.7 mAP, by a margin of\nover 12 points. Code is available at: \\url{https://github.com/hujiecpp/PAIS}.\n","authors":["Jie Hu","Chen Chen","Liujuan Cao","Shengchuan Zhang","Annan Shu","Guannan Jiang","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.05359v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.05358v1","updated":"2023-08-10T05:54:57Z","published":"2023-08-10T05:54:57Z","title":"Fine-grained building roof instance segmentation based on domain adapted\n pretraining and composite dual-backbone","summary":" The diversity of building architecture styles of global cities situated on\nvarious landforms, the degraded optical imagery affected by clouds and shadows,\nand the significant inter-class imbalance of roof types pose challenges for\ndesigning a robust and accurate building roof instance segmentor. To address\nthese issues, we propose an effective framework to fulfill semantic\ninterpretation of individual buildings with high-resolution optical satellite\nimagery. Specifically, the leveraged domain adapted pretraining strategy and\ncomposite dual-backbone greatly facilitate discriminative feature\nlearning. Moreover, a new data augmentation pipeline, stochastic weight averaging\n(SWA) training and instance segmentation based model ensemble in testing are\nutilized to acquire an additional performance boost. Experimental results show that\nour approach ranks first in the 2023 IEEE GRSS Data Fusion Contest\n(DFC) Track 1 test phase ($mAP_{50}$:50.6\\%). Notably, we have also\nexplored the potential of multimodal data fusion with both optical satellite\nimagery and SAR data.\n","authors":["Guozhang Liu","Baochai Peng","Ting Liu","Pan Zhang","Mengke Yuan","Chaoran Lu","Ningning Cao","Sen Zhang","Simin Huang","Tao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05355v1","updated":"2023-08-10T05:51:21Z","published":"2023-08-10T05:51:21Z","title":"TCSloT: Text Guided 3D Context and Slope Aware Triple Network for Dental\n Implant Position Prediction","summary":" In implant prosthesis treatment, the surgical guide of implant is used to\nensure accurate implantation. However, such design heavily relies on the manual\nlocation of the implant position. While deep neural networks have been proposed to\nassist the dentist in locating the implant position, most of them take a single\nslice as input, which does not fully explore 3D contextual information and\nignores the influence of implant slope. 
In this paper, we design a Text Guided\n3D Context and Slope Aware Triple Network (TCSloT) which enables the perception\nof contextual information from multiple adjacent slices and awareness of\nvariation of implant slopes. A Texture Variation Perception (TVP) module is\ncorrespondingly elaborated to process the multiple slices and capture the\ntexture variation among slices and a Slope-Aware Loss (SAL) is proposed to\ndynamically assign varying weights for the regression head. Additionally, we\ndesign a conditional text guidance (CTG) module to integrate the text condition\n(i.e., left, middle and right) from the CLIP for assisting the implant position\nprediction. Extensive experiments on a dental implant dataset through five-fold\ncross-validation demonstrated that the proposed TCSloT achieves superior\nperformance than existing methods.\n","authors":["Xinquan Yang","Jinheng Xie","Xuechen Li","Xuguang Li","Linlin Shen","Yongqiang Deng"],"pdf_url":"https://arxiv.org/pdf/2308.05355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05346v1","updated":"2023-08-10T05:27:43Z","published":"2023-08-10T05:27:43Z","title":"Towards General and Fast Video Derain via Knowledge Distillation","summary":" As a common natural weather condition, rain can obscure video frames and thus\naffect the performance of the visual system, so video derain receives a lot of\nattention. In natural environments, rain has a wide variety of streak types,\nwhich increases the difficulty of the rain removal task. In this paper, we\npropose a Rain Review-based General video derain Network via knowledge\ndistillation (named RRGNet) that handles different rain streak types with one\npre-training weight. Specifically, we design a frame grouping-based\nencoder-decoder network that makes full use of the temporal information of the\nvideo. Further, we use the old task model to guide the current model in\nlearning new rain streak types while avoiding forgetting. To consolidate the\nnetwork's ability to derain, we design a rain review module to play back data\nfrom old tasks for the current model. The experimental results show that our\ndeveloped general method achieves the best results in terms of running speed\nand derain effect.\n","authors":["Defang Cai","Pan Mu","Sixian Chan","Zhanpeng Shao","Cong Bai"],"pdf_url":"https://arxiv.org/pdf/2308.05346v1.pdf","comment":"6 pages; Accepted at IEEE ICME"},{"id":"http://arxiv.org/abs/2308.05344v1","updated":"2023-08-10T05:20:25Z","published":"2023-08-10T05:20:25Z","title":"Prostate Age Gap (PAG): An MRI surrogate marker of aging for prostate\n cancer detection","summary":" Background: Prostate cancer (PC) MRI-based risk calculators are commonly\nbased on biological (e.g. PSA), MRI markers (e.g. volume), and patient age.\nWhilst patient age measures the amount of years an individual has existed,\nbiological age (BA) might better reflect the physiology of an individual.\nHowever, surrogates from prostate MRI and linkage with clinically significant\nPC (csPC) remain to be explored. Purpose: To obtain and evaluate Prostate Age\nGap (PAG) as an MRI marker tool for csPC risk. Study type: Retrospective.\nPopulation: A total of 7243 prostate MRI slices from 468 participants who had\nundergone prostate biopsies. 
A deep learning model was trained on 3223 MRI\nslices cropped around the gland from 81 low-grade PC (ncsPC, Gleason score <=6)\nand 131 negative cases and tested on the remaining 256 participants.\nAssessment: Chronological age was defined as the age of the participant at the\ntime of the visit and used to train the deep learning model to predict the age\nof the patient. Following, we obtained PAG, defined as the model predicted age\nminus the patient's chronological age. Multivariate logistic regression models\nwere used to estimate the association through odds ratio (OR) and predictive\nvalue of PAG and compared against PSA levels and PI-RADS>=3. Statistical tests:\nT-test, Mann-Whitney U test, Permutation test and ROC curve analysis. Results:\nThe multivariate adjusted model showed a significant difference in the odds of\nclinically significant PC (csPC, Gleason score >=7) (OR =3.78, 95% confidence\ninterval (CI):2.32-6.16, P <.001). PAG showed a better predictive ability when\ncompared to PI-RADS>=3 and adjusted by other risk factors, including PSA\nlevels: AUC =0.981 vs AUC =0.704, p<.001. Conclusion: PAG was significantly\nassociated with the risk of clinically significant PC and outperformed other\nwell-established PC risk factors.\n","authors":["Alvaro Fernandez-Quilez","Tobias Nordström","Fredrik Jäderling","Svein Reidar Kjosavik","Martin Eklund"],"pdf_url":"https://arxiv.org/pdf/2308.05344v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.05068v2","updated":"2023-08-10T04:26:42Z","published":"2023-08-09T16:58:03Z","title":"Geometric Learning-Based Transformer Network for Estimation of\n Segmentation Errors","summary":" Many segmentation networks have been proposed for 3D volumetric segmentation\nof tumors and organs at risk. Hospitals and clinical institutions seek to\naccelerate and minimize the efforts of specialists in image segmentation.\nStill, in case of errors generated by these networks, clinicians would have to\nmanually edit the generated segmentation maps. Given a 3D volume and its\nputative segmentation map, we propose an approach to identify and measure\nerroneous regions in the segmentation map. Our method can estimate error at any\npoint or node in a 3D mesh generated from a possibly erroneous volumetric\nsegmentation map, serving as a Quality Assurance tool. We propose a graph\nneural network-based transformer based on the Nodeformer architecture to\nmeasure and classify the segmentation errors at any point. We have evaluated\nour network on a high-resolution micro-CT dataset of the human inner-ear bony\nlabyrinth structure by simulating erroneous 3D segmentation maps. Our network\nincorporates a convolutional encoder to compute node-centric features from the\ninput micro-CT data, the Nodeformer to learn the latent graph embeddings, and a\nMulti-Layer Perceptron (MLP) to compute and classify the node-wise errors. Our\nnetwork achieves a mean absolute error of ~0.042 over other Graph Neural\nNetworks (GNN) and an accuracy of 79.53% over other GNNs in estimating and\nclassifying the node-wise errors, respectively. We also put forth vertex-normal\nprediction as a custom pretext task for pre-training the CNN encoder to improve\nthe network's overall performance. 
Qualitative analysis shows the efficiency of\nour network in correctly classifying errors and reducing misclassifications.\n","authors":["Sneha Sree C","Mohammad Al Fahim","Keerthi Ram","Mohanasankar Sivaprakasam"],"pdf_url":"https://arxiv.org/pdf/2308.05068v2.pdf","comment":"Accepted in MICCAI workshop on ShapeMI, 2023"},{"id":"http://arxiv.org/abs/2304.02970v3","updated":"2023-08-10T04:08:44Z","published":"2023-04-06T09:54:06Z","title":"A Closer Look at Audio-Visual Semantic Segmentation","summary":" Audio-visual segmentation (AVS) is a complex task that involves accurately\nsegmenting the corresponding sounding object based on audio-visual queries.\nSuccessful audio-visual learning requires two essential components: 1) an\nunbiased dataset with high-quality pixel-level multi-class labels, and 2) a\nmodel capable of effectively linking audio information with its corresponding\nvisual object. However, these two requirements are only partially addressed by\ncurrent methods, with training sets containing biased audio-visual data, and\nmodels that generalise poorly beyond this biased training set. In this work, we\npropose a new strategy to build cost-effective and relatively unbiased\naudio-visual semantic segmentation benchmarks. Our strategy, called Visual\nPost-production (VPO), explores the observation that it is not necessary to\nhave explicit audio-visual pairs extracted from single video sources to build\nsuch benchmarks. We also refine the previously proposed AVSBench to transform\nit into the audio-visual semantic segmentation benchmark AVSBench-Single+.\nFurthermore, this paper introduces a new pixel-wise audio-visual contrastive\nlearning method to enable a better generalisation of the model beyond the\ntraining set. We verify the validity of the VPO strategy by showing that\nstate-of-the-art (SOTA) models trained with datasets built by matching audio\nand visual data from different sources or with datasets containing audio and\nvisual data from the same video source produce almost the same accuracy. Then,\nusing the proposed VPO benchmarks and AVSBench-Single+, we show that our method\nproduces more accurate audio-visual semantic segmentation than SOTA models.\nCode and dataset will be available.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07304v2","updated":"2023-08-10T04:04:37Z","published":"2023-05-12T08:19:39Z","title":"CLIP-Count: Towards Text-Guided Zero-Shot Object Counting","summary":" Recent advances in visual-language models have shown remarkable zero-shot\ntext-image matching ability that is transferable to downstream tasks such as\nobject detection and segmentation. Adapting these models for object counting,\nhowever, remains a formidable challenge. In this study, we first investigate\ntransferring vision-language models (VLMs) for class-agnostic object counting.\nSpecifically, we propose CLIP-Count, the first end-to-end pipeline that\nestimates density maps for open-vocabulary objects with text guidance in a\nzero-shot manner. To align the text embedding with dense visual features, we\nintroduce a patch-text contrastive loss that guides the model to learn\ninformative patch-level visual representations for dense prediction. Moreover,\nwe design a hierarchical patch-text interaction module to propagate semantic\ninformation across different resolution levels of visual features. 
Benefiting\nfrom the full exploitation of the rich image-text alignment knowledge of\npretrained VLMs, our method effectively generates high-quality density maps for\nobjects-of-interest. Extensive experiments on FSC-147, CARPK, and ShanghaiTech\ncrowd counting datasets demonstrate state-of-the-art accuracy and\ngeneralizability of the proposed method. Code is available:\nhttps://github.com/songrise/CLIP-Count.\n","authors":["Ruixiang Jiang","Lingbo Liu","Changwen Chen"],"pdf_url":"https://arxiv.org/pdf/2305.07304v2.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.04904v2","updated":"2023-08-10T03:52:49Z","published":"2023-08-09T12:04:36Z","title":"StableVQA: A Deep No-Reference Quality Assessment Model for Video\n Stability","summary":" Video shakiness is an unpleasant distortion of User Generated Content (UGC)\nvideos, which is usually caused by the unstable hold of cameras. In recent\nyears, many video stabilization algorithms have been proposed, yet no specific\nand accurate metric enables comprehensively evaluating the stability of videos.\nIndeed, most existing quality assessment models evaluate video quality as a\nwhole without specifically taking the subjective experience of video stability\ninto consideration. Therefore, these models cannot measure the video stability\nexplicitly and precisely when severe shakes are present. In addition, there is\nno large-scale video database in public that includes various degrees of shaky\nvideos with the corresponding subjective scores available, which hinders the\ndevelopment of Video Quality Assessment for Stability (VQA-S). To this end, we\nbuild a new database named StableDB that contains 1,952 diversely-shaky UGC\nvideos, where each video has a Mean Opinion Score (MOS) on the degree of video\nstability rated by 34 subjects. Moreover, we elaborately design a novel VQA-S\nmodel named StableVQA, which consists of three feature extractors to acquire\nthe optical flow, semantic, and blur features respectively, and a regression\nlayer to predict the final stability score. Extensive experiments demonstrate\nthat the StableVQA achieves a higher correlation with subjective opinions than\nthe existing VQA-S models and generic VQA models. The database and codes are\navailable at https://github.com/QMME/StableVQA.\n","authors":["Tengchuan Kou","Xiaohong Liu","Wei Sun","Jun Jia","Xiongkuo Min","Guangtao Zhai","Ning Liu"],"pdf_url":"https://arxiv.org/pdf/2308.04904v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05320v1","updated":"2023-08-10T03:44:10Z","published":"2023-08-10T03:44:10Z","title":"Adv-Inpainting: Generating Natural and Transferable Adversarial Patch\n via Attention-guided Feature Fusion","summary":" The rudimentary adversarial attacks utilize additive noise to attack facial\nrecognition (FR) models. However, because manipulating the total face is\nimpractical in the physical setting, most real-world FR attacks are based on\nadversarial patches, which limit perturbations to a small area. Previous\nadversarial patch attacks often resulted in unnatural patterns and clear\nboundaries that were easily noticeable. In this paper, we argue that generating\nadversarial patches with plausible content can result in stronger\ntransferability than using additive noise or directly sampling from the latent\nspace. To generate natural-looking and highly transferable adversarial patches,\nwe propose an innovative two-stage coarse-to-fine attack framework called\nAdv-Inpainting. 
In the first stage, we propose an attention-guided StyleGAN\n(Att-StyleGAN) that adaptively combines texture and identity features based on\nthe attention map to generate high-transferable and natural adversarial\npatches. In the second stage, we design a refinement network with a new\nboundary variance loss to further improve the coherence between the patch and\nits surrounding area. Experiment results demonstrate that Adv-Inpainting is\nstealthy and can produce adversarial patches with stronger transferability and\nimproved visual quality than previous adversarial patch attacks.\n","authors":["Yanjie Li","Mingxing Duan","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.05320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11316v2","updated":"2023-08-10T03:41:10Z","published":"2023-03-20T17:55:37Z","title":"Generative Semantic Segmentation","summary":" We present Generative Semantic Segmentation (GSS), a generative learning\napproach for semantic segmentation. Uniquely, we cast semantic segmentation as\nan image-conditioned mask generation problem. This is achieved by replacing the\nconventional per-pixel discriminative learning with a latent prior learning\nprocess. Specifically, we model the variational posterior distribution of\nlatent variables given the segmentation mask. To that end, the segmentation\nmask is expressed with a special type of image (dubbed as maskige). This\nposterior distribution allows to generate segmentation masks unconditionally.\nTo achieve semantic segmentation on a given image, we further introduce a\nconditioning network. It is optimized by minimizing the divergence between the\nposterior distribution of maskige (i.e., segmentation masks) and the latent\nprior distribution of input training images. Extensive experiments on standard\nbenchmarks show that our GSS can perform competitively to prior art\nalternatives in the standard semantic segmentation setting, whilst achieving a\nnew state of the art in the more challenging cross-domain setting.\n","authors":["Jiaqi Chen","Jiachen Lu","Xiatian Zhu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.11316v2.pdf","comment":"To appear at CVPR2023, code at http://github.com/fudan-zvg/GSS"},{"id":"http://arxiv.org/abs/2308.05318v1","updated":"2023-08-10T03:14:19Z","published":"2023-08-10T03:14:19Z","title":"RLSAC: Reinforcement Learning enhanced Sample Consensus for End-to-End\n Robust Estimation","summary":" Robust estimation is a crucial and still challenging task, which involves\nestimating model parameters in noisy environments. Although conventional\nsampling consensus-based algorithms sample several times to achieve robustness,\nthese algorithms cannot use data features and historical information\neffectively. In this paper, we propose RLSAC, a novel Reinforcement Learning\nenhanced SAmple Consensus framework for end-to-end robust estimation. RLSAC\nemploys a graph neural network to utilize both data and memory features to\nguide exploring directions for sampling the next minimum set. The feedback of\ndownstream tasks serves as the reward for unsupervised training. Therefore,\nRLSAC can avoid differentiating to learn the features and the feedback of\ndownstream tasks for end-to-end robust estimation. In addition, RLSAC\nintegrates a state transition module that encodes both data and memory\nfeatures. Our experimental results demonstrate that RLSAC can learn from\nfeatures to gradually explore a better hypothesis. 
Through analysis, it is\napparent that RLSAC can be easily transferred to other sampling consensus-based\nrobust estimation tasks. To the best of our knowledge, RLSAC is also the first\nmethod that uses reinforcement learning to sample consensus for end-to-end\nrobust estimation. We release our codes at https://github.com/IRMVLab/RLSAC.\n","authors":["Chang Nie","Guangming Wang","Zhe Liu","Luca Cavalli","Marc Pollefeys","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05318v1.pdf","comment":"Accepted by ICCV2023. Codes are released at\n https://github.com/IRMVLab/RLSAC"},{"id":"http://arxiv.org/abs/2308.05314v1","updated":"2023-08-10T03:07:28Z","published":"2023-08-10T03:07:28Z","title":"Deep Semantic Graph Matching for Large-scale Outdoor Point Clouds\n Registration","summary":" The current point cloud registration methods are mainly based on geometric\ninformation and usually ignore the semantic information in the point clouds. In\nthis paper, we treat the point cloud registration problem as a semantic instance\nmatching and registration task, and propose a deep semantic graph matching\nmethod for large-scale outdoor point cloud registration. Firstly, the semantic\ncategory labels of 3D point clouds are obtained by utilizing a large-scale point\ncloud semantic segmentation network. The adjacent points with the same category\nlabels are then clustered together by using the Euclidean clustering algorithm to\nobtain the semantic instances. Secondly, the semantic adjacency graph is\nconstructed based on the spatial adjacency relation of semantic instances.\nThree kinds of high-dimensional features including geometric shape features,\nsemantic categorical features and spatial distribution features are learned\nthrough a graph convolutional network, and enhanced based on an attention mechanism.\nThirdly, the semantic instance matching problem is modeled as an optimal\ntransport problem, and solved through an optimal matching layer. Finally,\naccording to the matched semantic instances, the geometric transformation\nmatrix between two point clouds is first obtained by the SVD algorithm and then\nrefined by the ICP algorithm. The experiments are conducted on the KITTI Odometry\ndataset, and the average relative translation error and average relative\nrotation error of the proposed method are 6.6cm and 0.229{\\deg}, respectively.\n","authors":["Shaocong Liu","Tao Wang","Yan Zhang","Ruqin Zhou","Li Li","Chenguang Dai","Yongsheng Zhang","Hanyun Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05314v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.05311v1","updated":"2023-08-10T02:59:40Z","published":"2023-08-10T02:59:40Z","title":"DAOT: Domain-Agnostically Aligned Optimal Transport for Domain-Adaptive\n Crowd Counting","summary":" Domain adaptation is commonly employed in crowd counting to bridge the domain\ngaps between different datasets. However, existing domain adaptation methods\ntend to focus on inter-dataset differences while overlooking the\nintra-differences within the same dataset, leading to additional learning\nambiguities. These domain-agnostic factors, e.g., density, surveillance\nperspective, and scale, can cause significant in-domain variations, and the\nmisalignment of these factors across domains can lead to a drop in performance\nin cross-domain crowd counting. To address this issue, we propose a\nDomain-agnostically Aligned Optimal Transport (DAOT) strategy that aligns\ndomain-agnostic factors between domains. 
The DAOT consists of three steps.\nFirst, individual-level differences in domain-agnostic factors are measured\nusing structural similarity (SSIM). Second, the optimal transfer (OT) strategy\nis employed to smooth out these differences and find the optimal\ndomain-to-domain misalignment, with outlier individuals removed via a virtual\n\"dustbin\" column. Third, knowledge is transferred based on the aligned\ndomain-agnostic factors, and the model is retrained for domain adaptation to\nbridge the gap across domains. We conduct extensive experiments on five\nstandard crowd-counting benchmarks and demonstrate that the proposed method has\nstrong generalizability across diverse datasets. Our code will be available at:\nhttps://github.com/HopooLinZ/DAOT/.\n","authors":["Huilin Zhu","Jingling Yuan","Xian Zhong","Zhengwei Yang","Zheng Wang","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2308.05311v1.pdf","comment":"11 pages, 12 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.05305v1","updated":"2023-08-10T02:48:57Z","published":"2023-08-10T02:48:57Z","title":"From CNN to Transformer: A Review of Medical Image Segmentation Models","summary":" Medical image segmentation is an important step in medical image analysis,\nespecially as a crucial prerequisite for efficient disease diagnosis and\ntreatment. The use of deep learning for image segmentation has become a\nprevalent trend. The widely adopted approach currently is U-Net and its\nvariants. Additionally, with the remarkable success of pre-trained models in\nnatural language processing tasks, transformer-based models like TransUNet have\nachieved desirable performance on multiple medical image segmentation datasets.\nIn this paper, we conduct a survey of the most representative four medical\nimage segmentation models in recent years. We theoretically analyze the\ncharacteristics of these models and quantitatively evaluate their performance\non two benchmark datasets (i.e., Tuberculosis Chest X-rays and ovarian tumors).\nFinally, we discuss the main challenges and future trends in medical image\nsegmentation. Our work can assist researchers in the related field to quickly\nestablish medical segmentation models tailored to specific regions.\n","authors":["Wenjian Yao","Jiajun Bai","Wei Liao","Yuheng Chen","Mengjuan Liu","Yao Xie"],"pdf_url":"https://arxiv.org/pdf/2308.05305v1.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.05303v1","updated":"2023-08-10T02:47:36Z","published":"2023-08-10T02:47:36Z","title":"Multi-Visual-Inertial System: Analysis,Calibration and Estimation","summary":" In this paper, we study state estimation of multi-visual-inertial systems\n(MVIS) and develop sensor fusion algorithms to optimally fuse an arbitrary\nnumber of asynchronous inertial measurement units (IMUs) or gyroscopes and\nglobal and(or) rolling shutter cameras. We are especially interested in the\nfull calibration of the associated visual-inertial sensors, including the IMU\nor camera intrinsics and the IMU-IMU(or camera) spatiotemporal extrinsics as\nwell as the image readout time of rolling-shutter cameras (if used). To this\nend, we develop a new analytic combined IMU integration with intrinsics-termed\nACI3-to preintegrate IMU measurements, which is leveraged to fuse auxiliary\nIMUs and(or) gyroscopes alongside a base IMU. 
We model the multi-inertial\nmeasurements to include all the necessary inertial intrinsic and IMU-IMU\nspatiotemporal extrinsic parameters, while leveraging IMU-IMU rigid-body\nconstraints to eliminate the necessity of auxiliary inertial poses and thus\nreducing computational complexity. By performing observability analysis of\nMVIS, we prove that the standard four unobservable directions remain - no\nmatter how many inertial sensors are used, and also identify, for the first\ntime, degenerate motions for IMU-IMU spatiotemporal extrinsics and auxiliary\ninertial intrinsics. In addition to the extensive simulations that validate our\nanalysis and algorithms, we have built our own MVIS sensor rig and collected\nover 25 real-world datasets to experimentally verify the proposed calibration\nagainst state-of-the-art calibration methods such as Kalibr. We show that\nthe proposed MVIS calibration achieves competitive accuracy with\nimproved convergence and repeatability, and is open sourced to better benefit\nthe community.\n","authors":["Yulin Yang","Patrick Geneva","Guoquan Huang"],"pdf_url":"https://arxiv.org/pdf/2308.05303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16361v2","updated":"2023-08-10T02:45:55Z","published":"2023-07-31T01:34:24Z","title":"Benchmarking and Analyzing Robust Point Cloud Recognition: Bag of Tricks\n for Defending Adversarial Examples","summary":" Deep Neural Networks (DNNs) for 3D point cloud recognition are vulnerable to\nadversarial examples, threatening their practical deployment. Despite the many\nresearch endeavors made to tackle this issue in recent years, the\ndiversity of adversarial examples on 3D point clouds makes them more\nchallenging to defend against than those on 2D images. For example, attackers\ncan generate adversarial examples by adding, shifting, or removing points.\nConsequently, existing defense strategies struggle to counter unseen point\ncloud adversarial examples. In this paper, we first establish a comprehensive\nand rigorous point cloud adversarial robustness benchmark to evaluate\nadversarial robustness, which can provide a detailed understanding of the\neffects of the defense and attack methods. We then collect existing defense\ntricks in point cloud adversarial defenses and perform extensive and\nsystematic experiments to identify an effective combination of these tricks.\nFurthermore, we propose a hybrid training augmentation method that incorporates\nvarious types of point cloud adversarial examples into adversarial training,\nsignificantly improving the adversarial robustness. By combining these tricks,\nwe construct a more robust defense framework achieving an average accuracy of\n83.45\\% against various attacks, demonstrating its capability to enable\nrobust learners. Our codebase is open-sourced at:\n\\url{https://github.com/qiufan319/benchmark_pc_attack.git}.\n","authors":["Qiufan Ji","Lin Wang","Cong Shi","Shengshan Hu","Yingying Chen","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2307.16361v2.pdf","comment":"8 pages 6 figures"},{"id":"http://arxiv.org/abs/2308.05298v1","updated":"2023-08-10T02:41:18Z","published":"2023-08-10T02:41:18Z","title":"Double-chain Constraints for 3D Human Pose Estimation in Images and\n Videos","summary":" Reconstructing 3D poses from 2D poses lacking depth information is\nparticularly challenging due to the complexity and diversity of human motion.\nThe key is to effectively model the spatial constraints between joints to\nleverage their inherent dependencies. 
Thus, we propose a novel model, called\nDouble-chain Graph Convolutional Transformer (DC-GCT), to constrain the pose\nthrough a double-chain design consisting of local-to-global and global-to-local\nchains to obtain a complex representation more suitable for the current human\npose. Specifically, we combine the advantages of GCN and Transformer and design\na Local Constraint Module (LCM) based on GCN and a Global Constraint Module\n(GCM) based on self-attention mechanism as well as a Feature Interaction Module\n(FIM). The proposed method fully captures the multi-level dependencies between\nhuman body joints to optimize the modeling capability of the model. Moreover,\nwe propose a method to use temporal information into the single-frame model by\nguiding the video sequence embedding through the joint embedding of the target\nframe, with negligible increase in computational cost. Experimental results\ndemonstrate that DC-GCT achieves state-of-the-art performance on two\nchallenging datasets (Human3.6M and MPI-INF-3DHP). Notably, our model achieves\nstate-of-the-art performance on all action categories in the Human3.6M dataset\nusing detected 2D poses from CPN, and our code is available at:\nhttps://github.com/KHB1698/DC-GCT.\n","authors":["Hongbo Kang","Yong Wang","Mengyuan Liu","Doudou Wu","Peng Liu","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2308.05298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12384v3","updated":"2023-08-10T02:39:22Z","published":"2023-03-22T08:47:37Z","title":"RegFormer: An Efficient Projection-Aware Transformer Network for\n Large-Scale Point Cloud Registration","summary":" Although point cloud registration has achieved remarkable advances in\nobject-level and indoor scenes, large-scale registration methods are rarely\nexplored. Challenges mainly arise from the huge point number, complex\ndistribution, and outliers of outdoor LiDAR scans. In addition, most existing\nregistration works generally adopt a two-stage paradigm: They first find\ncorrespondences by extracting discriminative local features and then leverage\nestimators (eg. RANSAC) to filter outliers, which are highly dependent on\nwell-designed descriptors and post-processing choices. To address these\nproblems, we propose an end-to-end transformer network (RegFormer) for\nlarge-scale point cloud alignment without any further post-processing.\nSpecifically, a projection-aware hierarchical transformer is proposed to\ncapture long-range dependencies and filter outliers by extracting point\nfeatures globally. Our transformer has linear complexity, which guarantees high\nefficiency even for large-scale scenes. Furthermore, to effectively reduce\nmismatches, a bijective association transformer is designed for regressing the\ninitial transformation. Extensive experiments on KITTI and NuScenes datasets\ndemonstrate that our RegFormer achieves competitive performance in terms of\nboth accuracy and efficiency.\n","authors":["Jiuming Liu","Guangming Wang","Zhe Liu","Chaokang Jiang","Marc Pollefeys","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2303.12384v3.pdf","comment":"Accepted by ICCV2023. Codes are released at\n https://github.com/IRMVLab/RegFormer"},{"id":"http://arxiv.org/abs/2308.03463v3","updated":"2023-08-10T02:26:16Z","published":"2023-08-07T10:41:52Z","title":"DiffSynth: Latent In-Iteration Deflickering for Realistic Video\n Synthesis","summary":" In recent years, diffusion models have emerged as the most powerful approach\nin image synthesis. 
However, applying these models directly to video synthesis\npresents challenges, as it often leads to noticeable flickering content.\nAlthough recently proposed zero-shot methods can alleviate flicker to some\nextent, we still struggle to generate coherent videos. In this paper, we\npropose DiffSynth, a novel approach that aims to convert image synthesis\npipelines to video synthesis pipelines. DiffSynth consists of two key\ncomponents: a latent in-iteration deflickering framework and a video\ndeflickering algorithm. The latent in-iteration deflickering framework applies\nvideo deflickering to the latent space of diffusion models, effectively\npreventing flicker accumulation in intermediate steps. Additionally, we propose\na video deflickering algorithm, named patch blending algorithm, that remaps\nobjects in different frames and blends them together to enhance video\nconsistency. One of the notable advantages of DiffSynth is its general\napplicability to various video synthesis tasks, including text-guided video\nstylization, fashion video synthesis, image-guided video stylization, video\nrestoring, and 3D rendering. In the task of text-guided video stylization, we\nmake it possible to synthesize high-quality videos without cherry-picking. The\nexperimental results demonstrate the effectiveness of DiffSynth. All videos can\nbe viewed on our project page. Source codes will also be released.\n","authors":["Zhongjie Duan","Lizhou You","Chengyu Wang","Cen Chen","Ziheng Wu","Weining Qian","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2308.03463v3.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2306.11029v2","updated":"2023-08-10T02:05:45Z","published":"2023-06-19T15:46:41Z","title":"RemoteCLIP: A Vision Language Foundation Model for Remote Sensing","summary":" General-purpose foundation models have become increasingly important in the\nfield of artificial intelligence. While self-supervised learning (SSL) and\nMasked Image Modeling (MIM) have led to promising results in building such\nfoundation models for remote sensing, these models primarily learn low-level\nfeatures, require annotated data for fine-tuning, and are not applicable to\nretrieval and zero-shot applications due to the lack of language understanding.\nIn response to these limitations, we propose RemoteCLIP, the first\nvision-language foundation model for remote sensing that aims to learn robust\nvisual features with rich semantics, as well as aligned text embeddings for\nseamless downstream application. To address the scarcity of pre-training data,\nwe leverage data scaling, converting heterogeneous annotations based on\nBox-to-Caption (B2C) and Mask-to-Box (M2B) conversion, and further\nincorporating UAV imagery, resulting in a 12x larger pretraining dataset.\nRemoteCLIP can be applied to a variety of downstream tasks, including zero-shot\nimage classification, linear probing, k-NN classification, few-shot\nclassification, image-text retrieval, and object counting. Evaluations on 16\ndatasets, including a newly introduced RemoteCount benchmark to test the object\ncounting ability, show that RemoteCLIP consistently outperforms baseline\nfoundation models across different model scales. Impressively, RemoteCLIP\noutperforms the previous SoTA by 9.14% mean recall on the RSITMD dataset and by 8.92% on\nthe RSICD dataset. 
For zero-shot classification, our RemoteCLIP outperforms the CLIP\nbaseline by up to 6.39% average accuracy on 12 downstream datasets. Pretrained\nmodels are available at https://github.com/ChenDelong1999/RemoteCLIP .\n","authors":["Fan Liu","Delong Chen","Zhangqingyun Guan","Xiaocong Zhou","Jiale Zhu","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.11029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05286v1","updated":"2023-08-10T02:04:01Z","published":"2023-08-10T02:04:01Z","title":"Informative Scene Graph Generation via Debiasing","summary":" Scene graph generation aims to detect visual relationship triplets, (subject,\npredicate, object). Due to biases in data, current models tend to predict\ncommon predicates, e.g. \"on\" and \"at\", instead of informative ones, e.g.\n\"standing on\" and \"looking at\". This tendency results in the loss of precise\ninformation and overall performance. If a model only uses \"stone on road\"\nrather than \"stone blocking road\" to describe an image, it may be a grave\nmisunderstanding. We argue that this phenomenon is caused by two imbalances:\nsemantic space level imbalance and training sample level imbalance. For this\nproblem, we propose DB-SGG, an effective framework based on debiasing but not\nthe conventional distribution fitting. It integrates two components: Semantic\nDebiasing (SD) and Balanced Predicate Learning (BPL), for these imbalances. SD\nutilizes a confusion matrix and a bipartite graph to construct predicate\nrelationships. BPL adopts a random undersampling strategy and an ambiguity\nremoving strategy to focus on informative predicates. Benefiting from the\nmodel-agnostic process, our method can be easily applied to SGG models and\noutperforms Transformer by 136.3%, 119.5%, and 122.6% on mR@20 at three SGG\nsub-tasks on the SGG-VG dataset. Our method is further verified on another\ncomplex SGG dataset (SGG-GQA) and two downstream tasks (sentence-to-graph\nretrieval and image captioning).\n","authors":["Lianli Gao","Xinyu Lyu","Yuyu Guo","Yuxuan Hu","Yuan-Fang Li","Lu Xu","Heng Tao Shen","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2308.05286v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2108.13129"},{"id":"http://arxiv.org/abs/2308.04952v2","updated":"2023-08-10T01:59:23Z","published":"2023-08-09T13:38:52Z","title":"Prototypical Kernel Learning and Open-set Foreground Perception for\n Generalized Few-shot Semantic Segmentation","summary":" Generalized Few-shot Semantic Segmentation (GFSS) extends Few-shot Semantic\nSegmentation (FSS) to simultaneously segment unseen classes and seen classes\nduring evaluation. Previous works leverage an additional branch or prototypical\naggregation to eliminate the constrained setting of FSS. However,\nrepresentation division and embedding prejudice, which heavily result in poor\nperformance of GFSS, have not been synthetically considered. We address the\naforementioned problems by combining prototypical kernel learning and\nopen-set foreground perception. Specifically, a group of learnable kernels is\nproposed to perform segmentation with each kernel in charge of a stuff class.\nThen, we explore merging the prototypical learning into the update of base-class\nkernels, which is consistent with the prototype knowledge aggregation of\nfew-shot novel classes. 
In addition, a foreground contextual perception module\ncooperating with conditional bias based inference is adopted to perform\nclass-agnostic as well as open-set foreground detection, thus to mitigate the\nembedding prejudice and prevent novel targets from being misclassified as\nbackground. Moreover, we also adjust our method to the Class Incremental\nFew-shot Semantic Segmentation (CIFSS) which takes the knowledge of novel\nclasses in a incremental stream. Extensive experiments on PASCAL-5i and\nCOCO-20i datasets demonstrate that our method performs better than previous\nstate-of-the-art.\n","authors":["Kai Huang","Feigege Wang","Ye Xi","Yutao Gao"],"pdf_url":"https://arxiv.org/pdf/2308.04952v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.04733v2","updated":"2023-08-10T01:51:55Z","published":"2023-08-09T06:59:29Z","title":"TextPainter: Multimodal Text Image Generation withVisual-harmony and\n Text-comprehension for Poster Design","summary":" Text design is one of the most critical procedures in poster design, as it\nrelies heavily on the creativity and expertise of humans to design text images\nconsidering the visual harmony and text-semantic. This study introduces\nTextPainter, a novel multimodal approach that leverages contextual visual\ninformation and corresponding text semantics to generate text images.\nSpecifically, TextPainter takes the global-local background image as a hint of\nstyle and guides the text image generation with visual harmony. Furthermore, we\nleverage the language model and introduce a text comprehension module to\nachieve both sentence-level and word-level style variations. Besides, we\nconstruct the PosterT80K dataset, consisting of about 80K posters annotated\nwith sentence-level bounding boxes and text contents. We hope this dataset will\npave the way for further research on multimodal text image generation.\nExtensive quantitative and qualitative experiments demonstrate that TextPainter\ncan generate visually-and-semantically-harmonious text images for posters.\n","authors":["Yifan Gao","Jinpeng Lin","Min Zhou","Chuanbin Liu","Hongtao Xie","Tiezheng Ge","Yuning Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.04733v2.pdf","comment":"Accepted to ACM MM 2023. Dataset Link:\n https://tianchi.aliyun.com/dataset/160034"},{"id":"http://arxiv.org/abs/2307.14527v2","updated":"2023-08-10T01:46:11Z","published":"2023-07-26T22:09:29Z","title":"Open Problems in Computer Vision for Wilderness SAR and The Search for\n Patricia Wu-Murad","summary":" This paper details the challenges in applying two computer vision systems, an\nEfficientDET supervised learning model and the unsupervised RX spectral\nclassifier, to 98.9 GB of drone imagery from the Wu-Murad wilderness search and\nrescue (WSAR) effort in Japan and identifies 3 directions for future research.\nThere have been at least 19 proposed approaches and 3 datasets aimed at\nlocating missing persons in drone imagery, but only 3 approaches (2\nunsupervised and 1 of an unknown structure) are referenced in the literature as\nhaving been used in an actual WSAR operation. Of these proposed approaches, the\nEfficientDET architecture and the unsupervised spectral RX classifier were\nselected as the most appropriate for this setting. 
The EfficientDET model was\napplied to the HERIDAL dataset and despite achieving performance that is\nstatistically equivalent to the state-of-the-art, the model fails to translate\nto the real world in terms of false positives (e.g., identifying tree limbs and\nrocks as people), and false negatives (e.g., failing to identify members of the\nsearch team). The poor results in practice for algorithms that showed good\nresults on datasets suggest 3 areas of future research: more realistic datasets\nfor wilderness SAR, computer vision models that are capable of seamlessly\nhandling the variety of imagery that can be collected during actual WSAR\noperations, and better alignment on performance measures.\n","authors":["Thomas Manzini","Robin Murphy"],"pdf_url":"https://arxiv.org/pdf/2307.14527v2.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.05274v1","updated":"2023-08-10T01:24:25Z","published":"2023-08-10T01:24:25Z","title":"Local-Global Information Interaction Debiasing for Dynamic Scene Graph\n Generation","summary":" The task of dynamic scene graph generation (DynSGG) aims to generate scene\ngraphs for given videos, which involves modeling the spatial-temporal\ninformation in the video. However, due to the long-tailed distribution of\nsamples in the dataset, previous DynSGG models fail to predict the tail\npredicates. We argue that this phenomenon is due to previous methods that only\npay attention to the local spatial-temporal information and neglect the\nconsistency of multiple frames. To solve this problem, we propose a novel\nDynSGG model based on multi-task learning, DynSGG-MTL, which introduces the\nlocal interaction information and global human-action interaction information.\nThe interaction between objects and frame features makes the model more fully\nunderstand the visual context of the single image. Long-temporal human actions\nsupervise the model to generate multiple scene graphs that conform to the\nglobal constraints and avoid the model being unable to learn the tail\npredicates. Extensive experiments on Action Genome dataset demonstrate the\nefficacy of our proposed framework, which not only improves the dynamic scene\ngraph generation but also alleviates the long-tail problem.\n","authors":["Xinyu Lyu","Jingwei Liu","Yuyu Guo","Lianli Gao"],"pdf_url":"https://arxiv.org/pdf/2308.05274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05264v1","updated":"2023-08-10T00:26:34Z","published":"2023-08-10T00:26:34Z","title":"TrainFors: A Large Benchmark Training Dataset for Image Manipulation\n Detection and Localization","summary":" The evaluation datasets and metrics for image manipulation detection and\nlocalization (IMDL) research have been standardized. But the training dataset\nfor such a task is still nonstandard. Previous researchers have used\nunconventional and deviating datasets to train neural networks for detecting\nimage forgeries and localizing pixel maps of manipulated regions. For a fair\ncomparison, the training set, test set, and evaluation metrics should be\npersistent. Hence, comparing the existing methods may not seem fair as the\nresults depend heavily on the training datasets as well as the model\narchitecture. Moreover, none of the previous works release the synthetic\ntraining dataset used for the IMDL task. We propose a standardized benchmark\ntraining dataset for image splicing, copy-move forgery, removal forgery, and\nimage enhancement forgery. 
Furthermore, we identify the problems with the\nexisting IMDL datasets and propose the required modifications. We also train\nthe state-of-the-art IMDL methods on our proposed TrainFors1 dataset for a fair\nevaluation and report the actual performance of these methods under similar\nconditions.\n","authors":["Soumyaroop Nandi","Prem Natarajan","Wael Abd-Almageed"],"pdf_url":"https://arxiv.org/pdf/2308.05264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08095v2","updated":"2023-08-10T00:22:27Z","published":"2022-11-15T12:25:33Z","title":"Will Large-scale Generative Models Corrupt Future Datasets?","summary":" Recently proposed large-scale text-to-image generative models such as\nDALL$\\cdot$E 2, Midjourney, and StableDiffusion can generate high-quality and\nrealistic images from users' prompts. Not limited to the research community,\nordinary Internet users enjoy these generative models, and consequently, a\ntremendous amount of generated images have been shared on the Internet.\nMeanwhile, today's success of deep learning in the computer vision field owes a\nlot to images collected from the Internet. These trends lead us to a research\nquestion: \"\\textbf{will such generated images impact the quality of future\ndatasets and the performance of computer vision models positively or\nnegatively?}\" This paper empirically answers this question by simulating\ncontamination. Namely, we generate ImageNet-scale and COCO-scale datasets using\na state-of-the-art generative model and evaluate models trained with\n\"contaminated\" datasets on various tasks, including image classification and\nimage generation. Throughout experiments, we conclude that generated images\nnegatively affect downstream performance, while the significance depends on\ntasks and the amount of generated images. The generated datasets and the codes\nfor experiments will be publicly released for future research. Generated\ndatasets and source codes are available from\n\\url{https://github.com/moskomule/dataset-contamination}.\n","authors":["Ryuichiro Hataya","Han Bao","Hiromi Arai"],"pdf_url":"https://arxiv.org/pdf/2211.08095v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.05881v1","updated":"2023-08-10T23:53:07Z","published":"2023-08-10T23:53:07Z","title":"Aphid Cluster Recognition and Detection in the Wild Using Deep Learning\n Models","summary":" Aphid infestation poses a significant threat to crop production, rural\ncommunities, and global food security. While chemical pest control is crucial\nfor maximizing yields, applying chemicals across entire fields is both\nenvironmentally unsustainable and costly. Hence, precise localization and\nmanagement of aphids are essential for targeted pesticide application. The\npaper primarily focuses on using deep learning models for detecting aphid\nclusters. We propose a novel approach for estimating infection levels by\ndetecting aphid clusters. To facilitate this research, we have captured a\nlarge-scale dataset from sorghum fields, manually selected 5,447 images\ncontaining aphids, and annotated each individual aphid cluster within these\nimages. To facilitate the use of machine learning models, we further process\nthe images by cropping them into patches, resulting in a labeled dataset\ncomprising 151,380 image patches. Then, we implemented and compared the\nperformance of four state-of-the-art object detection models (VFNet, GFLV2,\nPAA, and ATSS) on the aphid dataset. 
Extensive experimental results show that\nall models yield stable similar performance in terms of average precision and\nrecall. We then propose to merge close neighboring clusters and remove tiny\nclusters caused by cropping, and the performance is further boosted by around\n17%. The study demonstrates the feasibility of automatically detecting and\nmanaging insects using machine learning models. The labeled dataset will be\nmade openly available to the research community.\n","authors":["Tianxiao Zhang","Kaidong Li","Xiangyu Chen","Cuncong Zhong","Bo Luo","Ivan Grijalva","Brian McCornack","Daniel Flippo","Ajay Sharda","Guanghui Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07972v2","updated":"2023-08-10T23:40:37Z","published":"2023-07-16T07:51:18Z","title":"Dual-level Interaction for Domain Adaptive Semantic Segmentation","summary":" Self-training approach recently secures its position in domain adaptive\nsemantic segmentation, where a model is trained with target domain\npseudo-labels. Current advances have mitigated noisy pseudo-labels resulting\nfrom the domain gap. However, they still struggle with erroneous pseudo-labels\nnear the boundaries of the semantic classifier. In this paper, we tackle this\nissue by proposing a dual-level interaction for domain adaptation (DIDA) in\nsemantic segmentation. Explicitly, we encourage the different augmented views\nof the same pixel to have not only similar class prediction (semantic-level)\nbut also akin similarity relationship with respect to other pixels\n(instance-level). As it's impossible to keep features of all pixel instances\nfor a dataset, we, therefore, maintain a labeled instance bank with dynamic\nupdating strategies to selectively store the informative features of instances.\nFurther, DIDA performs cross-level interaction with scattering and gathering\ntechniques to regenerate more reliable pseudo-labels. Our method outperforms\nthe state-of-the-art by a notable margin, especially on confusing and\nlong-tailed classes. Code is available at\n\\href{https://github.com/RainJamesY/DIDA}\n","authors":["Dongyu Yao","Boheng Li"],"pdf_url":"https://arxiv.org/pdf/2307.07972v2.pdf","comment":"Accepted to ICCVW on Uncertainty Quantification for Computer Vision\n (UnCV), 2023"},{"id":"http://arxiv.org/abs/2308.05872v1","updated":"2023-08-10T22:57:31Z","published":"2023-08-10T22:57:31Z","title":"Vision Backbone Enhancement via Multi-Stage Cross-Scale Attention","summary":" Convolutional neural networks (CNNs) and vision transformers (ViTs) have\nachieved remarkable success in various vision tasks. However, many\narchitectures do not consider interactions between feature maps from different\nstages and scales, which may limit their performance. In this work, we propose\na simple add-on attention module to overcome these limitations via multi-stage\nand cross-scale interactions. Specifically, the proposed Multi-Stage\nCross-Scale Attention (\\meth) module takes feature maps from different stages\nto enable multi-stage interactions and achieves cross-scale interactions by\ncomputing self-attention at different scales based on the multi-stage feature\nmaps. Our experiments on several downstream tasks show that \\meth~provides a\nsignificant performance boost with modest additional FLOPs and runtime.\n","authors":["Liang Shang","Yanli Liu","Zhengyang Lou","Shuxue Quan","Nagesh Adluru","Bochen Guan","William A. 
Sethares"],"pdf_url":"https://arxiv.org/pdf/2308.05872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05864v1","updated":"2023-08-10T21:59:23Z","published":"2023-08-10T21:59:23Z","title":"The Multi-modality Cell Segmentation Challenge: Towards Universal\n Solutions","summary":" Cell segmentation is a critical step for quantitative single-cell analysis in\nmicroscopy images. Existing cell segmentation methods are often tailored to\nspecific modalities or require manual interventions to specify hyperparameters\nin different experimental settings. Here, we present a multi-modality cell\nsegmentation benchmark, comprising over 1500 labeled images derived from more\nthan 50 diverse biological experiments. The top participants developed a\nTransformer-based deep-learning algorithm that not only exceeds existing\nmethods, but can also be applied to diverse microscopy images across imaging\nplatforms and tissue types without manual parameter adjustments. This benchmark\nand the improved algorithm offer promising avenues for more accurate and\nversatile cell analysis in microscopy imaging.\n","authors":["Jun Ma","Ronald Xie","Shamini Ayyadhury","Cheng Ge","Anubha Gupta","Ritu Gupta","Song Gu","Yao Zhang","Gihun Lee","Joonkee Kim","Wei Lou","Haofeng Li","Eric Upschulte","Timo Dickscheid","José Guilherme de Almeida","Yixin Wang","Lin Han","Xin Yang","Marco Labagnara","Sahand Jamal Rahi","Carly Kempster","Alice Pollitt","Leon Espinosa","Tâm Mignot","Jan Moritz Middeke","Jan-Niklas Eckardt","Wangkai Li","Zhaoyang Li","Xiaochen Cai","Bizhe Bai","Noah F. Greenwald","David Van Valen","Erin Weisbart","Beth A. Cimini","Zhuoshi Li","Chao Zuo","Oscar Brück","Gary D. Bader","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05864v1.pdf","comment":"NeurIPS22 Cell Segmentation Challenge:\n https://neurips22-cellseg.grand-challenge.org/"},{"id":"http://arxiv.org/abs/2308.05862v1","updated":"2023-08-10T21:51:48Z","published":"2023-08-10T21:51:48Z","title":"Unleashing the Strengths of Unlabeled Data in Pan-cancer Abdominal Organ\n Quantification: the FLARE22 Challenge","summary":" Quantitative organ assessment is an essential step in automated abdominal\ndisease diagnosis and treatment planning. Artificial intelligence (AI) has\nshown great potential to automatize this process. However, most existing AI\nalgorithms rely on many expert annotations and lack a comprehensive evaluation\nof accuracy and efficiency in real-world multinational settings. To overcome\nthese limitations, we organized the FLARE 2022 Challenge, the largest abdominal\norgan analysis challenge to date, to benchmark fast, low-resource, accurate,\nannotation-efficient, and generalized AI algorithms. We constructed an\nintercontinental and multinational dataset from more than 50 medical groups,\nincluding Computed Tomography (CT) scans with different races, diseases,\nphases, and manufacturers. We independently validated that a set of AI\nalgorithms achieved a median Dice Similarity Coefficient (DSC) of 90.0\\% by\nusing 50 labeled scans and 2000 unlabeled scans, which can significantly reduce\nannotation requirements. The best-performing algorithms successfully\ngeneralized to holdout external validation sets, achieving a median DSC of\n89.5\\%, 90.9\\%, and 88.3\\% on North American, European, and Asian cohorts,\nrespectively. They also enabled automatic extraction of key organ biology\nfeatures, which was labor-intensive with traditional manual measurements. 
This\nopens the potential to use unlabeled data to boost performance and alleviate\nannotation shortages for modern AI models.\n","authors":["Jun Ma","Yao Zhang","Song Gu","Cheng Ge","Shihao Ma","Adamo Young","Cheng Zhu","Kangkang Meng","Xin Yang","Ziyan Huang","Fan Zhang","Wentao Liu","YuanKe Pan","Shoujin Huang","Jiacheng Wang","Mingze Sun","Weixin Xu","Dengqiang Jia","Jae Won Choi","Natália Alves","Bram de Wilde","Gregor Koehler","Yajun Wu","Manuel Wiesenfarth","Qiongjie Zhu","Guoqiang Dong","Jian He","the FLARE Challenge Consortium","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05862v1.pdf","comment":"MICCAI FLARE22: https://flare22.grand-challenge.org/"},{"id":"http://arxiv.org/abs/2003.03229v5","updated":"2023-08-10T21:19:32Z","published":"2020-02-02T21:09:39Z","title":"Non-linear Neurons with Human-like Apical Dendrite Activations","summary":" In order to classify linearly non-separable data, neurons are typically\norganized into multi-layer neural networks that are equipped with at least one\nhidden layer. Inspired by some recent discoveries in neuroscience, we propose a\nnew model of artificial neuron along with a novel activation function enabling\nthe learning of nonlinear decision boundaries using a single neuron. We show\nthat a standard neuron followed by our novel apical dendrite activation (ADA)\ncan learn the XOR logical function with 100% accuracy. Furthermore, we conduct\nexperiments on six benchmark data sets from computer vision, signal processing\nand natural language processing, i.e. MOROCO, UTKFace, CREMA-D, Fashion-MNIST,\nTiny ImageNet and ImageNet, showing that the ADA and the leaky ADA functions\nprovide superior results to Rectified Linear Units (ReLU), leaky ReLU, RBF and\nSwish, for various neural network architectures, e.g. one-hidden-layer or\ntwo-hidden-layer multi-layer perceptrons (MLPs) and convolutional neural\nnetworks (CNNs) such as LeNet, VGG, ResNet and Character-level CNN. We obtain\nfurther performance improvements when we change the standard model of the\nneuron with our pyramidal neuron with apical dendrite activations (PyNADA). Our\ncode is available at: https://github.com/raduionescu/pynada.\n","authors":["Mariana-Iuliana Georgescu","Radu Tudor Ionescu","Nicolae-Catalin Ristea","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2003.03229v5.pdf","comment":"Accepted for publication in Applied Intelligence"},{"id":"http://arxiv.org/abs/2212.04875v3","updated":"2023-08-10T21:05:54Z","published":"2022-12-09T14:29:57Z","title":"Expeditious Saliency-guided Mix-up through Random Gradient Thresholding","summary":" Mix-up training approaches have proven to be effective in improving the\ngeneralization ability of Deep Neural Networks. Over the years, the research\ncommunity expands mix-up methods into two directions, with extensive efforts to\nimprove saliency-guided procedures but minimal focus on the arbitrary path,\nleaving the randomization domain unexplored. In this paper, inspired by the\nsuperior qualities of each direction over one another, we introduce a novel\nmethod that lies at the junction of the two routes. By combining the best\nelements of randomness and saliency utilization, our method balances speed,\nsimplicity, and accuracy. We name our method R-Mix following the concept of\n\"Random Mix-up\". We demonstrate its effectiveness in generalization, weakly\nsupervised object localization, calibration, and robustness to adversarial\nattacks. 
Finally, in order to address the question of whether there exists a\nbetter decision protocol, we train a Reinforcement Learning agent that decides\nthe mix-up policies based on the classifier's performance, reducing dependency\non human-designed objectives and hyperparameter tuning. Extensive experiments\nfurther show that the agent is capable of performing at the cutting-edge level,\nlaying the foundation for a fully automatic mix-up. Our code is released at\n[https://github.com/minhlong94/Random-Mixup].\n","authors":["Minh-Long Luu","Zeyi Huang","Eric P. Xing","Yong Jae Lee","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2212.04875v3.pdf","comment":"Accepted Long paper at 2nd Practical-DL Workshop at AAAI 2023"},{"id":"http://arxiv.org/abs/2308.05851v1","updated":"2023-08-10T20:35:48Z","published":"2023-08-10T20:35:48Z","title":"SegDA: Maximum Separable Segment Mask with Pseudo Labels for Domain\n Adaptive Semantic Segmentation","summary":" Unsupervised Domain Adaptation (UDA) aims to solve the problem of label\nscarcity of the target domain by transferring the knowledge from the label rich\nsource domain. Usually, the source domain consists of synthetic images for\nwhich the annotation is easily obtained using the well known computer graphics\ntechniques. However, obtaining annotation for real world images (target domain)\nrequire lot of manual annotation effort and is very time consuming because it\nrequires per pixel annotation. To address this problem we propose SegDA module\nto enhance transfer performance of UDA methods by learning the maximum\nseparable segment representation. This resolves the problem of identifying\nvisually similar classes like pedestrian/rider, sidewalk/road etc. We leveraged\nEquiangular Tight Frame (ETF) classifier inspired from Neural Collapse for\nmaximal separation between segment classes. This causes the source domain pixel\nrepresentation to collapse to a single vector forming a simplex vertices which\nare aligned to the maximal separable ETF classifier. We use this phenomenon to\npropose the novel architecture for domain adaptation of segment representation\nfor target domain. Additionally, we proposed to estimate the noise in labelling\nthe target domain images and update the decoder for noise correction which\nencourages the discovery of pixels for classes not identified in pseudo labels.\nWe have used four UDA benchmarks simulating synthetic-to-real,\ndaytime-to-nighttime, clear-to-adverse weather scenarios. Our proposed approach\noutperforms +2.2 mIoU on GTA -> Cityscapes, +2.0 mIoU on Synthia -> Cityscapes,\n+5.9 mIoU on Cityscapes -> DarkZurich, +2.6 mIoU on Cityscapes -> ACDC.\n","authors":["Anant Khandelwal"],"pdf_url":"https://arxiv.org/pdf/2308.05851v1.pdf","comment":"11 pages, 4 Tables, 3 Figures, accepted at ICCVW 2023 (ICCV 2023: 4th\n Workshop on Visual Perception for Navigation in Human Environments)"},{"id":"http://arxiv.org/abs/2308.05846v1","updated":"2023-08-10T19:56:15Z","published":"2023-08-10T19:56:15Z","title":"Seed Kernel Counting using Domain Randomization and Object Tracking\n Neural Networks","summary":" High-throughput phenotyping (HTP) of seeds, also known as seed phenotyping,\nis the comprehensive assessment of complex seed traits such as growth,\ndevelopment, tolerance, resistance, ecology, yield, and the measurement of\nparameters that form more complex traits. One of the key aspects of seed\nphenotyping is cereal yield estimation that the seed production industry relies\nupon to conduct their business. 
While mechanized seed kernel counters are\navailable in the market currently, they are often priced high and sometimes\noutside the range of small scale seed production firms' affordability. The\ndevelopment of object tracking neural network models such as You Only Look Once\n(YOLO) enables computer scientists to design algorithms that can estimate\ncereal yield inexpensively. The key bottleneck with neural network models is\nthat they require a plethora of labelled training data before they can be put\nto task. We demonstrate that the use of synthetic imagery serves as a feasible\nsubstitute to train neural networks for object tracking that includes the tasks\nof object classification and detection. Furthermore, we propose a seed kernel\ncounter that uses a low-cost mechanical hopper, trained YOLOv8 neural network\nmodel, and object tracking algorithms on StrongSORT and ByteTrack to estimate\ncereal yield from videos. The experiment yields a seed kernel count with an\naccuracy of 95.2\\% and 93.2\\% for Soy and Wheat respectively using the\nStrongSORT algorithm, and an accuray of 96.8\\% and 92.4\\% for Soy and Wheat\nrespectively using the ByteTrack algorithm.\n","authors":["Venkat Margapuri","Prapti Thapaliya","Mitchell Neilsen"],"pdf_url":"https://arxiv.org/pdf/2308.05846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2105.13061v2","updated":"2023-08-10T18:54:19Z","published":"2021-05-27T11:07:09Z","title":"The Imaginative Generative Adversarial Network: Automatic Data\n Augmentation for Dynamic Skeleton-Based Hand Gesture and Human Action\n Recognition","summary":" Deep learning approaches deliver state-of-the-art performance in recognition\nof spatiotemporal human motion data. However, one of the main challenges in\nthese recognition tasks is limited available training data. Insufficient\ntraining data results in over-fitting and data augmentation is one approach to\naddress this challenge. Existing data augmentation strategies based on scaling,\nshifting and interpolating offer limited generalizability and typically require\ndetailed inspection of the dataset as well as hundreds of GPU hours for\nhyperparameter optimization. In this paper, we present a novel automatic data\naugmentation model, the Imaginative Generative Adversarial Network (GAN), that\napproximates the distribution of the input data and samples new data from this\ndistribution. It is automatic in that it requires no data inspection and little\nhyperparameter tuning and therefore it is a low-cost and low-effort approach to\ngenerate synthetic data. We demonstrate our approach on small-scale\nskeleton-based datasets with a comprehensive experimental analysis. Our results\nshow that the augmentation strategy is fast to train and can improve\nclassification accuracy for both conventional neural networks and\nstate-of-the-art methods.\n","authors":["Junxiao Shen","John Dudley","Per Ola Kristensson"],"pdf_url":"https://arxiv.org/pdf/2105.13061v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05822v1","updated":"2023-08-10T18:43:44Z","published":"2023-08-10T18:43:44Z","title":"Encode-Store-Retrieve: Enhancing Memory Augmentation through\n Language-Encoded Egocentric Perception","summary":" We depend on our own memory to encode, store, and retrieve our experiences.\nHowever, memory lapses can occur. One promising avenue for achieving memory\naugmentation is through the use of augmented reality head-mounted displays to\ncapture and preserve egocentric videos, a practice commonly referred to as life\nlogging. 
However, a significant challenge arises from the sheer volume of video\ndata generated through life logging, as the current technology lacks the\ncapability to encode and store such large amounts of data efficiently. Further,\nretrieving specific information from extensive video archives requires\nsubstantial computational power, further complicating the task of quickly\naccessing desired content. To address these challenges, we propose a memory\naugmentation system that involves leveraging natural language encoding for\nvideo data and storing them in a vector database. This approach harnesses the\npower of large vision language models to perform the language encoding process.\nAdditionally, we propose using large language models to facilitate natural\nlanguage querying. Our system underwent extensive evaluation using the QA-Ego4D\ndataset and achieved state-of-the-art results with a BLEU score of 8.3,\noutperforming conventional machine learning models that scored between 3.4 and\n5.8. Additionally, in a user study, our system received a higher mean response\nscore of 4.13/5 compared to the human participants' score of 2.46/5 on\nreal-life episodic memory tasks.\n","authors":["Junxiao Shen","John Dudley","Per Ola Kristensson"],"pdf_url":"https://arxiv.org/pdf/2308.05822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05820v1","updated":"2023-08-10T18:39:35Z","published":"2023-08-10T18:39:35Z","title":"Recognizing Handwritten Mathematical Expressions of Vertical Addition\n and Subtraction","summary":" Handwritten Mathematical Expression Recognition (HMER) is a challenging task\nwith many educational applications. Recent methods for HMER have been developed\nfor complex mathematical expressions in standard horizontal format. However,\nsolutions for elementary mathematical expression, such as vertical addition and\nsubtraction, have not been explored in the literature. This work proposes a new\nhandwritten elementary mathematical expression dataset composed of addition and\nsubtraction expressions in a vertical format. We also extended the MNIST\ndataset to generate artificial images with this structure. Furthermore, we\nproposed a solution for offline HMER, able to recognize vertical addition and\nsubtraction expressions. Our analysis evaluated the object detection algorithms\nYOLO v7, YOLO v8, YOLO-NAS, NanoDet and FCOS for identifying the mathematical\nsymbols. We also proposed a transcription method to map the bounding boxes from\nthe object detection stage to a mathematical expression in the LATEX markup\nsequence. Results show that our approach is efficient, achieving a high\nexpression recognition rate. The code and dataset are available at\nhttps://github.com/Danielgol/HME-VAS\n","authors":["Daniel Rosa","Filipe R. Cordeiro","Ruan Carvalho","Everton Souza","Sergio Chevtchenko","Luiz Rodrigues","Marcelo Marinho","Thales Vieira","Valmir Macario"],"pdf_url":"https://arxiv.org/pdf/2308.05820v1.pdf","comment":"Paper accepted at SIBGRAPI 2023"},{"id":"http://arxiv.org/abs/2308.05818v1","updated":"2023-08-10T18:35:22Z","published":"2023-08-10T18:35:22Z","title":"Absorption-Based, Passive Range Imaging from Hyperspectral Thermal\n Measurements","summary":" Passive hyperspectral long-wave infrared measurements are remarkably\ninformative about the surroundings, such as remote object material composition,\ntemperature, and range; and air temperature and gas concentrations. 
Remote\nobject material and temperature determine the spectrum of thermal radiance, and\nrange, air temperature, and gas concentrations determine how this spectrum is\nmodified by propagation to the sensor. We computationally separate these\nphenomena, introducing a novel passive range imaging method based on\natmospheric absorption of ambient thermal radiance. Previously demonstrated\npassive absorption-based ranging methods assume hot and highly emitting\nobjects. However, the temperature variation in natural scenes is usually low,\nmaking range imaging challenging. Our method benefits from explicit\nconsideration of air emission and parametric modeling of atmospheric\nabsorption. To mitigate noise in low-contrast scenarios, we jointly estimate\nrange and intrinsic object properties by exploiting a variety of absorption\nlines spread over the infrared spectrum. Along with Monte Carlo simulations\nthat demonstrate the importance of regularization, temperature differentials,\nand availability of many spectral bands, we apply this method to long-wave\ninfrared (8--13 $\\mu$m) hyperspectral image data acquired from natural scenes\nwith no active illumination. Range features from 15m to 150m are recovered,\nwith good qualitative match to unaligned lidar data.\n","authors":["Unay Dorken Gallastegi","Hoover Rueda-Chacon","Martin J. Stevens","Vivek K Goyal"],"pdf_url":"https://arxiv.org/pdf/2308.05818v1.pdf","comment":"15 pages, 14 figures"},{"id":"http://arxiv.org/abs/2201.09201v2","updated":"2023-08-10T18:34:17Z","published":"2022-01-23T07:18:55Z","title":"Vision-Based UAV Self-Positioning in Low-Altitude Urban Environments","summary":" Unmanned Aerial Vehicles (UAVs) rely on satellite systems for stable\npositioning. However, due to limited satellite coverage or communication\ndisruptions, UAVs may lose signals from satellite-based positioning systems. In\nsuch situations, vision-based techniques can serve as an alternative, ensuring\nthe self-positioning capability of UAVs. However, most of the existing datasets\nare developed for the geo-localization tasks of the objects identified by UAVs,\nrather than the self-positioning task of UAVs. Furthermore, the current UAV\ndatasets use discrete sampling on synthetic data, such as Google Maps, thereby\nneglecting the crucial aspects of dense sampling and the uncertainties commonly\nexperienced in real-world scenarios. To address these issues, this paper\npresents a new dataset, DenseUAV, which is the first publicly available dataset\ndesigned for the UAV self-positioning task. DenseUAV adopts dense sampling on\nUAV images obtained in low-altitude urban settings. In total, over 27K UAV-view\nand satellite-view images of 14 university campuses are collected and\nannotated, establishing a new benchmark. In terms of model development, we\nfirst verify the superiority of Transformers over CNNs in this task. Then, we\nincorporate metric learning into representation learning to enhance the\ndiscriminative capacity of the model and to lessen the modality discrepancy.\nBesides, to facilitate joint learning from both perspectives, we propose a\nmutually supervised learning approach. Last, we enhance the Recall@K metric and\nintroduce a new measurement, SDM@K, to evaluate the performance of a trained\nmodel from both the retrieval and localization perspectives simultaneously. As\na result, the proposed baseline method achieves a remarkable Recall@1 score of\n83.05% and an SDM@1 score of 86.24% on DenseUAV. 
The dataset and code will be\nmade publicly available on https://github.com/Dmmm1997/DenseUAV.\n","authors":["Ming Dai","Enhui Zheng","Zhenhua Feng","Jiedong Zhuang","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2201.09201v2.pdf","comment":"13 pages,8 figures"},{"id":"http://arxiv.org/abs/2308.05810v1","updated":"2023-08-10T18:09:44Z","published":"2023-08-10T18:09:44Z","title":"Spintronics for image recognition : performance benchmarking via\n ultrafast data-driven simulations","summary":" We present a demonstration of image classification using a hardware-based\necho-state network (ESN) that relies on spintronic nanostructures known as\nvortex-based spin-torque oscillators (STVOs). Our network is realized using a\nsingle STVO multiplexed in time. To circumvent the challenges associated with\nrepeated experimental manipulation of such a nanostructured system, we employ\nan ultrafast data-driven simulation framework called the data-driven Thiele\nequation approach (DD-TEA) to simulate the STVO dynamics. We use this approach\nto efficiently develop, optimize and test an STVO-based ESN for image\nclassification using the MNIST dataset. We showcase the versatility of our\nsolution by successfully applying it to solve classification challenges with\nthe EMNIST-letters and Fashion MNIST datasets. Through our simulations, we\ndetermine that within a large ESN the results obtained using the STVO dynamics\nas an activation function are comparable to the ones obtained with other\nconventional nonlinear activation functions like the reLU and the sigmoid.\nWhile achieving state-of-the-art accuracy levels on the MNIST dataset, our\nmodel's performance on EMNIST-letters and Fashion MNIST is lower due to the\nrelative simplicity of the system architecture and the increased complexity of\nthe tasks. We expect that the DD-TEA framework will enable the exploration of\nmore specialized neural architectures, ultimately leading to improved\nclassification accuracy. This approach also holds promise for investigating and\ndeveloping dedicated learning rules to further enhance classification\nperformance.\n","authors":["Anatole Moureaux","Chloé Chopin","Laurent Jacques","Flavio Abreu Araujo"],"pdf_url":"https://arxiv.org/pdf/2308.05810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05787v1","updated":"2023-08-10T17:35:47Z","published":"2023-08-10T17:35:47Z","title":"Temporally-Adaptive Models for Efficient Video Understanding","summary":" Spatial convolutions are extensively used in numerous deep video models. It\nfundamentally assumes spatio-temporal invariance, i.e., using shared weights\nfor every location in different frames. This work presents Temporally-Adaptive\nConvolutions (TAdaConv) for video understanding, which shows that adaptive\nweight calibration along the temporal dimension is an efficient way to\nfacilitate modeling complex temporal dynamics in videos. Specifically, TAdaConv\nempowers spatial convolutions with temporal modeling abilities by calibrating\nthe convolution weights for each frame according to its local and global\ntemporal context. Compared to existing operations for temporal modeling,\nTAdaConv is more efficient as it operates over the convolution kernels instead\nof the features, whose dimension is an order of magnitude smaller than the\nspatial resolutions. Further, kernel calibration brings an increased model\ncapacity. 
Based on this readily plug-in operation TAdaConv as well as its\nextension, i.e., TAdaConvV2, we construct TAdaBlocks to empower ConvNeXt and\nVision Transformer to have strong temporal modeling capabilities. Empirical\nresults show TAdaConvNeXtV2 and TAdaFormer perform competitively against\nstate-of-the-art convolutional and Transformer-based models in various video\nunderstanding benchmarks. Our codes and models are released at:\nhttps://github.com/alibaba-mmai-research/TAdaConv.\n","authors":["Ziyuan Huang","Shiwei Zhang","Liang Pan","Zhiwu Qing","Yingya Zhang","Ziwei Liu","Marcelo H. Ang Jr"],"pdf_url":"https://arxiv.org/pdf/2308.05787v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2110.06178"},{"id":"http://arxiv.org/abs/2308.05785v1","updated":"2023-08-10T16:44:24Z","published":"2023-08-10T16:44:24Z","title":"Leverage Weakly Annotation to Pixel-wise Annotation via Zero-shot\n Segment Anything Model for Molecular-empowered Learning","summary":" Precise identification of multiple cell classes in high-resolution Giga-pixel\nwhole slide imaging (WSI) is critical for various clinical scenarios. Building\nan AI model for this purpose typically requires pixel-level annotations, which\nare often unscalable and must be done by skilled domain experts (e.g.,\npathologists). However, these annotations can be prone to errors, especially\nwhen distinguishing between intricate cell types (e.g., podocytes and mesangial\ncells) using only visual inspection. Interestingly, a recent study showed that\nlay annotators, when using extra immunofluorescence (IF) images for reference\n(referred to as molecular-empowered learning), can sometimes outperform domain\nexperts in labeling. Despite this, the resource-intensive task of manual\ndelineation remains a necessity during the annotation process. In this paper,\nwe explore the potential of bypassing pixel-level delineation by employing the\nrecent segment anything model (SAM) on weak box annotation in a zero-shot\nlearning approach. Specifically, we harness SAM's ability to produce\npixel-level annotations from box annotations and utilize these SAM-generated\nlabels to train a segmentation model. Our findings show that the proposed\nSAM-assisted molecular-empowered learning (SAM-L) can diminish the labeling\nefforts for lay annotators by only requiring weak box annotations. This is\nachieved without compromising annotation accuracy or the performance of the\ndeep learning-based segmentation. This research represents a significant\nadvancement in democratizing the annotation process for training pathological\nimage segmentation, relying solely on non-expert annotators.\n","authors":["Xueyuan Li","Ruining Deng","Yucheng Tang","Shunxing Bao","Haichun Yang","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2308.05785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05784v1","updated":"2023-08-10T16:33:59Z","published":"2023-08-10T16:33:59Z","title":"High-performance Data Management for Whole Slide Image Analysis in\n Digital Pathology","summary":" When dealing with giga-pixel digital pathology in whole-slide imaging, a\nnotable proportion of data records holds relevance during each analysis\noperation. For instance, when deploying an image analysis algorithm on\nwhole-slide images (WSI), the computational bottleneck often lies in the\ninput-output (I/O) system. 
This is particularly notable as patch-level\nprocessing introduces a considerable I/O load onto the computer system.\nHowever, this data management process can be potentially further paralleled,\ngiven the typical independence of patch-level image processes across different\npatches. This paper details our endeavors in tackling this data access\nchallenge through the implementation of the Adaptable IO System version 2\n(ADIOS2). Our focus has been on constructing and releasing a digital\npathology-centric pipeline using ADIOS2, which facilitates streamlined data\nmanagement across WSIs. Additionally, we've developed strategies aimed at\ncurtailing data retrieval times. The performance evaluation encompasses two key\nscenarios: (1) a pure CPU-based image analysis scenario (termed the \"CPU\nscenario\"), and (2) a GPU-based deep learning framework scenario (referred to\nas the \"GPU scenario\"). Our findings reveal noteworthy outcomes. Under the CPU\nscenario, ADIOS2 showcases an impressive two-fold speed-up in comparison to the\nbrute-force approach. In the GPU scenario, its performance stands on par with\nthe cutting-edge GPU I/O acceleration framework, NVIDIA Magnum IO GPU Direct\nStorage (GDS). From what we know, this appears to be among the initial\ninstances, if any, of utilizing ADIOS2 within the field of digital pathology.\nThe source code has been made publicly available at\nhttps://github.com/hrlblab/adios.\n","authors":["Haoju Leng","Ruining Deng","Shunxing Bao","Dazheng Fang","Bryan A. Millis","Yucheng Tang","Haichun Yang","Lipeng Wan","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2308.05784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05782v1","updated":"2023-08-10T16:26:03Z","published":"2023-08-10T16:26:03Z","title":"Multi-scale Multi-site Renal Microvascular Structures Segmentation for\n Whole Slide Imaging in Renal Pathology","summary":" Segmentation of microvascular structures, such as arterioles, venules, and\ncapillaries, from human kidney whole slide images (WSI) has become a focal\npoint in renal pathology. Current manual segmentation techniques are\ntime-consuming and not feasible for large-scale digital pathology images. While\ndeep learning-based methods offer a solution for automatic segmentation, most\nsuffer from a limitation: they are designed for and restricted to training on\nsingle-site, single-scale data. In this paper, we present Omni-Seg, a novel\nsingle dynamic network method that capitalizes on multi-site, multi-scale\ntraining data. Unique to our approach, we utilize partially labeled images,\nwhere only one tissue type is labeled per training image, to segment\nmicrovascular structures. We train a singular deep network using images from\ntwo datasets, HuBMAP and NEPTUNE, across different magnifications (40x, 20x,\n10x, and 5x). Experimental results indicate that Omni-Seg outperforms in terms\nof both the Dice Similarity Coefficient (DSC) and Intersection over Union\n(IoU). 
Our proposed method provides renal pathologists with a powerful\ncomputational tool for the quantitative analysis of renal microvascular\nstructures.\n","authors":["Franklin Hu","Ruining Deng","Shunxing Bao","Haichun Yang","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2308.05782v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.05697v1","updated":"2023-08-10T16:59:36Z","published":"2023-08-10T16:59:36Z","title":"SSLRec: A Self-Supervised Learning Library for Recommendation","summary":" Self-supervised learning (SSL) has gained significant interest in recent\nyears as a solution to address the challenges posed by sparse and noisy data in\nrecommender systems. Despite the growing number of SSL algorithms designed to\nprovide state-of-the-art performance in various recommendation scenarios (e.g.,\ngraph collaborative filtering, sequential recommendation, social\nrecommendation, KG-enhanced recommendation), there is still a lack of unified\nframeworks that integrate recommendation algorithms across different domains.\nSuch a framework could serve as the cornerstone for self-supervised\nrecommendation algorithms, unifying the validation of existing methods and\ndriving the design of new ones. To address this gap, we introduce SSLRec, a\nnovel benchmark platform that provides a standardized, flexible, and\ncomprehensive framework for evaluating various SSL-enhanced recommenders. The\nSSLRec library features a modular architecture that allows users to easily\nevaluate state-of-the-art models and a complete set of data augmentation and\nself-supervised toolkits to help create SSL recommendation models with specific\nneeds. Furthermore, SSLRec simplifies the process of training and evaluating\ndifferent recommendation models with consistent and fair settings. Our SSLRec\nplatform covers a comprehensive set of state-of-the-art SSL-enhanced\nrecommendation models across different scenarios, enabling researchers to\nevaluate these cutting-edge models and drive further innovation in the field.\nOur implemented SSLRec framework is available at the source code repository\nhttps://github.com/HKUDS/SSLRec.\n","authors":["Xubin Ren","Lianghao Xia","Yuhao Yang","Wei Wei","Tianle Wang","Xuheng Cai","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.05697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05680v1","updated":"2023-08-10T16:33:17Z","published":"2023-08-10T16:33:17Z","title":"Finding Already Debunked Narratives via Multistage Retrieval: Enabling\n Cross-Lingual, Cross-Dataset and Zero-Shot Learning","summary":" The task of retrieving already debunked narratives aims to detect stories\nthat have already been fact-checked. The successful detection of claims that\nhave already been debunked not only reduces the manual efforts of professional\nfact-checkers but can also contribute to slowing the spread of misinformation.\nMainly due to the lack of readily available data, this is an understudied\nproblem, particularly when considering the cross-lingual task, i.e. the\nretrieval of fact-checking articles in a language different from the language\nof the online post being checked. 
This paper fills this gap by (i) creating a\nnovel dataset to enable research on cross-lingual retrieval of already debunked\nnarratives, using tweets as queries to a database of fact-checking articles;\n(ii) presenting an extensive experiment to benchmark fine-tuned and\noff-the-shelf multilingual pre-trained Transformer models for this task; and\n(iii) proposing a novel multistage framework that divides this cross-lingual\ndebunk retrieval task into refinement and re-ranking stages. Results show that\nthe task of cross-lingual retrieval of already debunked narratives is\nchallenging and off-the-shelf Transformer models fail to outperform a strong\nlexical-based baseline (BM25). Nevertheless, our multistage retrieval framework\nis robust, outperforming BM25 in most scenarios and enabling cross-domain and\nzero-shot learning, without significantly harming the model's performance.\n","authors":["Iknoor Singh","Carolina Scarton","Xingyi Song","Kalina Bontcheva"],"pdf_url":"https://arxiv.org/pdf/2308.05680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05609v1","updated":"2023-08-10T14:41:17Z","published":"2023-08-10T14:41:17Z","title":"LASIGE and UNICAGE solution to the NASA LitCoin NLP Competition","summary":" Biomedical Natural Language Processing (NLP) tends to become cumbersome for\nmost researchers, frequently due to the amount and heterogeneity of text to be\nprocessed. To address this challenge, the industry is continuously developing\nhighly efficient tools and creating more flexible engineering solutions. This\nwork presents the integration between industry data engineering solutions for\nefficient data processing and academic systems developed for Named Entity\nRecognition (LasigeUnicage\\_NER) and Relation Extraction (BiOnt). Our design\nreflects an integration of those components with external knowledge in the form\nof additional training data from other datasets and biomedical ontologies. We\nused this pipeline in the 2022 LitCoin NLP Challenge, where our team\nLasigeUnicage was awarded the 7th Prize out of approximately 200 participating\nteams, reflecting a successful collaboration between the academia (LASIGE) and\nthe industry (Unicage). The software supporting this work is available at\n\\url{https://github.com/lasigeBioTM/Litcoin-Lasige_Unicage}.\n","authors":["Pedro Ruas","Diana F. Sousa","André Neves","Carlos Cruz","Francisco M. Couto"],"pdf_url":"https://arxiv.org/pdf/2308.05609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05508v1","updated":"2023-08-10T11:41:34Z","published":"2023-08-10T11:41:34Z","title":"Multi-domain Recommendation with Embedding Disentangling and Domain\n Alignment","summary":" Multi-domain recommendation (MDR) aims to provide recommendations for\ndifferent domains (e.g., types of products) with overlapping users/items and is\ncommon for platforms such as Amazon, Facebook, and LinkedIn that host multiple\nservices. Existing MDR models face two challenges: First, it is difficult to\ndisentangle knowledge that generalizes across domains (e.g., a user likes cheap\nitems) and knowledge specific to a single domain (e.g., a user likes blue\nclothing but not blue cars). Second, they have limited ability to transfer\nknowledge across domains with small overlaps. We propose a new MDR method named\nEDDA with two key components, i.e., embedding disentangling recommender and\ndomain alignment, to tackle the two challenges respectively. 
In particular, the\nembedding disentangling recommender separates both the model and embedding for\nthe inter-domain part and the intra-domain part, while most existing MDR\nmethods only focus on model-level disentangling. The domain alignment leverages\nrandom walks from graph processing to identify similar user/item pairs from\ndifferent domains and encourages similar user/item pairs to have similar\nembeddings, enhancing knowledge transfer. We compare EDDA with 12\nstate-of-the-art baselines on 3 real datasets. The results show that EDDA\nconsistently outperforms the baselines on all datasets and domains. All\ndatasets and codes are available at https://github.com/Stevenn9981/EDDA.\n","authors":["Wentao Ning","Xiao Yan","Weiwen Liu","Reynold Cheng","Rui Zhang","Bo Tang"],"pdf_url":"https://arxiv.org/pdf/2308.05508v1.pdf","comment":"Accepted by CIKM'23"},{"id":"http://arxiv.org/abs/2308.05502v1","updated":"2023-08-10T11:14:22Z","published":"2023-08-10T11:14:22Z","title":"Bringing order into the realm of Transformer-based language models for\n artificial intelligence and law","summary":" Transformer-based language models (TLMs) have widely been recognized to be a\ncutting-edge technology for the successful development of deep-learning-based\nsolutions to problems and applications that require natural language processing\nand understanding. Like for other textual domains, TLMs have indeed pushed the\nstate-of-the-art of AI approaches for many tasks of interest in the legal\ndomain. Despite the first Transformer model being proposed about six years ago,\nthere has been a rapid progress of this technology at an unprecedented rate,\nwhereby BERT and related models represent a major reference, also in the legal\ndomain. This article provides the first systematic overview of TLM-based\nmethods for AI-driven problems and tasks in the legal sphere. A major goal is\nto highlight research advances in this field so as to understand, on the one\nhand, how the Transformers have contributed to the success of AI in supporting\nlegal processes, and on the other hand, what are the current limitations and\nopportunities for further research development.\n","authors":["Candida M. Greco","Andrea Tagarelli"],"pdf_url":"https://arxiv.org/pdf/2308.05502v1.pdf","comment":"Accepted for publication with Artificial Intelligence and Law,\n Springer Nature"},{"id":"http://arxiv.org/abs/2304.03531v2","updated":"2023-08-10T10:52:39Z","published":"2023-04-07T08:09:50Z","title":"From Retrieval to Generation: Efficient and Effective Entity Set\n Expansion","summary":" Entity Set Expansion (ESE) is a critical task aiming to expand entities of\nthe target semantic class described by a small seed entity set. Most existing\nESE methods are retrieval-based frameworks that need to extract the contextual\nfeatures of entities and calculate the similarity between seed entities and\ncandidate entities. To achieve the two purposes, they should iteratively\ntraverse the corpus and the entity vocabulary provided in the datasets,\nresulting in poor efficiency and scalability. The experimental results indicate\nthat the time consumed by the retrieval-based ESE methods increases linearly\nwith entity vocabulary and corpus size. 
In this paper, we firstly propose a\ngenerative ESE framework, Generative Entity Set Expansion (GenExpan), which\nutilizes a generative pre-trained language model to accomplish ESE task.\nSpecifically, a prefix tree is employed to guarantee the validity of entity\ngeneration, and automatically generated class names are adopted to guide the\nmodel to generate target entities. Moreover, we propose Knowledge Calibration\nand Generative Ranking to further bridge the gap between generic knowledge of\nthe language model and the goal of ESE task. Experiments on publicly available\ndatasets show that GenExpan is efficient and effective. For efficiency,\nexpansion time consumed by GenExpan is independent of entity vocabulary and\ncorpus size, and GenExpan achieves an average 600% speedup compared to strong\nbaselines. For expansion performance, our framework outperforms previous\nstate-of-the-art ESE methods.\n","authors":["Shulin Huang","Shirong Ma","Yangning Li","Yinghui Li","Hai-Tao Zheng","Yong Jiang","Hong-Gee Kim"],"pdf_url":"https://arxiv.org/pdf/2304.03531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05390v1","updated":"2023-08-10T07:09:13Z","published":"2023-08-10T07:09:13Z","title":"Product Review Image Ranking for Fashion E-commerce","summary":" In a fashion e-commerce platform where customers can't physically examine the\nproducts on their own, being able to see other customers' text and image\nreviews of the product is critical while making purchase decisions. Given the\nhigh reliance on these reviews, over the years we have observed customers\nproactively sharing their reviews. With an increase in the coverage of User\nGenerated Content (UGC), there has been a corresponding increase in the number\nof customer images. It is thus imperative to display the most relevant images\non top as it may influence users' online shopping choices and behavior. In this\npaper, we propose a simple yet effective training procedure for ranking\ncustomer images. We created a dataset consisting of Myntra (A Major Indian\nFashion e-commerce company) studio posts and highly engaged (upvotes/downvotes)\nUGC images as our starting point and used selected distortion techniques on the\nimages of the above dataset to bring their quality at par with those of bad UGC\nimages. We train our network to rank bad-quality images lower than high-quality\nones. Our proposed method outperforms the baseline models on two metrics,\nnamely correlation coefficient, and accuracy, by substantial margins.\n","authors":["Sangeet Jaiswal","Dhruv Patel","Sreekanth Vempati","Konduru Saiswaroop"],"pdf_url":"https://arxiv.org/pdf/2308.05390v1.pdf","comment":"Accepted in Proceedings of ACM SIGIR Workshop on eCommerce (SIGIR\n eCom'22)"},{"id":"http://arxiv.org/abs/2308.05379v1","updated":"2023-08-10T06:52:53Z","published":"2023-08-10T06:52:53Z","title":"Beyond Semantics: Learning a Behavior Augmented Relevance Model with\n Self-supervised Learning","summary":" Relevance modeling aims to locate desirable items for corresponding queries,\nwhich is crucial for search engines to ensure user experience. Although most\nconventional approaches address this problem by assessing the semantic\nsimilarity between the query and item, pure semantic matching is not\neverything. In reality, auxiliary query-item interactions extracted from user\nhistorical behavior data of the search log could provide hints to reveal users'\nsearch intents further. 
Drawing inspiration from this, we devise a novel\nBehavior Augmented Relevance Learning model for Alipay Search (BARL-ASe) that\nleverages neighbor queries of target item and neighbor items of target query to\ncomplement target query-item semantic matching. Specifically, our model builds\nmulti-level co-attention for distilling coarse-grained and fine-grained\nsemantic representations from both neighbor and target views. The model\nsubsequently employs neighbor-target self-supervised learning to improve the\naccuracy and robustness of BARL-ASe by strengthening representation and logit\nlearning. Furthermore, we discuss how to deal with the long-tail query-item\nmatching of the mini apps search scenario of Alipay practically. Experiments on\nreal-world industry data and online A/B testing demonstrate our proposal\nachieves promising performance with low latency.\n","authors":["Zeyuan Chen","Wei Chen","Jia Xu","Zhongyi Liu","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.05379v1.pdf","comment":"CIKM2023"},{"id":"http://arxiv.org/abs/2303.11174v2","updated":"2023-08-10T05:40:01Z","published":"2023-03-14T01:58:04Z","title":"Metric Search for Rank List Compatibility Matching with Applications","summary":" As online dating has become more popular in the past few years, an efficient\nand effective algorithm to match users is needed. In this project, we proposed\na new dating matching algorithm that uses Kendall-Tau distance to measure the\nsimilarity between users based on their ranking for items in a list. (e.g.,\ntheir favourite sports, music, etc.) To increase the performance of the search\nprocess, we applied a tree-based searching structure, Cascading Metric Tree\n(CMT), on this metric. The tree is built on ranked lists from all the users;\nwhen a query target and a radius are provided, our algorithm can return users\nwithin the radius of the target. We tested the scaling of this searching method\non a synthetic dataset by varying list length, population size, and query\nradius. We observed that the algorithm is able to query the best matching\npeople for the user in a practical time, given reasonable parameters. We also\nprovided potential future improvements that can be made to this algorithm based\non the limitations. Finally, we offered more use cases of this search structure\non Kendall-Tau distance and new insight into real-world applications of\ndistance search structures.\n","authors":["Wenqi Guo","Jeffrey Uhlmann"],"pdf_url":"https://arxiv.org/pdf/2303.11174v2.pdf","comment":"Paper for 2023 Multidisciplinary Undergraduate Research Conference\n (MURC)"},{"id":"http://arxiv.org/abs/2308.05013v2","updated":"2023-08-10T02:30:44Z","published":"2023-08-09T15:11:46Z","title":"Dual Intents Graph Modeling for User-centric Group Discovery","summary":" Online groups have become increasingly prevalent, providing users with space\nto share experiences and explore interests. Therefore, user-centric group\ndiscovery task, i.e., recommending groups to users can help both users' online\nexperiences and platforms' long-term developments. Existing recommender methods\ncan not deal with this task as modeling user-group participation into a\nbipartite graph overlooks their item-side interests. 
Although there exist a few\nworks attempting to address this task, they still fall short in fully\npreserving the social context and ensuring effective interest representation\nlearning.\n In this paper, we focus on exploring the intents that motivate users to\nparticipate in groups, which can be categorized into different types, like the\nsocial-intent and the personal interest-intent. The former refers to users\njoining a group affected by their social links, while the latter relates to\nusers joining groups with like-minded people for self-enjoyment. To comprehend\ndifferent intents, we propose a novel model, DiRec, that first models each\nintent separately and then fuses them together for predictions. Specifically,\nfor social-intent, we introduce the hypergraph structure to model the\nrelationship between groups and members, leading to a richer understanding of\nthe social context. As for interest-intent, we employ novel structural\nrefinement on the interactive graph to uncover more intricate user behaviors\nand group interests, realizing better representation learning of interests.\nFurthermore, we also observe the intent overlapping in real-world scenarios and\ndevise a novel self-supervised learning loss that encourages such alignment for\nfinal recommendations. Extensive experiments on three public datasets show the\nsignificant improvement of DiRec over the state-of-the-art methods.\n","authors":["Xixi Wu","Yun Xiong","Yao Zhang","Yizhu Jiao","Jiawei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.05013v2.pdf","comment":"Accepted by CIKM'23 as Long Paper"},{"id":"http://arxiv.org/abs/2308.05281v1","updated":"2023-08-10T01:51:33Z","published":"2023-08-10T01:51:33Z","title":"Investigating disaster response through social media data and the\n Susceptible-Infected-Recovered (SIR) model: A case study of 2020 Western U.S.\n wildfire season","summary":" Effective disaster response is critical for affected communities. Responders\nand decision-makers would benefit from reliable, timely measures of the issues\nimpacting their communities during a disaster, and social media offers a\npotentially rich data source. Social media can reflect public concerns and\ndemands during a disaster, offering valuable insights for decision-makers to\nunderstand evolving situations and optimize resource allocation. We used\nBidirectional Encoder Representations from Transformers (BERT) topic modeling\nto cluster topics from Twitter data. Then, we conducted a temporal-spatial\nanalysis to examine the distribution of these topics across different regions\nduring the 2020 western U.S. wildfire season. Our results show that Twitter\nusers mainly focused on three topics:\"health impact,\" \"damage,\" and\n\"evacuation.\" We used the Susceptible-Infected-Recovered (SIR) theory to\nexplore the magnitude and velocity of topic diffusion on Twitter. The results\ndisplayed a clear relationship between topic trends and wildfire propagation\npatterns. The estimated parameters obtained from the SIR model in selected\ncities revealed that residents exhibited a high level of several concerns\nduring the wildfire. Our study details how the SIR model and topic modeling\nusing social media data can provide decision-makers with a quantitative\napproach to measure disaster response and support their decision-making\nprocesses.\n","authors":["Zihui Ma","Lingyao Li","Libby Hemphill","Gregory B. 
Baecher"],"pdf_url":"https://arxiv.org/pdf/2308.05281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02542v2","updated":"2023-08-10T20:55:47Z","published":"2023-08-01T15:14:23Z","title":"Collaborative filtering to capture AI user's preferences as norms","summary":" Customising AI technologies to each user's preferences is fundamental to them\nfunctioning well. Unfortunately, current methods require too much user\ninvolvement and fail to capture their true preferences. In fact, to avoid the\nnuisance of manually setting preferences, users usually accept the default\nsettings even if these do not conform to their true preferences. Norms can be\nuseful to regulate behaviour and ensure it adheres to user preferences but,\nwhile the literature has thoroughly studied norms, most proposals take a formal\nperspective. Indeed, while there has been some research on constructing norms\nto capture a user's privacy preferences, these methods rely on domain knowledge\nwhich, in the case of AI technologies, is difficult to obtain and maintain. We\nargue that a new perspective is required when constructing norms, which is to\nexploit the large amount of preference information readily available from whole\nsystems of users. Inspired by recommender systems, we believe that\ncollaborative filtering can offer a suitable approach to identifying a user's\nnorm preferences without excessive user involvement.\n","authors":["Marc Serramia","Natalia Criado","Michael Luck"],"pdf_url":"https://arxiv.org/pdf/2308.02542v2.pdf","comment":"Accepted manuscript at the 24th International Conference on\n Principles and Practice of Multi-Agent Systems (PRIMA 2022)"},{"id":"http://arxiv.org/abs/2308.08446v1","updated":"2023-08-10T19:53:30Z","published":"2023-08-10T19:53:30Z","title":"CSPM: A Contrastive Spatiotemporal Preference Model for CTR Prediction\n in On-Demand Food Delivery Services","summary":" Click-through rate (CTR) prediction is a crucial task in the context of an\nonline on-demand food delivery (OFD) platform for precisely estimating the\nprobability of a user clicking on food items. Unlike universal e-commerce\nplatforms such as Taobao and Amazon, user behaviors and interests on the OFD\nplatform are more location and time-sensitive due to limited delivery ranges\nand regional commodity supplies. However, existing CTR prediction algorithms in\nOFD scenarios concentrate on capturing interest from historical behavior\nsequences, which fails to effectively model the complex spatiotemporal\ninformation within features, leading to poor performance. To address this\nchallenge, this paper introduces the Contrastive Spatiotemporal Preference Model\n(CSPM), which learns user preferences under different search states using three\nmodules: contrastive spatiotemporal representation learning\n(CSRL), spatiotemporal preference extractor (StPE), and spatiotemporal\ninformation filter (StIF). CSRL utilizes a contrastive learning framework to\ngenerate a spatiotemporal activation representation (SAR) for the search\naction. StPE employs SAR to activate users' diverse preferences related to\nlocation and time from the historical behavior sequence field, using a\nmulti-head attention mechanism. StIF incorporates SAR into a gating network to\nautomatically capture important features with latent spatiotemporal effects.\nExtensive experiments conducted on two large-scale industrial datasets\ndemonstrate the state-of-the-art performance of CSPM. 
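The collaborative-filtering perspective proposed in the norms paper above can be made concrete with a small user-based filtering sketch. Everything here is an illustrative assumption rather than the authors' method: users are rows of a preference matrix, unset preferences are NaN, and a missing preference is predicted from the preferences of similar users.

```python
# Minimal user-based collaborative filtering sketch (illustrative assumptions:
# rows = users, columns = preference settings, NaN = preference not yet set).
import numpy as np

def predict_preference(prefs, user, setting):
    """Predict prefs[user, setting] from users who did set that preference."""
    target = prefs[user]
    scores, weights = [], []
    for other in range(prefs.shape[0]):
        if other == user or np.isnan(prefs[other, setting]):
            continue
        # cosine similarity over the settings both users have filled in
        mask = ~np.isnan(target) & ~np.isnan(prefs[other])
        if not mask.any():
            continue
        u, v = target[mask], prefs[other][mask]
        sim = u @ v / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-9)
        scores.append(sim * prefs[other, setting])
        weights.append(abs(sim))
    return sum(scores) / (sum(weights) + 1e-9)

prefs = np.array([[1.0, 0.0, np.nan],
                  [1.0, 0.0, 1.0],
                  [0.0, 1.0, 0.0]])
print(predict_preference(prefs, user=0, setting=2))  # close to 1.0
```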
Notably, CSPM has been\nsuccessfully deployed in Alibaba's online OFD platform Ele.me, resulting in a\nsignificant 0.88% lift in CTR, which has substantial business implications.\n","authors":["Guyu Jiang","Xiaoyun Li","Rongrong Jing","Ruoqi Zhao","Xingliang Ni","Guodong Cao","Ning Hu"],"pdf_url":"https://arxiv.org/pdf/2308.08446v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.05741v1","updated":"2023-08-10T17:58:02Z","published":"2023-08-10T17:58:02Z","title":"Neural Progressive Meshes","summary":" The recent proliferation of 3D content that can be consumed on hand-held\ndevices necessitates efficient tools for transmitting large geometric data,\ne.g., 3D meshes, over the Internet. Detailed high-resolution assets can pose a\nchallenge to storage as well as transmission bandwidth, and level-of-detail\ntechniques are often used to transmit an asset using an appropriate bandwidth\nbudget. It is especially desirable for these methods to transmit data\nprogressively, improving the quality of the geometry with more data. Our key\ninsight is that the geometric details of 3D meshes often exhibit similar local\npatterns even across different shapes, and thus can be effectively represented\nwith a shared learned generative space. We learn this space using a\nsubdivision-based encoder-decoder architecture trained in advance on a large\ncollection of surfaces. We further observe that additional residual features\ncan be transmitted progressively between intermediate levels of subdivision\nthat enable the client to control the tradeoff between bandwidth cost and\nquality of reconstruction, providing a neural progressive mesh representation.\nWe evaluate our method on a diverse set of complex 3D shapes and demonstrate\nthat it outperforms baselines in terms of compression ratio and reconstruction\nquality.\n","authors":["Yun-Chun Chen","Vladimir G. Kim","Noam Aigerman","Alec Jacobson"],"pdf_url":"https://arxiv.org/pdf/2308.05741v1.pdf","comment":"SIGGRAPH 2023"},{"id":"http://arxiv.org/abs/2308.05739v1","updated":"2023-08-10T17:57:22Z","published":"2023-08-10T17:57:22Z","title":"Zero Grads Ever Given: Learning Local Surrogate Losses for\n Non-Differentiable Graphics","summary":" Gradient-based optimization is now ubiquitous across graphics, but\nunfortunately can not be applied to problems with undefined or zero gradients.\nTo circumvent this issue, the loss function can be manually replaced by a\n\"surrogate\" that has similar minima but is differentiable. Our proposed\nframework, ZeroGrads, automates this process by learning a neural approximation\nof the objective function, the surrogate, which in turn can be used to\ndifferentiate through arbitrary black-box graphics pipelines. We train the\nsurrogate on an actively smoothed version of the objective and encourage\nlocality, focusing the surrogate's capacity on what matters at the current\ntraining episode. The fitting is performed online, alongside the parameter\noptimization, and self-supervised, without pre-computed data or pre-trained\nmodels. As sampling the objective is expensive (it requires a full rendering or\nsimulator run), we devise an efficient sampling scheme that allows for\ntractable run-times and competitive performance at little overhead. 
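The surrogate-learning loop that ZeroGrads describes (fit a local neural approximation of the black-box objective online, then differentiate through it) can be sketched compactly. The code below is a simplified toy setup under assumed choices (a scalar non-differentiable objective, a small MLP surrogate, Gaussian sampling around the current parameters); it illustrates the idea rather than reproducing the paper's implementation.

```python
# Simplified sketch of optimizing a non-differentiable objective through a
# learned local surrogate (illustrative; not the ZeroGrads implementation).
import torch

def black_box_loss(theta):        # assumed non-differentiable (step-wise) objective
    return float((torch.round(theta * 10) / 10 - 0.7).pow(2).sum())

theta = torch.zeros(2, requires_grad=True)
surrogate = torch.nn.Sequential(torch.nn.Linear(2, 64), torch.nn.ReLU(),
                                torch.nn.Linear(64, 1))
opt_theta = torch.optim.Adam([theta], lr=1e-2)
opt_surr = torch.optim.Adam(surrogate.parameters(), lr=1e-3)

for step in range(2000):
    # 1) fit the surrogate locally around the current parameters
    samples = theta.detach() + 0.1 * torch.randn(32, 2)
    targets = torch.tensor([[black_box_loss(s)] for s in samples])
    opt_surr.zero_grad()
    torch.nn.functional.mse_loss(surrogate(samples), targets).backward()
    opt_surr.step()
    # 2) update the parameters using gradients of the surrogate instead
    opt_theta.zero_grad()
    surrogate(theta.unsqueeze(0)).sum().backward()
    opt_theta.step()

# theta should drift toward roughly 0.7 in both coordinates
print(theta.detach(), black_box_loss(theta.detach()))
```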
We\ndemonstrate optimizing diverse non-convex, non-differentiable black-box\nproblems in graphics, such as visibility in rendering, discrete parameter\nspaces in procedural modelling or optimal control in physics-driven animation.\nIn contrast to more traditional algorithms, our approach scales well to higher\ndimensions, which we demonstrate on problems with up to 35k interlinked\nvariables.\n","authors":["Michael Fischer","Tobias Ritschel"],"pdf_url":"https://arxiv.org/pdf/2308.05739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05737v1","updated":"2023-08-10T17:57:06Z","published":"2023-08-10T17:57:06Z","title":"Follow Anything: Open-set detection, tracking, and following in\n real-time","summary":" Tracking and following objects of interest is critical to several robotics\nuse cases, ranging from industrial automation to logistics and warehousing, to\nhealthcare and security. In this paper, we present a robotic system to detect,\ntrack, and follow any object in real-time. Our approach, dubbed ``follow\nanything'' (FAn), is an open-vocabulary and multimodal model -- it is not\nrestricted to concepts seen at training time and can be applied to novel\nclasses at inference time using text, images, or click queries. Leveraging rich\nvisual descriptors from large-scale pre-trained models (foundation models), FAn\ncan detect and segment objects by matching multimodal queries (text, images,\nclicks) against an input image sequence. These detected and segmented objects\nare tracked across image frames, all while accounting for occlusion and object\nre-emergence. We demonstrate FAn on a real-world robotic system (a micro aerial\nvehicle) and report its ability to seamlessly follow the objects of interest in\na real-time control loop. FAn can be deployed on a laptop with a lightweight\n(6-8 GB) graphics card, achieving a throughput of 6-20 frames per second. To\nenable rapid adoption, deployment, and extensibility, we open-source all our\ncode on our project webpage at https://github.com/alaamaalouf/FollowAnything .\nWe also encourage the reader the watch our 5-minutes explainer video in this\nhttps://www.youtube.com/watch?v=6Mgt3EPytrw .\n","authors":["Alaa Maalouf","Ninad Jadhav","Krishna Murthy Jatavallabhula","Makram Chahine","Daniel M. Vogt","Robert J. Wood","Antonio Torralba","Daniela Rus"],"pdf_url":"https://arxiv.org/pdf/2308.05737v1.pdf","comment":"Project webpage: https://github.com/alaamaalouf/FollowAnything\n Explainer video: https://www.youtube.com/watch?v=6Mgt3EPytrw"},{"id":"http://arxiv.org/abs/2308.05732v1","updated":"2023-08-10T17:53:05Z","published":"2023-08-10T17:53:05Z","title":"PDE-Refiner: Achieving Accurate Long Rollouts with Neural PDE Solvers","summary":" Time-dependent partial differential equations (PDEs) are ubiquitous in\nscience and engineering. Recently, mostly due to the high computational cost of\ntraditional solution techniques, deep neural network based surrogates have\ngained increased interest. The practical utility of such neural PDE solvers\nrelies on their ability to provide accurate, stable predictions over long time\nhorizons, which is a notoriously hard problem. In this work, we present a\nlarge-scale analysis of common temporal rollout strategies, identifying the\nneglect of non-dominant spatial frequency information, often associated with\nhigh frequencies in PDE solutions, as the primary pitfall limiting stable,\naccurate rollout performance. 
Based on these insights, we draw inspiration from\nrecent advances in diffusion models to introduce PDE-Refiner; a novel model\nclass that enables more accurate modeling of all frequency components via a\nmultistep refinement process. We validate PDE-Refiner on challenging benchmarks\nof complex fluid dynamics, demonstrating stable and accurate rollouts that\nconsistently outperform state-of-the-art models, including neural, numerical,\nand hybrid neural-numerical architectures. We further demonstrate that\nPDE-Refiner greatly enhances data efficiency, since the denoising objective\nimplicitly induces a novel form of spectral data augmentation. Finally,\nPDE-Refiner's connection to diffusion models enables an accurate and efficient\nassessment of the model's predictive uncertainty, allowing us to estimate when\nthe surrogate becomes inaccurate.\n","authors":["Phillip Lippe","Bastiaan S. Veeling","Paris Perdikaris","Richard E. Turner","Johannes Brandstetter"],"pdf_url":"https://arxiv.org/pdf/2308.05732v1.pdf","comment":"Project website: https://phlippe.github.io/PDERefiner/"},{"id":"http://arxiv.org/abs/2308.05731v1","updated":"2023-08-10T17:53:03Z","published":"2023-08-10T17:53:03Z","title":"Rethinking Integration of Prediction and Planning in Deep Learning-Based\n Automated Driving Systems: A Review","summary":" Automated driving has the potential to revolutionize personal, public, and\nfreight mobility. Besides the enormous challenge of perception, i.e. accurately\nperceiving the environment using available sensor data, automated driving\ncomprises planning a safe, comfortable, and efficient motion trajectory. To\npromote safety and progress, many works rely on modules that predict the future\nmotion of surrounding traffic. Modular automated driving systems commonly\nhandle prediction and planning as sequential separate tasks. While this\naccounts for the influence of surrounding traffic on the ego-vehicle, it fails\nto anticipate the reactions of traffic participants to the ego-vehicle's\nbehavior. Recent works suggest that integrating prediction and planning in an\ninterdependent joint step is necessary to achieve safe, efficient, and\ncomfortable driving. While various models implement such integrated systems, a\ncomprehensive overview and theoretical understanding of different principles\nare lacking. We systematically review state-of-the-art deep learning-based\nprediction, planning, and integrated prediction and planning models. Different\nfacets of the integration ranging from model architecture and model design to\nbehavioral aspects are considered and related to each other. Moreover, we\ndiscuss the implications, strengths, and limitations of different integration\nmethods. 
By pointing out research gaps, describing relevant future challenges,\nand highlighting trends in the research field, we identify promising directions\nfor future research.\n","authors":["Steffen Hagedorn","Marcel Hallgarten","Martin Stoll","Alexandru Condurache"],"pdf_url":"https://arxiv.org/pdf/2308.05731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05725v1","updated":"2023-08-10T17:41:19Z","published":"2023-08-10T17:41:19Z","title":"EXPRESSO: A Benchmark and Analysis of Discrete Expressive Speech\n Resynthesis","summary":" Recent work has shown that it is possible to resynthesize high-quality speech\nbased, not on text, but on low bitrate discrete units that have been learned in\na self-supervised fashion and can therefore capture expressive aspects of\nspeech that are hard to transcribe (prosody, voice styles, non-verbal\nvocalization). The adoption of these methods is still limited by the fact that\nmost speech synthesis datasets are read, severely limiting spontaneity and\nexpressivity. Here, we introduce Expresso, a high-quality expressive speech\ndataset for textless speech synthesis that includes both read speech and\nimprovised dialogues rendered in 26 spontaneous expressive styles. We\nillustrate the challenges and potentials of this dataset with an expressive\nresynthesis benchmark where the task is to encode the input in low-bitrate\nunits and resynthesize it in a target voice while preserving content and style.\nWe evaluate resynthesis quality with automatic metrics for different\nself-supervised discrete encoders, and explore tradeoffs between quality,\nbitrate and invariance to speaker and style. All the dataset, evaluation\nmetrics and baseline models are open source\n","authors":["Tu Anh Nguyen","Wei-Ning Hsu","Antony D'Avirro","Bowen Shi","Itai Gat","Maryam Fazel-Zarani","Tal Remez","Jade Copet","Gabriel Synnaeve","Michael Hassid","Felix Kreuk","Yossi Adi","Emmanuel Dupoux"],"pdf_url":"https://arxiv.org/pdf/2308.05725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05724v1","updated":"2023-08-10T17:39:51Z","published":"2023-08-10T17:39:51Z","title":"Optimizing Performance of Feedforward and Convolutional Neural Networks\n through Dynamic Activation Functions","summary":" Deep learning training training algorithms are a huge success in recent years\nin many fields including speech, text,image video etc. Deeper and deeper layers\nare proposed with huge success with resnet structures having around 152 layers.\nShallow convolution neural networks(CNN's) are still an active research, where\nsome phenomena are still unexplained. Activation functions used in the network\nare of utmost importance, as they provide non linearity to the networks. Relu's\nare the most commonly used activation function.We show a complex piece-wise\nlinear(PWL) activation in the hidden layer. We show that these PWL activations\nwork much better than relu activations in our networks for convolution neural\nnetworks and multilayer perceptrons. 
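One common way to parameterize a piecewise-linear activation like the one discussed above is as a sum of learnable ReLU hinges. The module below is a hypothetical sketch of that parameterization in PyTorch (the paper's exact formulation may differ); it can be dropped in wherever `nn.ReLU()` would normally appear.

```python
# Sketch of a learnable piecewise-linear activation built from ReLU hinges
# (illustrative parameterization; not necessarily the one used in the paper).
import torch
import torch.nn as nn

class PiecewiseLinear(nn.Module):
    def __init__(self, num_hinges=5, x_min=-3.0, x_max=3.0):
        super().__init__()
        # fixed, evenly spaced breakpoints; learnable slope per hinge
        self.register_buffer("breaks", torch.linspace(x_min, x_max, num_hinges))
        self.base_slope = nn.Parameter(torch.tensor(0.0))
        self.hinge_slopes = nn.Parameter(torch.zeros(num_hinges))
        with torch.no_grad():              # initialize to behave like ReLU
            self.hinge_slopes[self.breaks.abs().argmin()] = 1.0

    def forward(self, x):
        y = self.base_slope * x
        for b, s in zip(self.breaks, self.hinge_slopes):
            y = y + s * torch.relu(x - b)
        return y

mlp = nn.Sequential(nn.Linear(10, 32), PiecewiseLinear(), nn.Linear(32, 1))
print(mlp(torch.randn(4, 10)).shape)       # torch.Size([4, 1])
```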
Result comparison in PyTorch for shallow\nand deep CNNs are given to further strengthen our case.\n","authors":["Chinmay Rane","Kanishka Tyagi","Michael Manry"],"pdf_url":"https://arxiv.org/pdf/2308.05724v1.pdf","comment":"Under submission in Neurocomputing"},{"id":"http://arxiv.org/abs/2301.10822v2","updated":"2023-08-10T17:34:58Z","published":"2023-01-25T20:49:12Z","title":"RobustPdM: Designing Robust Predictive Maintenance against Adversarial\n Attacks","summary":" The state-of-the-art predictive maintenance (PdM) techniques have shown great\nsuccess in reducing maintenance costs and downtime of complicated machines\nwhile increasing overall productivity through extensive utilization of\nInternet-of-Things (IoT) and Deep Learning (DL). Unfortunately, IoT sensors and\nDL algorithms are both prone to cyber-attacks. For instance, DL algorithms are\nknown for their susceptibility to adversarial examples. Such adversarial\nattacks are vastly under-explored in the PdM domain. This is because the\nadversarial attacks in the computer vision domain for classification tasks\ncannot be directly applied to the PdM domain for multivariate time series (MTS)\nregression tasks. In this work, we propose an end-to-end methodology to design\nadversarially robust PdM systems by extensively analyzing the effect of\ndifferent types of adversarial attacks and proposing a novel adversarial\ndefense technique for DL-enabled PdM models. First, we propose novel MTS\nProjected Gradient Descent (PGD) and MTS PGD with random restarts (PGD_r)\nattacks. Then, we evaluate the impact of MTS PGD and PGD_r along with MTS Fast\nGradient Sign Method (FGSM) and MTS Basic Iterative Method (BIM) on Long\nShort-Term Memory (LSTM), Gated Recurrent Unit (GRU), Convolutional Neural\nNetwork (CNN), and Bi-directional LSTM based PdM system. Our results using\nNASA's turbofan engine dataset show that adversarial attacks can cause a severe\ndefect (up to 11X) in the RUL prediction, outperforming the effectiveness of\nthe state-of-the-art PdM attacks by 3X. Furthermore, we present a novel\napproximate adversarial training method to defend against adversarial attacks.\nWe observe that approximate adversarial training can significantly improve the\nrobustness of PdM models (up to 54X) and outperforms the state-of-the-art PdM\ndefense methods by offering 3X more robustness.\n","authors":["Ayesha Siddique","Ripan Kumar Kundu","Gautam Raj Mode","Khaza Anuarul Hoque"],"pdf_url":"https://arxiv.org/pdf/2301.10822v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14961v3","updated":"2023-08-10T17:34:48Z","published":"2023-03-27T07:52:58Z","title":"Diffusion Denoised Smoothing for Certified and Adversarial Robust\n Out-Of-Distribution Detection","summary":" As the use of machine learning continues to expand, the importance of\nensuring its safety cannot be overstated. A key concern in this regard is the\nability to identify whether a given sample is from the training distribution,\nor is an \"Out-Of-Distribution\" (OOD) sample. In addition, adversaries can\nmanipulate OOD samples in ways that lead a classifier to make a confident\nprediction. In this study, we present a novel approach for certifying the\nrobustness of OOD detection within a $\\ell_2$-norm around the input, regardless\nof network architecture and without the need for specific components or\nadditional training. 
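Projected Gradient Descent (PGD), which the predictive-maintenance work above adapts to multivariate time-series regression, is an iterative loop: take signed-gradient steps that increase the loss, then project back into an L-infinity ball around the clean input. The sketch below is a generic PGD-for-regression attack under assumed input shapes (batch, time, features); it illustrates the attack family, not the paper's specific MTS variants.

```python
# Generic PGD attack for a regression model on multivariate time series
# (illustrative; input shape assumed to be [batch, time, features]).
import torch

def pgd_attack(model, x, y, eps=0.1, alpha=0.02, steps=10):
    x_adv = x.clone().detach() + torch.empty_like(x).uniform_(-eps, eps)
    for _ in range(steps):
        x_adv.requires_grad_(True)
        loss = torch.nn.functional.mse_loss(model(x_adv), y)
        grad, = torch.autograd.grad(loss, x_adv)
        with torch.no_grad():
            x_adv = x_adv + alpha * grad.sign()          # ascend the loss
            x_adv = x + (x_adv - x).clamp(-eps, eps)     # project into eps-ball
    return x_adv.detach()

# Toy usage with a stand-in linear regressor on flattened sequences.
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(30 * 4, 1))
x, y = torch.randn(8, 30, 4), torch.randn(8, 1)
x_adv = pgd_attack(model, x, y)
print((x_adv - x).abs().max())   # at most eps
```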
Further, we improve current techniques for detecting\nadversarial attacks on OOD samples, while providing high levels of certified\nand adversarial robustness on in-distribution samples. The average of all OOD\ndetection metrics on CIFAR10/100 shows an increase of $\\sim 13 \\% / 5\\%$\nrelative to previous approaches.\n","authors":["Nicola Franco","Daniel Korth","Jeanette Miriam Lorenz","Karsten Roscher","Stephan Guennemann"],"pdf_url":"https://arxiv.org/pdf/2303.14961v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05711v1","updated":"2023-08-10T17:20:02Z","published":"2023-08-10T17:20:02Z","title":"A Comparison of Classical and Deep Reinforcement Learning Methods for\n HVAC Control","summary":" Reinforcement learning (RL) is a promising approach for optimizing HVAC\ncontrol. RL offers a framework for improving system performance, reducing\nenergy consumption, and enhancing cost efficiency. We benchmark two popular\nclassical and deep RL methods (Q-Learning and Deep-Q-Networks) across multiple\nHVAC environments and explore the practical consideration of model\nhyper-parameter selection and reward tuning. The findings provide insight for\nconfiguring RL agents in HVAC systems, promoting energy-efficient and\ncost-effective operation.\n","authors":["Marshall Wang","John Willes","Thomas Jiralerspong","Matin Moezzi"],"pdf_url":"https://arxiv.org/pdf/2308.05711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05707v1","updated":"2023-08-10T17:14:07Z","published":"2023-08-10T17:14:07Z","title":"Shadow Datasets, New challenging datasets for Causal Representation\n Learning","summary":" Discovering causal relations among semantic factors is an emergent topic in\nrepresentation learning. Most causal representation learning (CRL) methods are\nfully supervised, which is impractical due to costly labeling. To resolve this\nrestriction, weakly supervised CRL methods were introduced. To evaluate CRL\nperformance, four existing datasets, Pendulum, Flow, CelebA(BEARD) and\nCelebA(SMILE), are utilized. However, existing CRL datasets are limited to\nsimple graphs with few generative factors. Thus we propose two new datasets\nwith a larger number of diverse generative factors and more sophisticated\ncausal graphs. In addition, current real datasets, CelebA(BEARD) and\nCelebA(SMILE), the originally proposed causal graphs are not aligned with the\ndataset distributions. Thus, we propose modifications to them.\n","authors":["Jiageng Zhu","Hanchen Xie","Jianhua Wu","Jiazhi Li","Mahyar Khayatkhoei","Mohamed E. Hussein","Wael AbdAlmageed"],"pdf_url":"https://arxiv.org/pdf/2308.05707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03152v2","updated":"2023-08-10T17:01:37Z","published":"2023-08-06T15:59:30Z","title":"AI-GOMS: Large AI-Driven Global Ocean Modeling System","summary":" Ocean modeling is a powerful tool for simulating the physical, chemical, and\nbiological processes of the ocean, which is the foundation for marine science\nresearch and operational oceanography. Modern numerical ocean modeling mainly\nconsists of governing equations and numerical algorithms. Nonlinear\ninstability, computational expense, low reusability efficiency and high\ncoupling costs have gradually become the main bottlenecks for the further\ndevelopment of numerical ocean modeling. 
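Tabular Q-learning, one of the two methods benchmarked in the HVAC study above, boils down to a single update: Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)). Below is a minimal sketch on an assumed toy thermostat-style environment (illustrative only; not the paper's HVAC environments).

```python
# Minimal tabular Q-learning sketch on a toy thermostat-style environment
# (illustrative; not the HVAC environments benchmarked in the paper).
import random

n_states, n_actions = 10, 2             # discretized temperature, {heat, idle}
Q = [[0.0] * n_actions for _ in range(n_states)]
alpha, gamma, epsilon = 0.1, 0.95, 0.1

def step(state, action):                # assumed toy dynamics and comfort reward
    next_state = min(state + 1, n_states - 1) if action == 0 else max(state - 1, 0)
    reward = -abs(next_state - 5)       # comfort target is state 5
    return next_state, reward

state = 0
for _ in range(20_000):
    if random.random() < epsilon:
        action = random.randrange(n_actions)
    else:
        action = max(range(n_actions), key=lambda a: Q[state][a])
    next_state, reward = step(state, action)
    # Q-learning update: move Q(s,a) toward r + gamma * max_a' Q(s',a')
    td_target = reward + gamma * max(Q[next_state])
    Q[state][action] += alpha * (td_target - Q[state][action])
    state = next_state

print(max(range(n_actions), key=lambda a: Q[3][a]))   # 0: heat when below target
```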
Recently, artificial\nintelligence-based modeling in scientific computing has shown revolutionary\npotential for digital twins and scientific simulations, but the bottlenecks of\nnumerical ocean modeling have not been further solved. Here, we present\nAI-GOMS, a large AI-driven global ocean modeling system, for accurate and\nefficient global ocean daily prediction. AI-GOMS consists of a backbone model\nwith the Fourier-based Masked Autoencoder structure for basic ocean variable\nprediction and lightweight fine-tuning models incorporating regional\ndownscaling, wave decoding, and biochemistry coupling modules. AI-GOMS has\nachieved the best performance in 30 days of prediction for the global ocean\nbasic variables with 15 depth layers at 1/4{\\deg} spatial resolution. Beyond\nthe good performance in statistical metrics, AI-GOMS realizes the simulation of\nmesoscale eddies in the Kuroshio region at 1/12{\\deg} spatial resolution and\nocean stratification in the tropical Pacific Ocean. AI-GOMS provides a new\nbackbone-downstream paradigm for Earth system modeling, which makes the system\ntransferable, scalable and reusable.\n","authors":["Wei Xiong","Yanfei Xiang","Hao Wu","Shuyi Zhou","Yuze Sun","Muyuan Ma","Xiaomeng Huang"],"pdf_url":"https://arxiv.org/pdf/2308.03152v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14679v2","updated":"2023-08-10T16:46:35Z","published":"2023-02-28T15:42:30Z","title":"Synthesizing Mixed-type Electronic Health Records using Diffusion Models","summary":" Electronic Health Records (EHRs) contain sensitive patient information, which\npresents privacy concerns when sharing such data. Synthetic data generation is\na promising solution to mitigate these risks, often relying on deep generative\nmodels such as Generative Adversarial Networks (GANs). However, recent studies\nhave shown that diffusion models offer several advantages over GANs, such as\ngeneration of more realistic synthetic data and stable training in generating\ndata modalities, including image, text, and sound. In this work, we investigate\nthe potential of diffusion models for generating realistic mixed-type tabular\nEHRs, comparing TabDDPM model with existing methods on four datasets in terms\nof data quality, utility, privacy, and augmentation. Our experiments\ndemonstrate that TabDDPM outperforms the state-of-the-art models across all\nevaluation metrics, except for privacy, which confirms the trade-off between\nprivacy and utility.\n","authors":["Taha Ceritli","Ghadeer O. Ghosheh","Vinod Kumar Chauhan","Tingting Zhu","Andrew P. Creagh","David A. Clifton"],"pdf_url":"https://arxiv.org/pdf/2302.14679v2.pdf","comment":"Page 2, Figure 1 is updated"},{"id":"http://arxiv.org/abs/2308.05681v1","updated":"2023-08-10T16:34:20Z","published":"2023-08-10T16:34:20Z","title":"Hard No-Box Adversarial Attack on Skeleton-Based Human Action\n Recognition with Skeleton-Motion-Informed Gradient","summary":" Recently, methods for skeleton-based human activity recognition have been\nshown to be vulnerable to adversarial attacks. However, these attack methods\nrequire either the full knowledge of the victim (i.e. white-box attacks),\naccess to training data (i.e. transfer-based attacks) or frequent model queries\n(i.e. black-box attacks). All their requirements are highly restrictive,\nraising the question of how detrimental the vulnerability is. In this paper, we\nshow that the vulnerability indeed exists. 
To this end, we consider a new\nattack task: the attacker has no access to the victim model or the training\ndata or labels, where we coin the term hard no-box attack. Specifically, we\nfirst learn a motion manifold where we define an adversarial loss to compute a\nnew gradient for the attack, named skeleton-motion-informed (SMI) gradient. Our\ngradient contains information of the motion dynamics, which is different from\nexisting gradient-based attack methods that compute the loss gradient assuming\neach dimension in the data is independent. The SMI gradient can augment many\ngradient-based attack methods, leading to a new family of no-box attack\nmethods. Extensive evaluation and comparison show that our method imposes a\nreal threat to existing classifiers. They also show that the SMI gradient\nimproves the transferability and imperceptibility of adversarial samples in\nboth no-box and transfer-based black-box settings.\n","authors":["Zhengzhi Lu","He Wang","Ziyi Chang","Guoan Yang","Hubert P. H. Shum"],"pdf_url":"https://arxiv.org/pdf/2308.05681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05680v1","updated":"2023-08-10T16:33:17Z","published":"2023-08-10T16:33:17Z","title":"Finding Already Debunked Narratives via Multistage Retrieval: Enabling\n Cross-Lingual, Cross-Dataset and Zero-Shot Learning","summary":" The task of retrieving already debunked narratives aims to detect stories\nthat have already been fact-checked. The successful detection of claims that\nhave already been debunked not only reduces the manual efforts of professional\nfact-checkers but can also contribute to slowing the spread of misinformation.\nMainly due to the lack of readily available data, this is an understudied\nproblem, particularly when considering the cross-lingual task, i.e. the\nretrieval of fact-checking articles in a language different from the language\nof the online post being checked. This paper fills this gap by (i) creating a\nnovel dataset to enable research on cross-lingual retrieval of already debunked\nnarratives, using tweets as queries to a database of fact-checking articles;\n(ii) presenting an extensive experiment to benchmark fine-tuned and\noff-the-shelf multilingual pre-trained Transformer models for this task; and\n(iii) proposing a novel multistage framework that divides this cross-lingual\ndebunk retrieval task into refinement and re-ranking stages. Results show that\nthe task of cross-lingual retrieval of already debunked narratives is\nchallenging and off-the-shelf Transformer models fail to outperform a strong\nlexical-based baseline (BM25). Nevertheless, our multistage retrieval framework\nis robust, outperforming BM25 in most scenarios and enabling cross-domain and\nzero-shot learning, without significantly harming the model's performance.\n","authors":["Iknoor Singh","Carolina Scarton","Xingyi Song","Kalina Bontcheva"],"pdf_url":"https://arxiv.org/pdf/2308.05680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11122v3","updated":"2023-08-10T16:30:52Z","published":"2023-05-18T17:09:21Z","title":"Autonomous sputter synthesis of thin film nitrides with composition\n controlled by Bayesian optimization of optical plasma emission","summary":" Autonomous experimentation has emerged as an efficient approach to accelerate\nthe pace of materials discovery. 
Although instruments for autonomous synthesis\nhave become popular in molecular and polymer science, solution processing of\nhybrid materials and nanoparticles, examples of autonomous tools for physical\nvapor deposition are scarce yet important for the semiconductor industry. Here,\nwe report the design and implementation of an autonomous workflow for sputter\ndeposition of thin films with controlled composition, leveraging a highly\nautomated sputtering reactor custom-controlled by Python, optical emission\nspectroscopy (OES), and a Bayesian optimization algorithm. We modeled film\ncomposition, measured by x-ray fluorescence, as a linear function of emission\nlines monitored during the co-sputtering from elemental Zn and Ti targets in\nN$_2$ atmosphere. A Bayesian control algorithm, informed by OES, navigates the\nspace of sputtering power to fabricate films with user-defined composition, by\nminimizing the absolute error between desired and measured emission signals. We\nvalidated our approach by autonomously fabricating Zn$_x$Ti$_{1-x}$N$_y$ films\nwith deviations from the targeted cation composition within relative 3.5 %,\neven for 15 nm thin films, demonstrating that the proposed approach can\nreliably synthesize thin films with specific composition and minimal human\ninterference. Moreover, the proposed method can be extended to more difficult\nsynthesis experiments where plasma intensity depends non-linearly on pressure,\nor the elemental sticking coefficients strongly depend on the substrate\ntemperature.\n","authors":["Davi M. Febba","Kevin R. Talley","Kendal Johnson","Stephen Schaefer","Sage R. Bauers","John S. Mangum","Rebecca W. Smaha","Andriy Zakutayev"],"pdf_url":"https://arxiv.org/pdf/2305.11122v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03712v2","updated":"2023-08-10T16:23:03Z","published":"2023-08-07T16:31:38Z","title":"Scaling may be all you need for achieving human-level object recognition\n capacity with human-like visual experience","summary":" This paper asks whether current self-supervised learning methods, if\nsufficiently scaled up, would be able to reach human-level visual object\nrecognition capabilities with the same type and amount of visual experience\nhumans learn from. Previous work on this question only considered the scaling\nof data size. Here, we consider the simultaneous scaling of data size, model\nsize, and image resolution. We perform a scaling experiment with vision\ntransformers up to 633M parameters in size (ViT-H/14) trained with up to 5K\nhours of human-like video data (long, continuous, mostly egocentric videos)\nwith image resolutions of up to 476x476 pixels. The efficiency of masked\nautoencoders (MAEs) as a self-supervised learning algorithm makes it possible\nto run this scaling experiment on an unassuming academic budget. We find that\nit is feasible to reach human-level object recognition capacity at sub-human\nscales of model size, data size, and image size, if these factors are scaled up\nsimultaneously. To give a concrete example, we estimate that a 2.5B parameter\nViT model trained with 20K hours (2.3 years) of human-like video data with a\nspatial resolution of 952x952 pixels should be able to reach roughly\nhuman-level accuracy on ImageNet. 
Human-level competence is thus achievable for\na fundamental perceptual capability from human-like perceptual experience\n(human-like in both amount and type) with extremely generic learning algorithms\nand architectures and without any substantive inductive biases.\n","authors":["A. Emin Orhan"],"pdf_url":"https://arxiv.org/pdf/2308.03712v2.pdf","comment":"v2 adds an Appendix containing results with alternative scaling\n functions; code & models available from\n https://github.com/eminorhan/humanlike-vits"},{"id":"http://arxiv.org/abs/2302.00453v2","updated":"2023-08-10T16:09:55Z","published":"2023-02-01T13:57:32Z","title":"Width and Depth Limits Commute in Residual Networks","summary":" We show that taking the width and depth to infinity in a deep neural network\nwith skip connections, when branches are scaled by $1/\\sqrt{depth}$ (the only\nnontrivial scaling), result in the same covariance structure no matter how that\nlimit is taken. This explains why the standard infinite-width-then-depth\napproach provides practical insights even for networks with depth of the same\norder as width. We also demonstrate that the pre-activations, in this case,\nhave Gaussian distributions which has direct applications in Bayesian deep\nlearning. We conduct extensive simulations that show an excellent match with\nour theoretical findings.\n","authors":["Soufiane Hayou","Greg Yang"],"pdf_url":"https://arxiv.org/pdf/2302.00453v2.pdf","comment":"24 pages, 8 figures. arXiv admin note: text overlap with\n arXiv:2210.00688"},{"id":"http://arxiv.org/abs/2305.03829v4","updated":"2023-08-10T15:51:03Z","published":"2023-05-05T20:08:40Z","title":"Improving Image-Based Precision Medicine with Uncertainty-Aware Causal\n Models","summary":" Image-based precision medicine aims to personalize treatment decisions based\non an individual's unique imaging features so as to improve their clinical\noutcome. Machine learning frameworks that integrate uncertainty estimation as\npart of their treatment recommendations would be safer and more reliable.\nHowever, little work has been done in adapting uncertainty estimation\ntechniques and validation metrics for precision medicine. In this paper, we use\nBayesian deep learning for estimating the posterior distribution over factual\nand counterfactual outcomes on several treatments. This allows for estimating\nthe uncertainty for each treatment option and for the individual treatment\neffects (ITE) between any two treatments. We train and evaluate this model to\npredict future new and enlarging T2 lesion counts on a large, multi-center\ndataset of MR brain images of patients with multiple sclerosis, exposed to\nseveral treatments during randomized controlled trials. We evaluate the\ncorrelation of the uncertainty estimate with the factual error, and, given the\nlack of ground truth counterfactual outcomes, demonstrate how uncertainty for\nthe ITE prediction relates to bounds on the ITE error. Lastly, we demonstrate\nhow knowledge of uncertainty could modify clinical decision-making to improve\nindividual patient and clinical trial outcomes.\n","authors":["Joshua Durso-Finley","Jean-Pierre Falet","Raghav Mehta","Douglas L. 
Arnold","Nick Pawlowski","Tal Arbel"],"pdf_url":"https://arxiv.org/pdf/2305.03829v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05646v1","updated":"2023-08-10T15:43:46Z","published":"2023-08-10T15:43:46Z","title":"AST-MHSA : Code Summarization using Multi-Head Self-Attention","summary":" Code summarization aims to generate concise natural language descriptions for\nsource code. The prevailing approaches adopt transformer-based encoder-decoder\narchitectures, where the Abstract Syntax Tree (AST) of the source code is\nutilized for encoding structural information. However, ASTs are much longer\nthan the corresponding source code, and existing methods ignore this size\nconstraint by directly feeding the entire linearized AST into the encoders.\nThis simplistic approach makes it challenging to extract truly valuable\ndependency relations from the overlong input sequence and leads to significant\ncomputational overhead due to self-attention applied to all nodes in the AST.\n To address this issue effectively and efficiently, we present a model,\nAST-MHSA that uses multi-head attention to extract the important semantic\ninformation from the AST. The model consists of two main components: an encoder\nand a decoder. The encoder takes as input the abstract syntax tree (AST) of the\ncode and generates a sequence of hidden states. The decoder then takes these\nhidden states as input and generates a natural language summary of the code.\n The multi-head attention mechanism allows the model to learn different\nrepresentations of the input code, which can be combined to generate a more\ncomprehensive summary. The model is trained on a dataset of code and summaries,\nand the parameters of the model are optimized to minimize the loss between the\ngenerated summaries and the ground-truth summaries.\n","authors":["Yeshwanth Nagaraj","Ujjwal Gupta"],"pdf_url":"https://arxiv.org/pdf/2308.05646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02279v2","updated":"2023-08-10T15:30:15Z","published":"2023-07-05T13:26:17Z","title":"From NeurODEs to AutoencODEs: a mean-field control framework for\n width-varying Neural Networks","summary":" The connection between Residual Neural Networks (ResNets) and continuous-time\ncontrol systems (known as NeurODEs) has led to a mathematical analysis of\nneural networks which has provided interesting results of both theoretical and\npractical significance. However, by construction, NeurODEs have been limited to\ndescribing constant-width layers, making them unsuitable for modeling deep\nlearning architectures with layers of variable width. In this paper, we propose\na continuous-time Autoencoder, which we call AutoencODE, based on a\nmodification of the controlled field that drives the dynamics. This adaptation\nenables the extension of the mean-field control framework originally devised\nfor conventional NeurODEs. In this setting, we tackle the case of low Tikhonov\nregularization, resulting in potentially non-convex cost landscapes. While the\nglobal results obtained for high Tikhonov regularization may not hold globally,\nwe show that many of them can be recovered in regions where the loss function\nis locally convex. 
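Multi-head self-attention, the operator at the core of the AST-MHSA encoder described above, can be written compactly. The snippet below is the standard scaled dot-product formulation applied to a batch of linearized AST-node embeddings; it is generic, not the paper's architecture.

```python
# Generic multi-head self-attention over a sequence of node embeddings
# (standard formulation; illustrative of the operator AST-MHSA builds on).
import torch
import torch.nn as nn

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, d_model=128, num_heads=8):
        super().__init__()
        assert d_model % num_heads == 0
        self.h, self.d_k = num_heads, d_model // num_heads
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x):                       # x: [batch, seq_len, d_model]
        b, n, _ = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # split into heads: [batch, heads, seq_len, d_k]
        q, k, v = (t.view(b, n, self.h, self.d_k).transpose(1, 2) for t in (q, k, v))
        attn = torch.softmax(q @ k.transpose(-2, -1) / self.d_k ** 0.5, dim=-1)
        ctx = (attn @ v).transpose(1, 2).reshape(b, n, self.h * self.d_k)
        return self.out(ctx)

nodes = torch.randn(2, 50, 128)                 # 50 linearized AST-node embeddings
print(MultiHeadSelfAttention()(nodes).shape)    # torch.Size([2, 50, 128])
```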
Inspired by our theoretical findings, we develop a training\nmethod tailored to this specific type of Autoencoders with residual\nconnections, and we validate our approach through numerical experiments\nconducted on various examples.\n","authors":["Cristina Cipriani","Massimo Fornasier","Alessandro Scagliotti"],"pdf_url":"https://arxiv.org/pdf/2307.02279v2.pdf","comment":"35 pages, 11 figures. Minor adjustments and new bibliographical\n references"},{"id":"http://arxiv.org/abs/2308.05633v1","updated":"2023-08-10T15:22:11Z","published":"2023-08-10T15:22:11Z","title":"IIHT: Medical Report Generation with Image-to-Indicator Hierarchical\n Transformer","summary":" Automated medical report generation has become increasingly important in\nmedical analysis. It can produce computer-aided diagnosis descriptions and thus\nsignificantly alleviate the doctors' work. Inspired by the huge success of\nneural machine translation and image captioning, various deep learning methods\nhave been proposed for medical report generation. However, due to the inherent\nproperties of medical data, including data imbalance and the length and\ncorrelation between report sequences, the generated reports by existing methods\nmay exhibit linguistic fluency but lack adequate clinical accuracy. In this\nwork, we propose an image-to-indicator hierarchical transformer (IIHT)\nframework for medical report generation. It consists of three modules, i.e., a\nclassifier module, an indicator expansion module and a generator module. The\nclassifier module first extracts image features from the input medical images\nand produces disease-related indicators with their corresponding states. The\ndisease-related indicators are subsequently utilised as input for the indicator\nexpansion module, incorporating the \"data-text-data\" strategy. The\ntransformer-based generator then leverages these extracted features along with\nimage features as auxiliary information to generate final reports. Furthermore,\nthe proposed IIHT method is feasible for radiologists to modify disease\nindicators in real-world scenarios and integrate the operations into the\nindicator expansion module for fluent and accurate medical report generation.\nExtensive experiments and comparisons with state-of-the-art methods under\nvarious evaluation metrics demonstrate the great performance of the proposed\nmethod.\n","authors":["Keqiang Fan","Xiaohao Cai","Mahesan Niranjan"],"pdf_url":"https://arxiv.org/pdf/2308.05633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05629v1","updated":"2023-08-10T15:18:16Z","published":"2023-08-10T15:18:16Z","title":"ReLU and Addition-based Gated RNN","summary":" We replace the multiplication and sigmoid function of the conventional\nrecurrent gate with addition and ReLU activation. This mechanism is designed to\nmaintain long-term memory for sequence processing but at a reduced\ncomputational cost, thereby opening up for more efficient execution or larger\nmodels on restricted hardware. Recurrent Neural Networks (RNNs) with gating\nmechanisms such as LSTM and GRU have been widely successful in learning from\nsequential data due to their ability to capture long-term dependencies.\nConventionally, the update based on current inputs and the previous state\nhistory is each multiplied with dynamic weights and combined to compute the\nnext state. However, multiplication can be computationally expensive,\nespecially for certain hardware architectures or alternative arithmetic systems\nsuch as homomorphic encryption. 
It is demonstrated that the novel gating\nmechanism can capture long-term dependencies for a standard synthetic sequence\nlearning task while significantly reducing computational costs such that\nexecution time is reduced by half on CPU and by one-third under encryption.\nExperimental results on handwritten text recognition tasks furthermore show\nthat the proposed architecture can be trained to achieve comparable accuracy to\nconventional GRU and LSTM baselines. The gating mechanism introduced in this\npaper may enable privacy-preserving AI applications operating under homomorphic\nencryption by avoiding the multiplication of encrypted variables. It can also\nsupport quantization in (unencrypted) plaintext applications, with the\npotential for substantial performance gains since the addition-based\nformulation can avoid the expansion to double precision often required for\nmultiplication.\n","authors":["Rickard Brännvall","Henrik Forsgren","Fredrik Sandin","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2308.05629v1.pdf","comment":"12 pages, 4 tables"},{"id":"http://arxiv.org/abs/2304.05874v2","updated":"2023-08-10T15:13:58Z","published":"2023-04-12T14:13:09Z","title":"Adaptive Gated Graph Convolutional Network for Explainable Diagnosis of\n Alzheimer's Disease using EEG Data","summary":" Graph neural network (GNN) models are increasingly being used for the\nclassification of electroencephalography (EEG) data. However, GNN-based\ndiagnosis of neurological disorders, such as Alzheimer's disease (AD), remains\na relatively unexplored area of research. Previous studies have relied on\nfunctional connectivity methods to infer brain graph structures and used simple\nGNN architectures for the diagnosis of AD. In this work, we propose a novel\nadaptive gated graph convolutional network (AGGCN) that can provide explainable\npredictions. AGGCN adaptively learns graph structures by combining\nconvolution-based node feature enhancement with a well-known correlation-based\nmeasure of functional connectivity. Furthermore, the gated graph convolution\ncan dynamically weigh the contribution of various spatial scales. The proposed\nmodel achieves high accuracy in both eyes-closed and eyes-open conditions,\nindicating the stability of learned representations. Finally, we demonstrate\nthat the proposed AGGCN model generates consistent explanations of its\npredictions that might be relevant for further study of AD-related alterations\nof brain networks.\n","authors":["Dominik Klepl","Fei He","Min Wu","Daniel J. Blackburn","Ptolemaios G. Sarrigiannis"],"pdf_url":"https://arxiv.org/pdf/2304.05874v2.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.05621v1","updated":"2023-08-10T15:10:08Z","published":"2023-08-10T15:10:08Z","title":"Normalized Gradients for All","summary":" In this short note, I show how to adapt to H\\\"{o}lder smoothness using\nnormalized gradients in a black-box way. Moreover, the bound will depend on a\nnovel notion of local H\\\"{o}lder smoothness. 
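The normalized-gradient idea in the note above is easy to state concretely: each update moves along the gradient direction only, with the magnitude discarded, so progress is governed entirely by the step-size schedule. A tiny illustrative example on an assumed quadratic objective (not the note's analysis):

```python
# Tiny sketch of normalized gradient descent on a toy objective.
import numpy as np

def grad(x):                       # gradient of f(x) = ||x - 3||^2 / 2
    return x - 3.0

x = np.array([10.0, -5.0])
for t in range(1, 201):
    g = grad(x)
    eta = 1.0 / np.sqrt(t)         # decaying step size controls all progress
    x = x - eta * g / (np.linalg.norm(g) + 1e-12)   # only the direction is used

print(x)                           # close to [3., 3.]
```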
The main idea directly comes from\nLevy [2017].\n","authors":["Francesco Orabona"],"pdf_url":"https://arxiv.org/pdf/2308.05621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05619v1","updated":"2023-08-10T15:08:13Z","published":"2023-08-10T15:08:13Z","title":"Updating Clinical Risk Stratification Models Using Rank-Based\n Compatibility: Approaches for Evaluating and Optimizing Clinician-Model Team\n Performance","summary":" As data shift or new data become available, updating clinical machine\nlearning models may be necessary to maintain or improve performance over time.\nHowever, updating a model can introduce compatibility issues when the behavior\nof the updated model does not align with user expectations, resulting in poor\nuser-model team performance. Existing compatibility measures depend on model\ndecision thresholds, limiting their applicability in settings where models are\nused to generate rankings based on estimated risk. To address this limitation,\nwe propose a novel rank-based compatibility measure, $C^R$, and a new loss\nfunction that aims to optimize discriminative performance while encouraging\ngood compatibility. Applied to a case study in mortality risk stratification\nleveraging data from MIMIC, our approach yields more compatible models while\nmaintaining discriminative performance compared to existing model selection\ntechniques, with an increase in $C^R$ of $0.019$ ($95\\%$ confidence interval:\n$0.005$, $0.035$). This work provides new tools to analyze and update risk\nstratification models used in clinical care.\n","authors":["Erkin Ötleş","Brian T. Denton","Jenna Wiens"],"pdf_url":"https://arxiv.org/pdf/2308.05619v1.pdf","comment":"Conference paper accepted at the 2023 Machine Learning for Healthcare\n Conference Includes supplemental: 32 pages, 17 figures"},{"id":"http://arxiv.org/abs/2305.11509v4","updated":"2023-08-10T15:01:16Z","published":"2023-05-19T08:18:49Z","title":"From Random Search to Bandit Learning in Metric Measure Spaces","summary":" Random Search is one of the most widely-used method for Hyperparameter\nOptimization, and is critical to the success of deep learning models. Despite\nits astonishing performance, little non-heuristic theory has been developed to\ndescribe the underlying working mechanism. This paper gives a theoretical\naccounting of Random Search. We introduce the concept of \\emph{scattering\ndimension} that describes the landscape of the underlying function, and\nquantifies the performance of random search. We show that, when the environment\nis noise-free, the output of random search converges to the optimal value in\nprobability at rate $ \\widetilde{\\mathcal{O}} \\left( \\left( \\frac{1}{T}\n\\right)^{ \\frac{1}{d_s} } \\right) $, where $ d_s \\ge 0 $ is the scattering\ndimension of the underlying function. When the observed function values are\ncorrupted by bounded $iid$ noise, the output of random search converges to the\noptimal value in probability at rate $ \\widetilde{\\mathcal{O}} \\left( \\left(\n\\frac{1}{T} \\right)^{ \\frac{1}{d_s + 1} } \\right) $. 
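Plain Random Search, whose convergence rates the paper above analyzes, takes only a few lines: sample configurations independently from the search space and keep the best observed value. An illustrative sketch with an assumed toy objective:

```python
# Plain random search over a box-shaped hyperparameter space (illustrative).
import random

def objective(lr, dropout):            # assumed black-box validation score
    return -(lr - 0.01) ** 2 - (dropout - 0.3) ** 2

best, best_cfg = float("-inf"), None
for _ in range(1000):
    cfg = {"lr": random.uniform(1e-4, 1e-1), "dropout": random.uniform(0.0, 0.8)}
    score = objective(**cfg)
    if score > best:
        best, best_cfg = score, cfg

print(best_cfg)   # lr near 0.01, dropout near 0.3
```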
In addition, based on the\nprinciples of random search, we introduce an algorithm, called BLiN-MOS, for\nLipschitz bandits in doubling metric spaces that are also endowed with a\nprobability measure, and show that BLiN-MOS achieves a regret rate of order $\n\\widetilde{\\mathcal{O}} \\left( T^{ \\frac{d_z}{d_z + 1} } \\right) $, where $d_z$\nis the zooming dimension of the problem instance.\n","authors":["Chuying Han","Yasong Feng","Tianyu Wang"],"pdf_url":"https://arxiv.org/pdf/2305.11509v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12932v2","updated":"2023-08-10T14:44:01Z","published":"2023-05-22T11:25:24Z","title":"Forecasting Irregularly Sampled Time Series using Graphs","summary":" Forecasting irregularly sampled time series with missing values is a crucial\ntask for numerous real-world applications such as healthcare, astronomy, and\nclimate sciences. State-of-the-art approaches to this problem rely on Ordinary\nDifferential Equations (ODEs) which are known to be slow and often require\nadditional features to handle missing values. To address this issue, we propose\na novel model using Graphs for Forecasting Irregularly Sampled Time Series with\nmissing values which we call GraFITi. GraFITi first converts the time series to\na Sparsity Structure Graph which is a sparse bipartite graph, and then\nreformulates the forecasting problem as the edge weight prediction task in the\ngraph. It uses the power of Graph Neural Networks to learn the graph and\npredict the target edge weights. GraFITi has been tested on 3 real-world and 1\nsynthetic irregularly sampled time series dataset with missing values and\ncompared with various state-of-the-art models. The experimental results\ndemonstrate that GraFITi improves the forecasting accuracy by up to 17% and\nreduces the run time up to 5 times compared to the state-of-the-art forecasting\nmodels.\n","authors":["Vijaya Krishna Yalavarthi","Kiran Madhusudhanan","Randolf Sholz","Nourhan Ahmed","Johannes Burchert","Shayan Jawed","Stefan Born","Lars Schmidt-Thieme"],"pdf_url":"https://arxiv.org/pdf/2305.12932v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00790v4","updated":"2023-08-10T14:26:00Z","published":"2022-12-30T17:19:00Z","title":"Online learning techniques for prediction of temporal tabular datasets\n with regime changes","summary":" The application of deep learning to non-stationary temporal datasets can lead\nto overfitted models that underperform under regime changes. In this work, we\npropose a modular machine learning pipeline for ranking predictions on temporal\npanel datasets which is robust under regime changes. The modularity of the\npipeline allows the use of different models, including Gradient Boosting\nDecision Trees (GBDTs) and Neural Networks, with and without feature\nengineering. We evaluate our framework on financial data for stock portfolio\nprediction, and find that GBDT models with dropout display high performance,\nrobustness and generalisability with reduced complexity and computational cost.\nWe then demonstrate how online learning techniques, which require no retraining\nof models, can be used post-prediction to enhance the results. First, we show\nthat dynamic feature projection improves robustness by reducing drawdown in\nregime changes. Second, we demonstrate that dynamical model ensembling based on\nselection of models with good recent performance leads to improved Sharpe and\nCalmar ratios of out-of-sample predictions. 
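The graph construction behind GraFITi (turning an irregularly sampled multivariate series into a sparse bipartite graph whose edges are the observed values) can be sketched with plain data structures. The following is an illustrative reading of that construction, not the authors' code; forecasting then amounts to predicting the weights of the query edges.

```python
# Illustrative sketch: an irregularly sampled series as a sparse bipartite graph.
# observations: (timestamp, channel, value); missing entries simply never appear.
observations = [(0.0, "heart_rate", 82.0), (0.4, "temp", 37.1),
                (1.3, "heart_rate", 90.0), (2.0, "spo2", 0.97)]

time_nodes = sorted({t for t, _, _ in observations})
channel_nodes = sorted({c for _, c, _ in observations})
edges = [(t, c, v) for t, c, v in observations]      # edge weight = observed value

# Forecasting = predicting weights for "query" edges at future timestamps.
query_edges = [(3.0, c) for c in channel_nodes]
print(len(time_nodes), len(channel_nodes), len(edges), query_edges)
```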
We also evaluate the robustness of\nour pipeline across different data splits and random seeds with good\nreproducibility.\n","authors":["Thomas Wong","Mauricio Barahona"],"pdf_url":"https://arxiv.org/pdf/2301.00790v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05601v1","updated":"2023-08-10T14:20:43Z","published":"2023-08-10T14:20:43Z","title":"Multi-graph Spatio-temporal Graph Convolutional Network for Traffic Flow\n Prediction","summary":" Inter-city highway transportation is significant for urban life. As one of\nthe key functions in intelligent transportation system (ITS), traffic\nevaluation always plays significant role nowadays, and daily traffic flow\nprediction still faces challenges at network-wide toll stations. On the one\nhand, the data imbalance in practice among various locations deteriorates the\nperformance of prediction. On the other hand, complex correlative\nspatio-temporal factors cannot be comprehensively employed in long-term\nduration. In this paper, a prediction method is proposed for daily traffic flow\nin highway domain through spatio-temporal deep learning. In our method, data\nnormalization strategy is used to deal with data imbalance, due to long-tail\ndistribution of traffic flow at network-wide toll stations. And then, based on\ngraph convolutional network, we construct networks in distinct semantics to\ncapture spatio-temporal features. Beside that, meteorology and calendar\nfeatures are used by our model in the full connection stage to extra external\ncharacteristics of traffic flow. By extensive experiments and case studies in\none Chinese provincial highway, our method shows clear improvement in\npredictive accuracy than baselines and practical benefits in business.\n","authors":["Weilong Ding","Tianpu Zhang","Jianwu Wang","Zhuofeng Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.05601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05600v1","updated":"2023-08-10T14:19:58Z","published":"2023-08-10T14:19:58Z","title":"NUPES : Non-Uniform Post-Training Quantization via Power Exponent Search","summary":" Deep neural network (DNN) deployment has been confined to larger hardware\ndevices due to their expensive computational requirements. This challenge has\nrecently reached another scale with the emergence of large language models\n(LLMs). In order to reduce both their memory footprint and latency, a promising\ntechnique is quantization. It consists in converting floating point\nrepresentations to low bit-width fixed point representations, usually by\nassuming a uniform mapping onto a regular grid. This process, referred to in\nthe literature as uniform quantization, may however be ill-suited as most DNN\nweights and activations follow a bell-shaped distribution. This is even worse\non LLMs whose weight distributions are known to exhibit large, high impact,\noutlier values. In this work, we propose an improvement over the most commonly\nadopted way to tackle this limitation in deep learning models quantization,\nnamely, non-uniform quantization. NUPES leverages automorphisms to preserve the\nscalar multiplications. Such transformations are derived from power functions.\nHowever, the optimization of the exponent parameter and weight values remains a\nchallenging and novel problem which could not be solved with previous post\ntraining optimization techniques which only learn to round up or down weight\nvalues in order to preserve the predictive function. 
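The contrast between uniform quantization and a power-exponent (non-uniform) mapping of the kind NUPES searches over can be illustrated numerically. The snippet below uses an assumed formulation (map |w| through a power function before rounding on a regular grid) and is not the paper's algorithm; it shows how an exponent below one spends more quantization levels near zero, where bell-shaped weight distributions concentrate.

```python
# Illustrative uniform vs. power-exponent quantization of a weight tensor
# (assumed formulation, not the NUPES algorithm itself).
import numpy as np

def quantize(w, bits=4, alpha=1.0):
    """alpha = 1.0 gives uniform quantization; alpha < 1 bends the grid."""
    levels = 2 ** (bits - 1) - 1
    scale = np.abs(w).max()
    normalized = (np.abs(w) / scale) ** alpha             # power mapping
    q = np.round(normalized * levels)                      # integer code
    dequant = (q / levels) ** (1.0 / alpha) * scale        # invert the mapping
    return np.sign(w) * dequant

rng = np.random.default_rng(0)
w = rng.normal(scale=0.05, size=10_000)                    # bell-shaped weights
for alpha in (1.0, 0.5):
    err = np.mean((w - quantize(w, alpha=alpha)) ** 2)
    print(f"alpha={alpha}: mse={err:.2e}")
```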
We circumvent this\nlimitation with a new paradigm: learning new quantized weights over the entire\nquantized space. Similarly, we enable the optimization of the power exponent,\ni.e. the optimization of the quantization operator itself during training by\nalleviating all the numerical instabilities. The resulting predictive function\nis compatible with integer-only low-bit inference. We show the ability of the\nmethod to achieve state-of-the-art compression rates in both, data-free and\ndata-driven configurations.\n","authors":["Edouard Yvinec","Arnaud Dapogny","Kevin Bailly"],"pdf_url":"https://arxiv.org/pdf/2308.05600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05011v2","updated":"2023-08-10T14:08:26Z","published":"2023-08-09T15:10:53Z","title":"Multi-Class Deep SVDD: Anomaly Detection Approach in Astronomy with\n Distinct Inlier Categories","summary":" With the increasing volume of astronomical data generated by modern survey\ntelescopes, automated pipelines and machine learning techniques have become\ncrucial for analyzing and extracting knowledge from these datasets. Anomaly\ndetection, i.e. the task of identifying irregular or unexpected patterns in the\ndata, is a complex challenge in astronomy. In this paper, we propose\nMulti-Class Deep Support Vector Data Description (MCDSVDD), an extension of the\nstate-of-the-art anomaly detection algorithm One-Class Deep SVDD, specifically\ndesigned to handle different inlier categories with distinct data\ndistributions. MCDSVDD uses a neural network to map the data into hyperspheres,\nwhere each hypersphere represents a specific inlier category. The distance of\neach sample from the centers of these hyperspheres determines the anomaly\nscore. We evaluate the effectiveness of MCDSVDD by comparing its performance\nwith several anomaly detection algorithms on a large dataset of astronomical\nlight-curves obtained from the Zwicky Transient Facility. Our results\ndemonstrate the efficacy of MCDSVDD in detecting anomalous sources while\nleveraging the presence of different inlier categories. The code and the data\nneeded to reproduce our results are publicly available at\nhttps://github.com/mperezcarrasco/AnomalyALeRCE.\n","authors":["Manuel Pérez-Carrasco","Guillermo Cabrera-Vives","Lorena Hernández-García","Francisco Forster","Paula Sánchez-Sáez","Alejandra Muñoz Arancibia","Nicolás Astorga","Franz Bauer","Amelia Bayo","Martina Cádiz-Leyton","Marcio Catelan"],"pdf_url":"https://arxiv.org/pdf/2308.05011v2.pdf","comment":"Accepted to ICML 2023 Workshop on Machine Learning for Astrophysics"},{"id":"http://arxiv.org/abs/2308.05575v1","updated":"2023-08-10T13:39:19Z","published":"2023-08-10T13:39:19Z","title":"Symmetry Defense Against XGBoost Adversarial Perturbation Attacks","summary":" We examine whether symmetry can be used to defend tree-based ensemble\nclassifiers such as gradient-boosting decision trees (GBDTs) against\nadversarial perturbation attacks. The idea is based on a recent symmetry\ndefense for convolutional neural network classifiers (CNNs) that utilizes CNNs'\nlack of invariance with respect to symmetries. CNNs lack invariance because\nthey can classify a symmetric sample, such as a horizontally flipped image,\ndifferently from the original sample. CNNs' lack of invariance also means that\nCNNs can classify symmetric adversarial samples differently from the incorrect\nclassification of adversarial samples. 
Using CNNs' lack of invariance, the\nrecent CNN symmetry defense has shown that the classification of symmetric\nadversarial samples reverts to the correct sample classification. In order to\napply the same symmetry defense to GBDTs, we examine GBDT invariance and are\nthe first to show that GBDTs also lack invariance with respect to symmetries.\nWe apply and evaluate the GBDT symmetry defense for nine datasets against six\nperturbation attacks with a threat model that ranges from zero-knowledge to\nperfect-knowledge adversaries. Using the feature inversion symmetry against\nzero-knowledge adversaries, we achieve up to 100% accuracy on adversarial\nsamples even when default and robust classifiers have 0% accuracy. Using the\nfeature inversion and horizontal flip symmetries against perfect-knowledge\nadversaries, we achieve up to over 95% accuracy on adversarial samples for the\nGBDT classifier of the F-MNIST dataset even when default and robust classifiers\nhave 0% accuracy.\n","authors":["Blerta Lindqvist"],"pdf_url":"https://arxiv.org/pdf/2308.05575v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2303.07925v7","updated":"2023-08-10T13:29:37Z","published":"2023-03-14T14:10:37Z","title":"Deep incremental learning models for financial temporal tabular datasets\n with distribution shifts","summary":" We present a robust deep incremental learning framework for regression tasks\non financial temporal tabular datasets which is built upon the incremental use\nof commonly available tabular and time series prediction models to adapt to\ndistributional shifts typical of financial datasets. The framework uses a\nsimple basic building block (decision trees) to build self-similar models of\nany required complexity to deliver robust performance under adverse situations\nsuch as regime changes, fat-tailed distributions, and low signal-to-noise\nratios. As a detailed study, we demonstrate our scheme using XGBoost models\ntrained on the Numerai dataset and show that a two layer deep ensemble of\nXGBoost models over different model snapshots delivers high quality predictions\nunder different market regimes. We also show that the performance of XGBoost\nmodels with different number of boosting rounds in three scenarios (small,\nstandard and large) is monotonically increasing with respect to model size and\nconverges towards the generalisation upper bound. We also evaluate the\nrobustness of the model under variability of different hyperparameters, such as\nmodel complexity and data sampling settings. Our model has low hardware\nrequirements as no specialised neural architectures are used and each base\nmodel can be independently trained in parallel.\n","authors":["Thomas Wong","Mauricio Barahona"],"pdf_url":"https://arxiv.org/pdf/2303.07925v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05566v1","updated":"2023-08-10T13:28:59Z","published":"2023-08-10T13:28:59Z","title":"AutoGluon-TimeSeries: AutoML for Probabilistic Time Series Forecasting","summary":" We introduce AutoGluon-TimeSeries - an open-source AutoML library for\nprobabilistic time series forecasting. Focused on ease of use and robustness,\nAutoGluon-TimeSeries enables users to generate accurate point and quantile\nforecasts with just 3 lines of Python code. 
Built on the design philosophy of\nAutoGluon, AutoGluon-TimeSeries leverages ensembles of diverse forecasting\nmodels to deliver high accuracy within a short training time.\nAutoGluon-TimeSeries combines both conventional statistical models,\nmachine-learning based forecasting approaches, and ensembling techniques. In\nour evaluation on 29 benchmark datasets, AutoGluon-TimeSeries demonstrates\nstrong empirical performance, outperforming a range of forecasting methods in\nterms of both point and quantile forecast accuracy, and often even improving\nupon the best-in-hindsight combination of prior methods.\n","authors":["Oleksandr Shchur","Caner Turkmen","Nick Erickson","Huibin Shen","Alexander Shirkov","Tony Hu","Yuyang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05566v1.pdf","comment":"Published at AutoML Conference 2023"},{"id":"http://arxiv.org/abs/2308.05564v1","updated":"2023-08-10T13:24:45Z","published":"2023-08-10T13:24:45Z","title":"Efficient Variational Inference for Large Skew-t Copulas with\n Application to Intraday Equity Returns","summary":" Large skew-t factor copula models are attractive for the modeling of\nfinancial data because they allow for asymmetric and extreme tail dependence.\nWe show that the copula implicit in the skew-t distribution of Azzalini and\nCapitanio (2003) allows for a higher level of pairwise asymmetric dependence\nthan two popular alternative skew-t copulas. Estimation of this copula in high\ndimensions is challenging, and we propose a fast and accurate Bayesian\nvariational inference (VI) approach to do so. The method uses a conditionally\nGaussian generative representation of the skew-t distribution to define an\naugmented posterior that can be approximated accurately. A fast stochastic\ngradient ascent algorithm is used to solve the variational optimization. The\nnew methodology is used to estimate copula models for intraday returns from\n2017 to 2021 on 93 U.S. equities. The copula captures substantial heterogeneity\nin asymmetric dependence over equity pairs, in addition to the variability in\npairwise correlations. We show that intraday predictive densities from the\nskew-t copula are more accurate than from some other copula models, while\nportfolio selection strategies based on the estimated pairwise tail\ndependencies improve performance relative to the benchmark index.\n","authors":["Lin Deng","Michael Stanley Smith","Worapree Maneesoonthorn"],"pdf_url":"https://arxiv.org/pdf/2308.05564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06024v3","updated":"2023-08-10T13:03:03Z","published":"2023-03-10T16:23:56Z","title":"A hybrid deep-learning-metaheuristic framework for bi-level network\n design problems","summary":" This study proposes a hybrid deep-learning-metaheuristic framework with a\nbi-level architecture for road network design problems (NDPs). We train a graph\nneural network (GNN) to approximate the solution of the user equilibrium (UE)\ntraffic assignment problem and use inferences made by the trained model to\ncalculate fitness function evaluations of a genetic algorithm (GA) to\napproximate solutions for NDPs. Using three test networks, two NDP variants and\nan exact solver as benchmark, we show that on average, our proposed framework\ncan provide solutions within 1.5% gap of the best results in less than 0.5% of\nthe time used by the exact solution procedure. 
Our framework can be utilized\nwithin an expert system for infrastructure planning to determine the best\ninfrastructure planning and management decisions under different scenarios.\nGiven the flexibility of the framework, it can easily be adapted to many other\ndecision problems that can be modeled as bi-level problems on graphs. Moreover,\nwe foresee interesting future research directions; thus, we also put forward a\nbrief research agenda for this topic. The key observation from our research\nthat can shape future research is that the fitness function evaluation time\nusing the inferences made by the GNN model was in the order of milliseconds,\nwhich points to an opportunity and a need for novel heuristics that 1) can cope\nwell with noisy fitness function values provided by deep learning models, and\n2) can use the significantly enlarged efficiency of the evaluation step to\nexplore the search space effectively (rather than efficiently). This opens a\nnew avenue for a modern class of metaheuristics that are crafted for use with\nAI-powered predictors.\n","authors":["Bahman Madadi","Goncalo Homem de Almeida Correia"],"pdf_url":"https://arxiv.org/pdf/2303.06024v3.pdf","comment":"Two case studies added, intro, discussion and conclusion extended,\n details added to method and experiments, typos fixed, title revised,\n references added"},{"id":"http://arxiv.org/abs/2210.04087v3","updated":"2023-08-10T12:42:06Z","published":"2022-10-08T18:49:58Z","title":"Symmetry Defense Against CNN Adversarial Perturbation Attacks","summary":" This paper uses symmetry to make Convolutional Neural Network classifiers\n(CNNs) robust against adversarial perturbation attacks. Such attacks add\nperturbation to original images to generate adversarial images that fool\nclassifiers such as road sign classifiers of autonomous vehicles. Although\nsymmetry is a pervasive aspect of the natural world, CNNs are unable to handle\nsymmetry well. For example, a CNN can classify an image differently from its\nmirror image. For an adversarial image that misclassifies with a wrong label\n$l_w$, CNN inability to handle symmetry means that a symmetric adversarial\nimage can classify differently from the wrong label $l_w$. Further than that,\nwe find that the classification of a symmetric adversarial image reverts to the\ncorrect label. To classify an image when adversaries are unaware of the\ndefense, we apply symmetry to the image and use the classification label of the\nsymmetric image. To classify an image when adversaries are aware of the\ndefense, we use mirror symmetry and pixel inversion symmetry to form a symmetry\ngroup. We apply all the group symmetries to the image and decide on the output\nlabel based on the agreement of any two of the classification labels of the\nsymmetry images. Adaptive attacks fail because they need to rely on loss\nfunctions that use conflicting CNN output values for symmetric images. Without\nattack knowledge, the proposed symmetry defense succeeds against both\ngradient-based and random-search attacks, with up to near-default accuracies\nfor ImageNet. 
The defense even improves the classification accuracy of original\nimages.\n","authors":["Blerta Lindqvist"],"pdf_url":"https://arxiv.org/pdf/2210.04087v3.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2301.05869v2","updated":"2023-08-10T12:35:11Z","published":"2023-01-14T09:41:21Z","title":"Functional Neural Networks: Shift invariant models for functional data\n with applications to EEG classification","summary":" It is desirable for statistical models to detect signals of interest\nindependently of their position. If the data is generated by some smooth\nprocess, this additional structure should be taken into account. We introduce a\nnew class of neural networks that are shift invariant and preserve smoothness\nof the data: functional neural networks (FNNs). For this, we use methods from\nfunctional data analysis (FDA) to extend multi-layer perceptrons and\nconvolutional neural networks to functional data. We propose different model\narchitectures, show that the models outperform a benchmark model from FDA in\nterms of accuracy and successfully use FNNs to classify electroencephalography\n(EEG) data.\n","authors":["Florian Heinrichs","Mavin Heim","Corinna Weber"],"pdf_url":"https://arxiv.org/pdf/2301.05869v2.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2305.19170v2","updated":"2023-08-10T12:26:00Z","published":"2023-05-30T16:15:57Z","title":"Forward-Forward Training of an Optical Neural Network","summary":" Neural networks (NN) have demonstrated remarkable capabilities in various\ntasks, but their computation-intensive nature demands faster and more\nenergy-efficient hardware implementations. Optics-based platforms, using\ntechnologies such as silicon photonics and spatial light modulators, offer\npromising avenues for achieving this goal. However, training multiple trainable\nlayers in tandem with these physical systems poses challenges, as they are\ndifficult to fully characterize and describe with differentiable functions,\nhindering the use of error backpropagation algorithm. The recently introduced\nForward-Forward Algorithm (FFA) eliminates the need for perfect\ncharacterization of the learning system and shows promise for efficient\ntraining with large numbers of programmable parameters. The FFA does not\nrequire backpropagating an error signal to update the weights, rather the\nweights are updated by only sending information in one direction. The local\nloss function for each set of trainable weights enables low-power analog\nhardware implementations without resorting to metaheuristic algorithms or\nreinforcement learning. In this paper, we present an experiment utilizing\nmultimode nonlinear wave propagation in an optical fiber demonstrating the\nfeasibility of the FFA approach using an optical system. The results show that\nincorporating optical transforms in multilayer NN architectures trained with\nthe FFA, can lead to performance improvements, even with a relatively small\nnumber of trainable weights. 
The proposed method offers a new path to the\nchallenge of training optical NNs and provides insights into leveraging\nphysical transformations for enhancing NN performance.\n","authors":["Ilker Oguz","Junjie Ke","Qifei Wang","Feng Yang","Mustafa Yildirim","Niyazi Ulas Dinc","Jih-Liang Hsieh","Christophe Moser","Demetri Psaltis"],"pdf_url":"https://arxiv.org/pdf/2305.19170v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10382v4","updated":"2023-08-10T12:21:41Z","published":"2023-04-19T17:02:28Z","title":"Conditional Generative Models for Learning Stochastic Processes","summary":" A framework to learn a multi-modal distribution is proposed, denoted as the\nConditional Quantum Generative Adversarial Network (C-qGAN). The neural network\nstructure is strictly within a quantum circuit and, as a consequence, is shown\nto represent a more efficient state preparation procedure than current methods.\nThis methodology has the potential to speed-up algorithms, such as Monte Carlo\nanalysis. In particular, after demonstrating the effectiveness of the network\nin the learning task, the technique is applied to price Asian option\nderivatives, providing the foundation for further research on other\npath-dependent options.\n","authors":["Salvatore Certo","Anh Pham","Nicolas Robles","Andrew Vlasic"],"pdf_url":"https://arxiv.org/pdf/2304.10382v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05525v1","updated":"2023-08-10T12:06:03Z","published":"2023-08-10T12:06:03Z","title":"Critical Points ++: An Agile Point Cloud Importance Measure for Robust\n Classification, Adversarial Defense and Explainable AI","summary":" The ability to cope accurately and fast with Out-Of-Distribution (OOD)\nsamples is crucial in real-world safety demanding applications. In this work we\nfirst study the interplay between critical points of 3D point clouds and OOD\nsamples. Our findings are that common corruptions and outliers are often\ninterpreted as critical points. We generalize the notion of critical points\ninto importance measures. We show that training a classification network based\nonly on less important points dramatically improves robustness, at a cost of\nminor performance loss on the clean set. We observe that normalized entropy is\nhighly informative for corruption analysis. An adaptive threshold based on\nnormalized entropy is suggested for selecting the set of uncritical points. Our\nproposed importance measure is extremely fast to compute. We show it can be\nused for a variety of applications, such as Explainable AI (XAI), Outlier\nRemoval, Uncertainty Estimation, Robust Classification and Adversarial Defense.\nWe reach SOTA results on the two latter tasks.\n","authors":["Meir Yossef Levi","Guy Gilboa"],"pdf_url":"https://arxiv.org/pdf/2308.05525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05522v1","updated":"2023-08-10T12:04:47Z","published":"2023-08-10T12:04:47Z","title":"Models Matter: The Impact of Single-Step Retrosynthesis on Synthesis\n Planning","summary":" Retrosynthesis consists of breaking down a chemical compound recursively\nstep-by-step into molecular precursors until a set of commercially available\nmolecules is found with the goal to provide a synthesis route. Its two primary\nresearch directions, single-step retrosynthesis prediction, which models the\nchemical reaction logic, and multi-step synthesis planning, which tries to find\nthe correct sequence of reactions, are inherently intertwined. Still, this\nconnection is not reflected in contemporary research. 
In this work, we combine\nthese two major research directions by applying multiple single-step\nretrosynthesis models within multi-step synthesis planning and analyzing their\nimpact using public and proprietary reaction data. We find a disconnection\nbetween high single-step performance and potential route-finding success,\nsuggesting that single-step models must be evaluated within synthesis planning\nin the future. Furthermore, we show that the commonly used single-step\nretrosynthesis benchmark dataset USPTO-50k is insufficient as this evaluation\ntask does not represent model performance and scalability on larger and more\ndiverse datasets. For multi-step synthesis planning, we show that the choice of\nthe single-step model can improve the overall success rate of synthesis\nplanning by up to +28% compared to the commonly used baseline model. Finally,\nwe show that each single-step model finds unique synthesis routes, and differs\nin aspects such as route-finding success, the number of found synthesis routes,\nand chemical validity, making the combination of single-step retrosynthesis\nprediction and multi-step synthesis planning a crucial aspect when developing\nfuture methods.\n","authors":["Paula Torren-Peraire","Alan Kai Hassen","Samuel Genheden","Jonas Verhoeven","Djork-Arne Clevert","Mike Preuss","Igor Tetko"],"pdf_url":"https://arxiv.org/pdf/2308.05522v1.pdf","comment":"The following authors contributed equally: Paula Torren-Peraire, Alan\n Kai Hassen"},{"id":"http://arxiv.org/abs/2308.05509v1","updated":"2023-08-10T11:42:09Z","published":"2023-08-10T11:42:09Z","title":"On the Optimal Expressive Power of ReLU DNNs and Its Application in\n Approximation with Kolmogorov Superposition Theorem","summary":" This paper is devoted to studying the optimal expressive power of ReLU deep\nneural networks (DNNs) and its application in approximation via the Kolmogorov\nSuperposition Theorem. We first constructively prove that any continuous\npiecewise linear functions on $[0,1]$, comprising $O(N^2L)$ segments, can be\nrepresented by ReLU DNNs with $L$ hidden layers and $N$ neurons per layer.\nSubsequently, we demonstrate that this construction is optimal regarding the\nparameter count of the DNNs, achieved through investigating the shattering\ncapacity of ReLU DNNs. Moreover, by invoking the Kolmogorov Superposition\nTheorem, we achieve an enhanced approximation rate for ReLU DNNs of arbitrary\nwidth and depth when dealing with continuous functions in high-dimensional\nspaces.\n","authors":["Juncai He"],"pdf_url":"https://arxiv.org/pdf/2308.05509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07944v2","updated":"2023-08-10T11:05:23Z","published":"2023-07-16T04:34:11Z","title":"Revisiting Domain-Adaptive 3D Object Detection by Reliable, Diverse and\n Class-balanced Pseudo-Labeling","summary":" Unsupervised domain adaptation (DA) with the aid of pseudo labeling\ntechniques has emerged as a crucial approach for domain-adaptive 3D object\ndetection. While effective, existing DA methods suffer from a substantial drop\nin performance when applied to a multi-class training setting, due to the\nco-existence of low-quality pseudo labels and class imbalance issues. In this\npaper, we address this challenge by proposing a novel ReDB framework tailored\nfor learning to detect all classes at once. Our approach produces Reliable,\nDiverse, and class-Balanced pseudo 3D boxes to iteratively guide the\nself-training on a distributionally different target domain. 
To alleviate\ndisruptions caused by the environmental discrepancy (e.g., beam numbers), the\nproposed cross-domain examination (CDE) assesses the correctness of pseudo\nlabels by copy-pasting target instances into a source environment and measuring\nthe prediction consistency. To reduce computational overhead and mitigate the\nobject shift (e.g., scales and point densities), we design an overlapped boxes\ncounting (OBC) metric that allows to uniformly downsample pseudo-labeled\nobjects across different geometric characteristics. To confront the issue of\ninter-class imbalance, we progressively augment the target point clouds with a\nclass-balanced set of pseudo-labeled target instances and source objects, which\nboosts recognition accuracies on both frequently appearing and rare classes.\nExperimental results on three benchmark datasets using both voxel-based (i.e.,\nSECOND) and point-based 3D detectors (i.e., PointRCNN) demonstrate that our\nproposed ReDB approach outperforms existing 3D domain adaptation methods by a\nlarge margin, improving 23.15% mAP on the nuScenes $\\rightarrow$ KITTI task.\nThe code is available at https://github.com/zhuoxiao-chen/ReDB-DA-3Ddet.\n","authors":["Zhuoxiao Chen","Yadan Luo","Zheng Wang","Mahsa Baktashmotlagh","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2307.07944v2.pdf","comment":"Accepted by ICCV 2023, camera-ready"},{"id":"http://arxiv.org/abs/2308.05483v1","updated":"2023-08-10T10:19:48Z","published":"2023-08-10T10:19:48Z","title":"Quality Diversity under Sparse Reward and Sparse Interaction:\n Application to Grasping in Robotics","summary":" Quality-Diversity (QD) methods are algorithms that aim to generate a set of\ndiverse and high-performing solutions to a given problem. Originally developed\nfor evolutionary robotics, most QD studies are conducted on a limited set of\ndomains - mainly applied to locomotion, where the fitness and the behavior\nsignal are dense. Grasping is a crucial task for manipulation in robotics.\nDespite the efforts of many research communities, this task is yet to be\nsolved. Grasping cumulates unprecedented challenges in QD literature: it\nsuffers from reward sparsity, behavioral sparsity, and behavior space\nmisalignment. The present work studies how QD can address grasping. Experiments\nhave been conducted on 15 different methods on 10 grasping domains,\ncorresponding to 2 different robot-gripper setups and 5 standard objects. An\nevaluation framework that distinguishes the evaluation of an algorithm from its\ninternal components has also been proposed for a fair comparison. The obtained\nresults show that MAP-Elites variants that select successful solutions in\npriority outperform all the compared methods on the studied metrics by a large\nmargin. We also found experimental evidence that sparse interaction can lead to\ndeceptive novelty. To our knowledge, the ability to efficiently produce\nexamples of grasping trajectories demonstrated in this work has no precedent in\nthe literature.\n","authors":["J. Huber","F. Hélénon","M. Coninx","F. Ben Amar","S. Doncieux"],"pdf_url":"https://arxiv.org/pdf/2308.05483v1.pdf","comment":"37 pages, 17 figures. Draft version"},{"id":"http://arxiv.org/abs/2308.05481v1","updated":"2023-08-10T10:12:43Z","published":"2023-08-10T10:12:43Z","title":"LLM As DBA","summary":" Database administrators (DBAs) play a crucial role in managing, maintaining\nand optimizing a database system to ensure data availability, performance, and\nreliability. 
However, it is hard and tedious for DBAs to manage a large number\nof database instances (e.g., millions of instances on the cloud databases).\nRecently, large language models (LLMs) have shown great potential to understand\nvaluable documents and accordingly generate reasonable answers. Thus, we\npropose D-Bot, an LLM-based database administrator that can continuously acquire\ndatabase maintenance experience from textual sources, and provide reasonable,\nwell-founded, in-time diagnosis and optimization advice for target databases.\nThis paper presents a revolutionary LLM-centric framework for database\nmaintenance, including (i) database maintenance knowledge detection from\ndocuments and tools, (ii) tree of thought reasoning for root cause analysis,\nand (iii) collaborative diagnosis among multiple LLMs. Our preliminary\nexperimental results show that D-Bot can efficiently and effectively diagnose the\nroot causes, and our code is available at\ngithub.com/TsinghuaDatabaseGroup/DB-GPT.\n","authors":["Xuanhe Zhou","Guoliang Li","Zhiyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.05481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05476v1","updated":"2023-08-10T10:07:00Z","published":"2023-08-10T10:07:00Z","title":"Exploring Machine Learning and Transformer-based Approaches for\n Deceptive Text Classification: A Comparative Analysis","summary":" Deceptive text classification is a critical task in natural language\nprocessing that aims to identify deceptive or fraudulent content. This study\npresents a comparative analysis of machine learning and transformer-based\napproaches for deceptive text classification. We investigate the effectiveness\nof traditional machine learning algorithms and state-of-the-art transformer\nmodels, such as BERT, XLNET, DistilBERT, and RoBERTa, in detecting deceptive\ntext. A labeled dataset consisting of deceptive and non-deceptive texts is used\nfor training and evaluation purposes. Through extensive experimentation, we\ncompare the performance metrics, including accuracy, precision, recall, and F1\nscore, of the different approaches. The results of this study shed light on the\nstrengths and limitations of machine learning and transformer-based methods for\ndeceptive text classification, enabling researchers and practitioners to make\ninformed decisions when dealing with deceptive content.\n","authors":["Anusuya Krishnan"],"pdf_url":"https://arxiv.org/pdf/2308.05476v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.05471v1","updated":"2023-08-10T09:52:44Z","published":"2023-08-10T09:52:44Z","title":"Provably Efficient Algorithm for Nonstationary Low-Rank MDPs","summary":" Reinforcement learning (RL) under changing environment models many real-world\napplications via nonstationary Markov Decision Processes (MDPs), and hence\ngains considerable interest. However, theoretical studies on nonstationary MDPs\nin the literature have mainly focused on tabular and linear (mixture) MDPs,\nwhich do not capture the nature of unknown representation in deep RL. In this\npaper, we make the first effort to investigate nonstationary RL under episodic\nlow-rank MDPs, where both transition kernels and rewards may vary over time,\nand the low-rank model contains unknown representation in addition to the\nlinear state embedding function. 
We first propose a parameter-dependent policy\noptimization algorithm called PORTAL, and further improve PORTAL to its\nparameter-free version of Ada-PORTAL, which is able to tune its\nhyper-parameters adaptively without any prior knowledge of nonstationarity. For\nboth algorithms, we provide upper bounds on the average dynamic suboptimality\ngap, which show that as long as the nonstationarity is not significantly large,\nPORTAL and Ada-PORTAL are sample-efficient and can achieve arbitrarily small\naverage dynamic suboptimality gap with polynomial sample complexity.\n","authors":["Yuan Cheng","Jing Yang","Yingbin Liang"],"pdf_url":"https://arxiv.org/pdf/2308.05471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05463v1","updated":"2023-08-10T09:42:20Z","published":"2023-08-10T09:42:20Z","title":"$\\mathcal{G}^2Pxy$: Generative Open-Set Node Classification on Graphs\n with Proxy Unknowns","summary":" Node classification is the task of predicting the labels of unlabeled nodes\nin a graph. State-of-the-art methods based on graph neural networks achieve\nexcellent performance when all labels are available during training. But in\nreal-life, models are often applied on data with new classes, which can lead to\nmassive misclassification and thus significantly degrade performance. Hence,\ndeveloping open-set classification methods is crucial to determine if a given\nsample belongs to a known class. Existing methods for open-set node\nclassification generally use transductive learning with part or all of the\nfeatures of real unseen class nodes to help with open-set classification. In\nthis paper, we propose a novel generative open-set node classification method,\ni.e. $\\mathcal{G}^2Pxy$, which follows a stricter inductive learning setting\nwhere no information about unknown classes is available during training and\nvalidation. Two kinds of proxy unknown nodes, inter-class unknown proxies and\nexternal unknown proxies are generated via mixup to efficiently anticipate the\ndistribution of novel classes. Using the generated proxies, a closed-set\nclassifier can be transformed into an open-set one, by augmenting it with an\nextra proxy classifier. Under the constraints of both cross entropy loss and\ncomplement entropy loss, $\\mathcal{G}^2Pxy$ achieves superior effectiveness for\nunknown class detection and known class classification, which is validated by\nexperiments on benchmark graph datasets. 
Moreover, $\\mathcal{G}^2Pxy$ does not\nhave specific requirement on the GNN architecture and shows good\ngeneralizations.\n","authors":["Qin Zhang","Zelin Shi","Xiaolin Zhang","Xiaojun Chen","Philippe Fournier-Viger","Shirui Pan"],"pdf_url":"https://arxiv.org/pdf/2308.05463v1.pdf","comment":"8 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.05451v1","updated":"2023-08-10T09:17:07Z","published":"2023-08-10T09:17:07Z","title":"A Forecaster's Review of Judea Pearl's Causality: Models, Reasoning and\n Inference, Second Edition, 2009","summary":" With the big popularity and success of Judea Pearl's original causality book,\nthis review covers the main topics updated in the second edition in 2009 and\nillustrates an easy-to-follow causal inference strategy in a forecast scenario.\nIt further discusses some potential benefits and challenges for causal\ninference with time series forecasting when modeling the counterfactuals,\nestimating the uncertainty and incorporating prior knowledge to estimate causal\neffects in different forecasting scenarios.\n","authors":["Feng Li"],"pdf_url":"https://arxiv.org/pdf/2308.05451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06955v2","updated":"2023-08-10T08:35:56Z","published":"2023-06-12T08:37:38Z","title":"A Brief Review of Hypernetworks in Deep Learning","summary":" Hypernetworks, or hypernets in short, are neural networks that generate\nweights for another neural network, known as the target network. They have\nemerged as a powerful deep learning technique that allows for greater\nflexibility, adaptability, dynamism, faster training, information sharing, and\nmodel compression etc. Hypernets have shown promising results in a variety of\ndeep learning problems, including continual learning, causal inference,\ntransfer learning, weight pruning, uncertainty quantification, zero-shot\nlearning, natural language processing, and reinforcement learning etc. Despite\ntheir success across different problem settings, currently, there is no review\navailable to inform the researchers about the developments and to help in\nutilizing hypernets. To fill this gap, we review the progress in hypernets. We\npresent an illustrative example to train deep neural networks using hypernets\nand propose categorizing hypernets based on five design criteria as inputs,\noutputs, variability of inputs and outputs, and architecture of hypernets. We\nalso review applications of hypernets across different deep learning problem\nsettings, followed by a discussion of general scenarios where hypernets can be\neffectively employed. Finally, we discuss the challenges and future directions\nthat remain under-explored in the field of hypernets. We believe that\nhypernetworks have the potential to revolutionize the field of deep learning.\nThey offer a new way to design and train neural networks, and they have the\npotential to improve the performance of deep learning models on a variety of\ntasks. Through this review, we aim to inspire further advancements in deep\nlearning through hypernetworks.\n","authors":["Vinod Kumar Chauhan","Jiandong Zhou","Ping Lu","Soheila Molaei","David A. 
Clifton"],"pdf_url":"https://arxiv.org/pdf/2306.06955v2.pdf","comment":"revised categorisation, added new Section '5 When can we use\n Hypernets?', and other corrections(2 figures and 2 tables) (under review)"},{"id":"http://arxiv.org/abs/2107.02495v3","updated":"2023-08-10T08:16:52Z","published":"2021-07-06T09:24:57Z","title":"InfoNCE is variational inference in a recognition parameterised model","summary":" Here, we show that the InfoNCE objective is equivalent to the ELBO in a new\nclass of probabilistic generative model, the recognition parameterised model\n(RPM). When we learn the optimal prior, the RPM ELBO becomes equal to the\nmutual information (MI; up to a constant), establishing a connection to\npre-existing self-supervised learning methods such as InfoNCE. However,\npractical InfoNCE methods do not use the MI as an objective; the MI is\ninvariant to arbitrary invertible transformations, so using an MI objective can\nlead to highly entangled representations (Tschannen et al., 2019). Instead, the\nactual InfoNCE objective is a simplified lower bound on the MI which is loose\neven in the infinite sample limit. Thus, an objective that works (i.e. the\nactual InfoNCE objective) appears to be motivated as a loose bound on an\nobjective that does not work (i.e. the true MI which gives arbitrarily\nentangled representations). We give an alternative motivation for the actual\nInfoNCE objective. In particular, we show that in the infinite sample limit,\nand for a particular choice of prior, the actual InfoNCE objective is equal to\nthe ELBO (up to a constant); and the ELBO is equal to the marginal likelihood\nwith a deterministic recognition model. Thus, we argue that our VAE perspective\ngives a better motivation for InfoNCE than MI, as the actual InfoNCE objective\nis only loosely bounded by the MI, but is equal to the ELBO/marginal likelihood\n(up to a constant).\n","authors":["Laurence Aitchison","Stoil Ganev"],"pdf_url":"https://arxiv.org/pdf/2107.02495v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09738v7","updated":"2023-08-10T08:12:39Z","published":"2023-02-20T03:31:11Z","title":"Simplifying Momentum-based Positive-definite Submanifold Optimization\n with Applications to Deep Learning","summary":" Riemannian submanifold optimization with momentum is computationally\nchallenging because, to ensure that the iterates remain on the submanifold, we\noften need to solve difficult differential equations. Here, we simplify such\ndifficulties for a class of sparse or structured symmetric positive-definite\nmatrices with the affine-invariant metric. We do so by proposing a generalized\nversion of the Riemannian normal coordinates that dynamically orthonormalizes\nthe metric and locally converts the problem into an unconstrained problem in\nthe Euclidean space. We use our approach to simplify existing approaches for\nstructured covariances and develop matrix-inverse-free $2^\\text{nd}$-order\noptimizers for deep learning with low precision by using only matrix\nmultiplications. Code: https://github.com/yorkerlin/StructuredNGD-DL\n","authors":["Wu Lin","Valentin Duruisseaux","Melvin Leok","Frank Nielsen","Mohammad Emtiyaz Khan","Mark Schmidt"],"pdf_url":"https://arxiv.org/pdf/2302.09738v7.pdf","comment":"An updated version of the ICML 2023 paper. 
Updated the main text to\n emphasize challenges of using existing Riemannian methods to estimate sparse\n and structured SPD matrices"},{"id":"http://arxiv.org/abs/2308.05411v1","updated":"2023-08-10T08:12:17Z","published":"2023-08-10T08:12:17Z","title":"Explainable AI applications in the Medical Domain: a systematic review","summary":" Artificial Intelligence in Medicine has made significant progress with\nemerging applications in medical imaging, patient care, and other areas. While\nthese applications have proven successful in retrospective studies, very few of\nthem were applied in practice. The field of Medical AI faces various challenges\nin terms of building user trust, complying with regulations, and using data\nethically. Explainable AI (XAI) aims to enable humans to understand AI and trust\nits results. This paper presents a literature review on the recent developments\nof XAI solutions for medical decision support, based on a representative sample\nof 198 articles published in recent years. The systematic synthesis of the\nrelevant articles resulted in several findings. (1) model-agnostic XAI\ntechniques were mostly employed in these solutions, (2) deep learning models\nare utilized more than other types of machine learning models, (3)\nexplainability was applied to promote trust, but very few works reported the\nphysicians' participation in the loop, (4) a visual and interactive user interface\nis more useful in understanding the explanation and the recommendation of the\nsystem. More research is needed in collaboration between medical and AI\nexperts, which could guide the development of suitable frameworks for the\ndesign, implementation, and evaluation of XAI solutions in medicine.\n","authors":["Nicoletta Prentzas","Antonis Kakas","Constantinos S. Pattichis"],"pdf_url":"https://arxiv.org/pdf/2308.05411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05407v1","updated":"2023-08-10T08:03:58Z","published":"2023-08-10T08:03:58Z","title":"A Comparative Assessment of Multi-view fusion learning for Crop\n Classification","summary":" With a rapidly increasing amount and diversity of remote sensing (RS) data\nsources, there is a strong need for multi-view learning modeling. This is a\ncomplex task when considering the differences in resolution, magnitude, and\nnoise of RS data. The typical approach for merging multiple RS sources has been\ninput-level fusion, but other - more advanced - fusion strategies may\noutperform this traditional approach. This work assesses different fusion\nstrategies for crop classification in the CropHarvest dataset. The fusion\nmethods proposed in this work outperform models based on individual views and\nprevious fusion methods. We do not find one single fusion method that\nconsistently outperforms all other approaches. 
Instead, we present a comparison\nof multi-view fusion methods for three different datasets and show that,\ndepending on the test region, different methods obtain the best performance.\nDespite this, we suggest a preliminary criterion for the selection of fusion\nmethods.\n","authors":["Francisco Mena","Diego Arenas","Marlon Nuske","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2308.05407v1.pdf","comment":"Accepted at IEEE International Geoscience and Remote Sensing\n Symposium 2023"},{"id":"http://arxiv.org/abs/2304.12177v2","updated":"2023-08-10T07:59:33Z","published":"2023-04-24T15:38:22Z","title":"Π-ML: A dimensional analysis-based machine learning parameterization\n of optical turbulence in the atmospheric surface layer","summary":" Turbulent fluctuations of the atmospheric refraction index, so-called optical\nturbulence, can significantly distort propagating laser beams. Therefore,\nmodeling the strength of these fluctuations ($C_n^2$) is highly relevant for\nthe successful development and deployment of future free-space optical\ncommunication links. In this letter, we propose a physics-informed machine\nlearning (ML) methodology, $\\Pi$-ML, based on dimensional analysis and gradient\nboosting to estimate $C_n^2$. Through a systematic feature importance analysis,\nwe identify the normalized variance of potential temperature as the dominating\nfeature for predicting $C_n^2$. For statistical robustness, we train an\nensemble of models which yields high performance on the out-of-sample data of\n$R^2=0.958\\pm0.001$.\n","authors":["Maximilian Pierzyna","Rudolf Saathof","Sukanta Basu"],"pdf_url":"https://arxiv.org/pdf/2304.12177v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04365v3","updated":"2023-08-10T07:44:19Z","published":"2023-08-08T16:04:42Z","title":"SLEM: Machine Learning for Path Modeling and Causal Inference with Super\n Learner Equation Modeling","summary":" Causal inference is a crucial goal of science, enabling researchers to arrive\nat meaningful conclusions regarding the predictions of hypothetical\ninterventions using observational data. Path models, Structural Equation Models\n(SEMs), and, more generally, Directed Acyclic Graphs (DAGs), provide a means to\nunambiguously specify assumptions regarding the causal structure underlying a\nphenomenon. Unlike DAGs, which make very few assumptions about the functional\nand parametric form, SEM assumes linearity. This can result in functional\nmisspecification which prevents researchers from undertaking reliable effect\nsize estimation. In contrast, we propose Super Learner Equation Modeling, a\npath modeling technique integrating machine learning Super Learner ensembles.\nWe empirically demonstrate its ability to provide consistent and unbiased\nestimates of causal effects, its competitive performance for linear models when\ncompared with SEM, and highlight its superiority over SEM when dealing with\nnon-linear relationships. We provide open-source code, and a tutorial notebook\nwith example usage, accentuating the easy-to-use nature of the method.\n","authors":["Matthew J. 
Vowels"],"pdf_url":"https://arxiv.org/pdf/2308.04365v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03382v2","updated":"2023-08-10T07:38:35Z","published":"2023-08-07T08:03:20Z","title":"Enhancing Nucleus Segmentation with HARU-Net: A Hybrid Attention Based\n Residual U-Blocks Network","summary":" Nucleus image segmentation is a crucial step in the analysis, pathological\ndiagnosis, and classification, which heavily relies on the quality of nucleus\nsegmentation. However, the complexity of issues such as variations in nucleus\nsize, blurred nucleus contours, uneven staining, cell clustering, and\noverlapping cells poses significant challenges. Current methods for nucleus\nsegmentation primarily rely on nuclear morphology or contour-based approaches.\nNuclear morphology-based methods exhibit limited generalization ability and\nstruggle to effectively predict irregular-shaped nuclei, while contour-based\nextraction methods face challenges in accurately segmenting overlapping nuclei.\nTo address the aforementioned issues, we propose a dual-branch network using\nhybrid attention based residual U-blocks for nucleus instance segmentation. The\nnetwork simultaneously predicts target information and target contours.\nAdditionally, we introduce a post-processing method that combines the target\ninformation and target contours to distinguish overlapping nuclei and generate\nan instance segmentation image. Within the network, we propose a context fusion\nblock (CF-block) that effectively extracts and merges contextual information\nfrom the network. Extensive quantitative evaluations are conducted to assess\nthe performance of our method. Experimental results demonstrate the superior\nperformance of the proposed method compared to state-of-the-art approaches on\nthe BNS, MoNuSeg, CoNSeg, and CPM-17 datasets.\n","authors":["Junzhou Chen","Qian Huang","Yulin Chen","Linyi Qian","Chengyuan Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03382v2.pdf","comment":"Nucleus segmentation, Deep learning, Instance segmentation, Medical\n imaging, Dual-Branch network"},{"id":"http://arxiv.org/abs/2308.05390v1","updated":"2023-08-10T07:09:13Z","published":"2023-08-10T07:09:13Z","title":"Product Review Image Ranking for Fashion E-commerce","summary":" In a fashion e-commerce platform where customers can't physically examine the\nproducts on their own, being able to see other customers' text and image\nreviews of the product is critical while making purchase decisions. Given the\nhigh reliance on these reviews, over the years we have observed customers\nproactively sharing their reviews. With an increase in the coverage of User\nGenerated Content (UGC), there has been a corresponding increase in the number\nof customer images. It is thus imperative to display the most relevant images\non top as it may influence users' online shopping choices and behavior. In this\npaper, we propose a simple yet effective training procedure for ranking\ncustomer images. We created a dataset consisting of Myntra (A Major Indian\nFashion e-commerce company) studio posts and highly engaged (upvotes/downvotes)\nUGC images as our starting point and used selected distortion techniques on the\nimages of the above dataset to bring their quality at par with those of bad UGC\nimages. We train our network to rank bad-quality images lower than high-quality\nones. 
Our proposed method outperforms the baseline models on two metrics,\nnamely correlation coefficient, and accuracy, by substantial margins.\n","authors":["Sangeet Jaiswal","Dhruv Patel","Sreekanth Vempati","Konduru Saiswaroop"],"pdf_url":"https://arxiv.org/pdf/2308.05390v1.pdf","comment":"Accepted in Proceedings of ACM SIGIR Workshop on eCommerce (SIGIR\n eCom'22)"},{"id":"http://arxiv.org/abs/2303.06601v2","updated":"2023-08-10T06:53:11Z","published":"2023-03-12T08:05:30Z","title":"Multi-metrics adaptively identifies backdoors in Federated learning","summary":" The decentralized and privacy-preserving nature of federated learning (FL)\nmakes it vulnerable to backdoor attacks aiming to manipulate the behavior of\nthe resulting model on specific adversary-chosen inputs. However, most existing\ndefenses based on statistical differences take effect only against specific\nattacks, especially when the malicious gradients are similar to benign ones or\nthe data are highly non-independent and identically distributed (non-IID). In\nthis paper, we revisit the distance-based defense methods and discover that i)\nEuclidean distance becomes meaningless in high dimensions and ii) malicious\ngradients with diverse characteristics cannot be identified by a single metric.\nTo this end, we present a simple yet effective defense strategy with\nmulti-metrics and dynamic weighting to identify backdoors adaptively.\nFurthermore, our novel defense has no reliance on predefined assumptions over\nattack settings or data distributions and little impact on benign performance.\nTo evaluate the effectiveness of our approach, we conduct comprehensive\nexperiments on different datasets under various attack settings, where our\nmethod achieves the best defensive performance. For instance, we achieve the\nlowest backdoor accuracy of 3.06% under the difficult Edge-case PGD, showing\nsignificant superiority over previous defenses. The results also demonstrate\nthat our method can be well-adapted to a wide range of non-IID degrees without\nsacrificing the benign performance.\n","authors":["Siquan Huang","Yijiang Li","Chong Chen","Leyu Shi","Ying Gao"],"pdf_url":"https://arxiv.org/pdf/2303.06601v2.pdf","comment":"14 pages, 8 figures and 7 tables; 2023 IEEE/CVF International\n Conference on Computer Vision (ICCV)"},{"id":"http://arxiv.org/abs/2303.08902v2","updated":"2023-08-10T06:44:54Z","published":"2023-03-15T19:37:33Z","title":"Learning ground states of gapped quantum Hamiltonians with Kernel\n Methods","summary":" Neural network approaches to approximate the ground state of quantum\nhamiltonians require the numerical solution of a highly nonlinear optimization\nproblem. We introduce a statistical learning approach that makes the\noptimization trivial by using kernel methods. Our scheme is an approximate\nrealization of the power method, where supervised learning is used to learn the\nnext step of the power iteration. We show that the ground state properties of\narbitrary gapped quantum hamiltonians can be reached with polynomial resources\nunder the assumption that the supervised learning is efficient. 
Using kernel\nridge regression, we provide numerical evidence that the learning assumption is\nverified by applying our scheme to find the ground states of several\nprototypical interacting many-body quantum systems, both in one and two\ndimensions, showing the flexibility of our approach.\n","authors":["Clemens Giuliani","Filippo Vicentini","Riccardo Rossi","Giuseppe Carleo"],"pdf_url":"https://arxiv.org/pdf/2303.08902v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05374v1","updated":"2023-08-10T06:43:44Z","published":"2023-08-10T06:43:44Z","title":"Trustworthy LLMs: a Survey and Guideline for Evaluating Large Language\n Models' Alignment","summary":" Ensuring alignment, which refers to making models behave in accordance with\nhuman intentions [1,2], has become a critical task before deploying large\nlanguage models (LLMs) in real-world applications. For instance, OpenAI devoted\nsix months to iteratively aligning GPT-4 before its release [3]. However, a\nmajor challenge faced by practitioners is the lack of clear guidance on\nevaluating whether LLM outputs align with social norms, values, and\nregulations. This obstacle hinders systematic iteration and deployment of LLMs.\nTo address this issue, this paper presents a comprehensive survey of key\ndimensions that are crucial to consider when assessing LLM trustworthiness. The\nsurvey covers seven major categories of LLM trustworthiness: reliability,\nsafety, fairness, resistance to misuse, explainability and reasoning, adherence\nto social norms, and robustness. Each major category is further divided into\nseveral sub-categories, resulting in a total of 29 sub-categories.\nAdditionally, a subset of 8 sub-categories is selected for further\ninvestigation, where corresponding measurement studies are designed and\nconducted on several widely-used LLMs. The measurement results indicate that,\nin general, more aligned models tend to perform better in terms of overall\ntrustworthiness. However, the effectiveness of alignment varies across the\ndifferent trustworthiness categories considered. This highlights the importance\nof conducting more fine-grained analyses, testing, and making continuous\nimprovements on LLM alignment. By shedding light on these key dimensions of LLM\ntrustworthiness, this paper aims to provide valuable insights and guidance to\npractitioners in the field. Understanding and addressing these concerns will be\ncrucial in achieving reliable and ethically sound deployment of LLMs in various\napplications.\n","authors":["Yang Liu","Yuanshun Yao","Jean-Francois Ton","Xiaoying Zhang","Ruocheng Guo Hao Cheng","Yegor Klochkov","Muhammad Faaiz Taufiq","Hang Li"],"pdf_url":"https://arxiv.org/pdf/2308.05374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05371v1","updated":"2023-08-10T06:40:19Z","published":"2023-08-10T06:40:19Z","title":"Flexible Isosurface Extraction for Gradient-Based Mesh Optimization","summary":" This work considers gradient-based mesh optimization, where we iteratively\noptimize for a 3D surface mesh by representing it as the isosurface of a scalar\nfield, an increasingly common paradigm in applications including\nphotogrammetry, generative modeling, and inverse physics. 
Existing\nimplementations adapt classic isosurface extraction algorithms like Marching\nCubes or Dual Contouring; these techniques were designed to extract meshes from\nfixed, known fields, and in the optimization setting they lack the degrees of\nfreedom to represent high-quality feature-preserving meshes, or suffer from\nnumerical instabilities. We introduce FlexiCubes, an isosurface representation\nspecifically designed for optimizing an unknown mesh with respect to geometric,\nvisual, or even physical objectives. Our main insight is to introduce\nadditional carefully-chosen parameters into the representation, which allow\nlocal flexible adjustments to the extracted mesh geometry and connectivity.\nThese parameters are updated along with the underlying scalar field via\nautomatic differentiation when optimizing for a downstream task. We base our\nextraction scheme on Dual Marching Cubes for improved topological properties,\nand present extensions to optionally generate tetrahedral and\nhierarchically-adaptive meshes. Extensive experiments validate FlexiCubes on\nboth synthetic benchmarks and real-world applications, showing that it offers\nsignificant improvements in mesh quality and geometric fidelity.\n","authors":["Tianchang Shen","Jacob Munkberg","Jon Hasselgren","Kangxue Yin","Zian Wang","Wenzheng Chen","Zan Gojcic","Sanja Fidler","Nicholas Sharp","Jun Gao"],"pdf_url":"https://arxiv.org/pdf/2308.05371v1.pdf","comment":"SIGGRAPH 2023. Project page:\n https://research.nvidia.com/labs/toronto-ai/flexicubes/"},{"id":"http://arxiv.org/abs/2308.05364v1","updated":"2023-08-10T06:17:46Z","published":"2023-08-10T06:17:46Z","title":"Machine Learning aided Computer Architecture Design for CNN Inferencing\n Systems","summary":" Efficient and timely calculations of Machine Learning (ML) algorithms are\nessential for emerging technologies like autonomous driving, the Internet of\nThings (IoT), and edge computing. One of the primary ML algorithms used in such\nsystems is Convolutional Neural Networks (CNNs), which demand high\ncomputational resources. This requirement has led to the use of ML accelerators\nlike GPGPUs to meet design constraints. However, selecting the most suitable\naccelerator involves Design Space Exploration (DSE), a process that is usually\ntime-consuming and requires significant manual effort. Our work presents\napproaches to expedite the DSE process by identifying the most appropriate\nGPGPU for CNN inferencing systems. We have developed a quick and precise\ntechnique for forecasting the power and performance of CNNs during inference,\nwith a MAPE of 5.03% and 5.94%, respectively. Our approach empowers computer\narchitects to estimate power and performance in the early stages of\ndevelopment, reducing the necessity for numerous prototypes. This saves time\nand money while also improving the time-to-market period.\n","authors":["Christopher A. Metz"],"pdf_url":"https://arxiv.org/pdf/2308.05364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05362v1","updated":"2023-08-10T06:10:49Z","published":"2023-08-10T06:10:49Z","title":"FINER: Enhancing State-of-the-art Classifiers with Feature Attribution\n to Facilitate Security Analysis","summary":" Deep learning classifiers achieve state-of-the-art performance in various\nrisk detection applications. They explore rich semantic representations and are\nsupposed to automatically discover risk behaviors. 
However, due to the lack of\ntransparency, the behavioral semantics cannot be conveyed to downstream\nsecurity experts to reduce their heavy workload in security analysis. Although\nfeature attribution (FA) methods can be used to explain deep learning, the\nunderlying classifier is still blind to what behavior is suspicious, and the\ngenerated explanation cannot adapt to downstream tasks, incurring poor\nexplanation fidelity and intelligibility. In this paper, we propose FINER, the\nfirst framework for risk detection classifiers to generate high-fidelity and\nhigh-intelligibility explanations. The high-level idea is to gather explanation\nefforts from model developer, FA designer, and security experts. To improve\nfidelity, we fine-tune the classifier with an explanation-guided multi-task\nlearning strategy. To improve intelligibility, we engage task knowledge to\nadjust and ensemble FA methods. Extensive evaluations show that FINER improves\nexplanation quality for risk detection. Moreover, we demonstrate that FINER\noutperforms a state-of-the-art tool in facilitating malware analysis.\n","authors":["Yiling He","Jian Lou","Zhan Qin","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2308.05362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05353v1","updated":"2023-08-10T05:49:30Z","published":"2023-08-10T05:49:30Z","title":"Preemptive Detection of Fake Accounts on Social Networks via Multi-Class\n Preferential Attachment Classifiers","summary":" In this paper, we describe a new algorithm called Preferential Attachment\nk-class Classifier (PreAttacK) for detecting fake accounts in a social network.\nRecently, several algorithms have obtained high accuracy on this problem.\nHowever, they have done so by relying on information about fake accounts'\nfriendships or the content they share with others--the very things we seek to\nprevent.\n PreAttacK represents a significant departure from these approaches. We\nprovide some of the first detailed distributional analyses of how new fake (and\nreal) accounts first attempt to request friends after joining a major network\n(Facebook). We show that even before a new account has made friends or shared\ncontent, these initial friend request behaviors evoke a natural multi-class\nextension of the canonical Preferential Attachment model of social network\ngrowth.\n We use this model to derive a new algorithm, PreAttacK. We prove that in\nrelevant problem instances, PreAttacK near-optimally approximates the posterior\nprobability that a new account is fake under this multi-class Preferential\nAttachment model of new accounts' (not-yet-answered) friend requests. These are\nthe first provable guarantees for fake account detection that apply to new\nusers, and that do not require strong homophily assumptions.\n This principled approach also makes PreAttacK the only algorithm with\nprovable guarantees that obtains state-of-the-art performance on new users on\nthe global Facebook network, where it converges to AUC=0.9 after new users send\n+ receive a total of just 20 not-yet-answered friend requests. For comparison,\nstate-of-the-art benchmarks do not obtain this AUC even after observing\nadditional data on new users' first 100 friend requests. 
Thus, unlike\nmainstream algorithms, PreAttacK converges before the median new fake account\nhas made a single friendship (accepted friend request) with a human.\n","authors":["Adam Breuer","Nazanin Khosravani","Michael Tingley","Bradford Cottel"],"pdf_url":"https://arxiv.org/pdf/2308.05353v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05345v1","updated":"2023-08-10T05:24:41Z","published":"2023-08-10T05:24:41Z","title":"RTLLM: An Open-Source Benchmark for Design RTL Generation with Large\n Language Model","summary":" Inspired by the recent success of large language models (LLMs) like ChatGPT,\nresearchers start to explore the adoption of LLMs for agile hardware design,\nsuch as generating design RTL based on natural-language instructions. However,\nin existing works, their target designs are all relatively simple and in a\nsmall scale, and proposed by the authors themselves, making a fair comparison\namong different LLM solutions challenging. In addition, many prior works only\nfocus on the design correctness, without evaluating the design qualities of\ngenerated design RTL. In this work, we propose an open-source benchmark named\nRTLLM, for generating design RTL with natural language instructions. To\nsystematically evaluate the auto-generated design RTL, we summarized three\nprogressive goals, named syntax goal, functionality goal, and design quality\ngoal. This benchmark can automatically provide a quantitative evaluation of any\ngiven LLM-based solution. Furthermore, we propose an easy-to-use yet\nsurprisingly effective prompt engineering technique named self-planning, which\nproves to significantly boost the performance of GPT-3.5 in our proposed\nbenchmark.\n","authors":["Yao Lu","Shang Liu","Qijun Zhang","Zhiyao Xie"],"pdf_url":"https://arxiv.org/pdf/2308.05345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09807v2","updated":"2023-08-10T05:13:33Z","published":"2023-06-16T12:44:10Z","title":"FALL-E: A Foley Sound Synthesis Model and Strategies","summary":" This paper introduces FALL-E, a foley synthesis system and its\ntraining/inference strategies. The FALL-E model employs a cascaded approach\ncomprising low-resolution spectrogram generation, spectrogram super-resolution,\nand a vocoder. We trained every sound-related model from scratch using our\nextensive datasets, and utilized a pre-trained language model. We conditioned\nthe model with dataset-specific texts, enabling it to learn sound quality and\nrecording environment based on text input. Moreover, we leveraged external\nlanguage models to improve text descriptions of our datasets and performed\nprompt engineering for quality, coherence, and diversity. FALL-E was evaluated\nby an objective measure as well as listening tests in the DCASE 2023 challenge\nTask 7. The submission achieved the second place on average, while achieving\nthe best score for diversity, second place for audio quality, and third place\nfor class fitness.\n","authors":["Minsung Kang","Sangshin Oh","Hyeongi Moon","Kyungyun Lee","Ben Sangbae Chon"],"pdf_url":"https://arxiv.org/pdf/2306.09807v2.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2202.09518v3","updated":"2023-08-10T04:11:51Z","published":"2022-02-19T03:49:21Z","title":"Distributed Out-of-Memory NMF on CPU/GPU Architectures","summary":" We propose an efficient distributed out-of-memory implementation of the\nNon-negative Matrix Factorization (NMF) algorithm for heterogeneous\nhigh-performance-computing (HPC) systems. 
The proposed implementation is based\non prior work on NMFk, which can perform automatic model selection and extract\nlatent variables and patterns from data. In this work, we extend NMFk by adding\nsupport for dense and sparse matrix operation on multi-node, multi-GPU systems.\nThe resulting algorithm is optimized for out-of-memory (OOM) problems where the\nmemory required to factorize a given matrix is greater than the available GPU\nmemory. Memory complexity is reduced by batching/tiling strategies, and sparse\nand dense matrix operations are significantly accelerated with GPU cores (or\ntensor cores when available). Input/Output (I/O) latency associated with batch\ncopies between host and device is hidden using CUDA streams to overlap data\ntransfers and compute asynchronously, and latency associated with collective\ncommunications (both intra-node and inter-node) is reduced using optimized\nNVIDIA Collective Communication Library NCCL based communicators. Benchmark\nresults show significant improvement, from 32X to 76x speedup, with the new\nimplementation using GPUs over the CPU-based NMFk. Good weak scaling was\ndemonstrated on up to 4096 multi-GPU cluster nodes with approximately 25,000\nGPUs when decomposing a dense 340 Terabyte-size matrix and an 11 Exabyte-size\nsparse matrix of density 10e-6.\n","authors":["Ismael Boureima","Manish Bhattarai","Maksim Eren","Erik Skau","Philip Romero","Stephan Eidenbenz","Boian Alexandrov"],"pdf_url":"https://arxiv.org/pdf/2202.09518v3.pdf","comment":"Accepted at Journal of Supercomputing"},{"id":"http://arxiv.org/abs/2308.05326v1","updated":"2023-08-10T04:01:04Z","published":"2023-08-10T04:01:04Z","title":"OpenProteinSet: Training data for structural biology at scale","summary":" Multiple sequence alignments (MSAs) of proteins encode rich biological\ninformation and have been workhorses in bioinformatic methods for tasks like\nprotein design and protein structure prediction for decades. Recent\nbreakthroughs like AlphaFold2 that use transformers to attend directly over\nlarge quantities of raw MSAs have reaffirmed their importance. Generation of\nMSAs is highly computationally intensive, however, and no datasets comparable\nto those used to train AlphaFold2 have been made available to the research\ncommunity, hindering progress in machine learning for proteins. To remedy this\nproblem, we introduce OpenProteinSet, an open-source corpus of more than 16\nmillion MSAs, associated structural homologs from the Protein Data Bank, and\nAlphaFold2 protein structure predictions. We have previously demonstrated the\nutility of OpenProteinSet by successfully retraining AlphaFold2 on it. We\nexpect OpenProteinSet to be broadly useful as training and validation data for\n1) diverse tasks focused on protein structure, function, and design and 2)\nlarge-scale multimodal machine learning research.\n","authors":["Gustaf Ahdritz","Nazim Bouatta","Sachin Kadyan","Lukas Jarosch","Daniel Berenberg","Ian Fisk","Andrew M. Watkins","Stephen Ra","Richard Bonneau","Mohammed AlQuraishi"],"pdf_url":"https://arxiv.org/pdf/2308.05326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09797v5","updated":"2023-08-10T03:41:04Z","published":"2023-04-19T16:29:48Z","title":"Progressive-Hint Prompting Improves Reasoning in Large Language Models","summary":" The performance of Large Language Models (LLMs) in reasoning tasks depends\nheavily on prompt design, with Chain-of-Thought (CoT) and self-consistency\nbeing critical methods that enhance this ability. 
However, these methods do not\nfully exploit the answers generated by the LLM to guide subsequent responses.\nThis paper proposes a new prompting method, named Progressive-Hint Prompting\n(PHP), that enables automatic multiple interactions between users and LLMs by\nusing previously generated answers as hints to progressively guide toward the\ncorrect answers. PHP is orthogonal to CoT and self-consistency, making it easy\nto combine with state-of-the-art techniques to further improve performance. We\nconducted extensive and comprehensive experiments on seven benchmarks. The\nresults show that PHP significantly improves accuracy while remaining highly\nefficient. For instance, with text-davinci-003, we observed a 4.2% improvement\non GSM8K with greedy decoding compared to Complex CoT, and a 46.17% reduction\nin sample paths with self-consistency. With GPT-4 and PHP, we achieve\nstate-of-the-art performances on SVAMP (89.1% -> 91.9%), GSM8K (92% -> 95.5%),\nAQuA (76.4% -> 79.9%) and MATH (50.3% -> 53.9%).\n","authors":["Chuanyang Zheng","Zhengying Liu","Enze Xie","Zhenguo Li","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2304.09797v5.pdf","comment":"Tech Report"},{"id":"http://arxiv.org/abs/2308.04704v2","updated":"2023-08-10T03:08:03Z","published":"2023-08-09T04:51:28Z","title":"A Feature Set of Small Size for the PDF Malware Detection","summary":" Machine learning (ML)-based malware detection systems are becoming\nincreasingly important as malware threats increase and get more sophisticated.\nPDF files are often used as vectors for phishing attacks because they are\nwidely regarded as trustworthy data resources, and are accessible across\ndifferent platforms. Therefore, researchers have developed many different PDF\nmalware detection methods. Performance in detecting PDF malware is greatly\ninfluenced by feature selection. In this research, we propose a small features\nset that don't require too much domain knowledge of the PDF file. We evaluate\nproposed features with six different machine learning models. We report the\nbest accuracy of 99.75% when using Random Forest model. Our proposed feature\nset, which consists of just 12 features, is one of the most conciseness in the\nfield of PDF malware detection. Despite its modest size, we obtain comparable\nresults to state-of-the-art that employ a much larger set of features.\n","authors":["Ran Liu","Charles Nicholas"],"pdf_url":"https://arxiv.org/pdf/2308.04704v2.pdf","comment":"Accepted for publication at the ACM SIGKDD & Annual KDD Conference\n workshop on Knowledge-infused Machine Learning, 2023"},{"id":"http://arxiv.org/abs/2210.13662v2","updated":"2023-08-10T03:02:21Z","published":"2022-10-24T23:50:12Z","title":"Analyzing Privacy Leakage in Machine Learning via Multiple Hypothesis\n Testing: A Lesson From Fano","summary":" Differential privacy (DP) is by far the most widely accepted framework for\nmitigating privacy risks in machine learning. However, exactly how small the\nprivacy parameter $\\epsilon$ needs to be to protect against certain privacy\nrisks in practice is still not well-understood. In this work, we study data\nreconstruction attacks for discrete data and analyze it under the framework of\nmultiple hypothesis testing. 
We utilize different variants of the celebrated\nFano's inequality to derive upper bounds on the inferential power of a data\nreconstruction adversary when the model is trained differentially privately.\nImportantly, we show that if the underlying private data takes values from a\nset of size $M$, then the target privacy parameter $\\epsilon$ can be $O(\\log\nM)$ before the adversary gains significant inferential power. Our analysis\noffers theoretical evidence for the empirical effectiveness of DP against data\nreconstruction attacks even at relatively large values of $\\epsilon$.\n","authors":["Chuan Guo","Alexandre Sablayrolles","Maziar Sanjabi"],"pdf_url":"https://arxiv.org/pdf/2210.13662v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.03942v3","updated":"2023-08-10T02:55:51Z","published":"2022-11-08T01:36:15Z","title":"Privacy-Aware Compression for Federated Learning Through Numerical\n Mechanism Design","summary":" In private federated learning (FL), a server aggregates differentially\nprivate updates from a large number of clients in order to train a machine\nlearning model. The main challenge in this setting is balancing privacy with\nboth classification accuracy of the learnt model as well as the number of bits\ncommunicated between the clients and server. Prior work has achieved a good\ntrade-off by designing a privacy-aware compression mechanism, called the\nminimum variance unbiased (MVU) mechanism, that numerically solves an\noptimization problem to determine the parameters of the mechanism. This paper\nbuilds upon it by introducing a new interpolation procedure in the numerical\ndesign process that allows for a far more efficient privacy analysis. The\nresult is the new Interpolated MVU mechanism that is more scalable, has a\nbetter privacy-utility trade-off, and provides SOTA results on\ncommunication-efficient private FL on a variety of datasets.\n","authors":["Chuan Guo","Kamalika Chaudhuri","Pierre Stock","Mike Rabbat"],"pdf_url":"https://arxiv.org/pdf/2211.03942v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05309v1","updated":"2023-08-10T02:53:30Z","published":"2023-08-10T02:53:30Z","title":"Homophily-enhanced Structure Learning for Graph Clustering","summary":" Graph clustering is a fundamental task in graph analysis, and recent advances\nin utilizing graph neural networks (GNNs) have shown impressive results.\nDespite the success of existing GNN-based graph clustering methods, they often\noverlook the quality of graph structure, which is inherent in real-world graphs\ndue to their sparse and multifarious nature, leading to subpar performance.\nGraph structure learning allows refining the input graph by adding missing\nlinks and removing spurious connections. However, previous endeavors in graph\nstructure learning have predominantly centered around supervised settings, and\ncannot be directly applied to our specific clustering tasks due to the absence\nof ground-truth labels. To bridge the gap, we propose a novel method called\n\\textbf{ho}mophily-enhanced structure \\textbf{le}arning for graph clustering\n(HoLe). Our motivation stems from the observation that subtly enhancing the\ndegree of homophily within the graph structure can significantly improve GNNs\nand clustering outcomes. To realize this objective, we develop two\nclustering-oriented structure learning modules, i.e., hierarchical correlation\nestimation and cluster-aware sparsification. 
The former module enables a more\naccurate estimation of pairwise node relationships by leveraging guidance from\nlatent and clustering spaces, while the latter one generates a sparsified\nstructure based on the similarity matrix and clustering assignments.\nAdditionally, we devise a joint optimization approach alternating between\ntraining the homophily-enhanced structure learning and GNN-based clustering,\nthereby enforcing their reciprocal effects. Extensive experiments on seven\nbenchmark datasets of various types and scales, across a range of clustering\nmetrics, demonstrate the superiority of HoLe against state-of-the-art\nbaselines.\n","authors":["Ming Gu","Gaoming Yang","Sheng Zhou","Ning Ma","Jiawei Chen","Qiaoyu Tan","Meihan Liu","Jiajun Bu"],"pdf_url":"https://arxiv.org/pdf/2308.05309v1.pdf","comment":"11 pages with 7 figures"},{"id":"http://arxiv.org/abs/2308.05305v1","updated":"2023-08-10T02:48:57Z","published":"2023-08-10T02:48:57Z","title":"From CNN to Transformer: A Review of Medical Image Segmentation Models","summary":" Medical image segmentation is an important step in medical image analysis,\nespecially as a crucial prerequisite for efficient disease diagnosis and\ntreatment. The use of deep learning for image segmentation has become a\nprevalent trend. The widely adopted approach currently is U-Net and its\nvariants. Additionally, with the remarkable success of pre-trained models in\nnatural language processing tasks, transformer-based models like TransUNet have\nachieved desirable performance on multiple medical image segmentation datasets.\nIn this paper, we conduct a survey of the most representative four medical\nimage segmentation models in recent years. We theoretically analyze the\ncharacteristics of these models and quantitatively evaluate their performance\non two benchmark datasets (i.e., Tuberculosis Chest X-rays and ovarian tumors).\nFinally, we discuss the main challenges and future trends in medical image\nsegmentation. Our work can assist researchers in the related field to quickly\nestablish medical segmentation models tailored to specific regions.\n","authors":["Wenjian Yao","Jiajun Bai","Wei Liao","Yuheng Chen","Mengjuan Liu","Yao Xie"],"pdf_url":"https://arxiv.org/pdf/2308.05305v1.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2307.16361v2","updated":"2023-08-10T02:45:55Z","published":"2023-07-31T01:34:24Z","title":"Benchmarking and Analyzing Robust Point Cloud Recognition: Bag of Tricks\n for Defending Adversarial Examples","summary":" Deep Neural Networks (DNNs) for 3D point cloud recognition are vulnerable to\nadversarial examples, threatening their practical deployment. Despite the many\nresearch endeavors have been made to tackle this issue in recent years, the\ndiversity of adversarial examples on 3D point clouds makes them more\nchallenging to defend against than those on 2D images. For examples, attackers\ncan generate adversarial examples by adding, shifting, or removing points.\nConsequently, existing defense strategies are hard to counter unseen point\ncloud adversarial examples. In this paper, we first establish a comprehensive,\nand rigorous point cloud adversarial robustness benchmark to evaluate\nadversarial robustness, which can provide a detailed understanding of the\neffects of the defense and attack methods. 
We then collect existing defense\ntricks in point cloud adversarial defenses and then perform extensive and\nsystematic experiments to identify an effective combination of these tricks.\nFurthermore, we propose a hybrid training augmentation methods that consider\nvarious types of point cloud adversarial examples to adversarial training,\nsignificantly improving the adversarial robustness. By combining these tricks,\nwe construct a more robust defense framework achieving an average accuracy of\n83.45\\% against various attacks, demonstrating its capability to enabling\nrobust learners. Our codebase are open-sourced on:\n\\url{https://github.com/qiufan319/benchmark_pc_attack.git}.\n","authors":["Qiufan Ji","Lin Wang","Cong Shi","Shengshan Hu","Yingying Chen","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2307.16361v2.pdf","comment":"8 pages 6 figures"},{"id":"http://arxiv.org/abs/2308.05292v1","updated":"2023-08-10T02:14:23Z","published":"2023-08-10T02:14:23Z","title":"Byzantine-Robust Decentralized Stochastic Optimization with Stochastic\n Gradient Noise-Independent Learning Error","summary":" This paper studies Byzantine-robust stochastic optimization over a\ndecentralized network, where every agent periodically communicates with its\nneighbors to exchange local models, and then updates its own local model by\nstochastic gradient descent (SGD). The performance of such a method is affected\nby an unknown number of Byzantine agents, which conduct adversarially during\nthe optimization process. To the best of our knowledge, there is no existing\nwork that simultaneously achieves a linear convergence speed and a small\nlearning error. We observe that the learning error is largely dependent on the\nintrinsic stochastic gradient noise. Motivated by this observation, we\nintroduce two variance reduction methods, stochastic average gradient algorithm\n(SAGA) and loopless stochastic variance-reduced gradient (LSVRG), to\nByzantine-robust decentralized stochastic optimization for eliminating the\nnegative effect of the stochastic gradient noise. The two resulting methods,\nBRAVO-SAGA and BRAVO-LSVRG, enjoy both linear convergence speeds and stochastic\ngradient noise-independent learning errors. Such learning errors are optimal\nfor a class of methods based on total variation (TV)-norm regularization and\nstochastic subgradient update. We conduct extensive numerical experiments to\ndemonstrate their effectiveness under various Byzantine attacks.\n","authors":["Jie Peng","Weiyu Li","Qing Ling"],"pdf_url":"https://arxiv.org/pdf/2308.05292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05281v1","updated":"2023-08-10T01:51:33Z","published":"2023-08-10T01:51:33Z","title":"Investigating disaster response through social media data and the\n Susceptible-Infected-Recovered (SIR) model: A case study of 2020 Western U.S.\n wildfire season","summary":" Effective disaster response is critical for affected communities. Responders\nand decision-makers would benefit from reliable, timely measures of the issues\nimpacting their communities during a disaster, and social media offers a\npotentially rich data source. Social media can reflect public concerns and\ndemands during a disaster, offering valuable insights for decision-makers to\nunderstand evolving situations and optimize resource allocation. We used\nBidirectional Encoder Representations from Transformers (BERT) topic modeling\nto cluster topics from Twitter data. 
Then, we conducted a temporal-spatial\nanalysis to examine the distribution of these topics across different regions\nduring the 2020 western U.S. wildfire season. Our results show that Twitter\nusers mainly focused on three topics:\"health impact,\" \"damage,\" and\n\"evacuation.\" We used the Susceptible-Infected-Recovered (SIR) theory to\nexplore the magnitude and velocity of topic diffusion on Twitter. The results\ndisplayed a clear relationship between topic trends and wildfire propagation\npatterns. The estimated parameters obtained from the SIR model in selected\ncities revealed that residents exhibited a high level of several concerns\nduring the wildfire. Our study details how the SIR model and topic modeling\nusing social media data can provide decision-makers with a quantitative\napproach to measure disaster response and support their decision-making\nprocesses.\n","authors":["Zihui Ma","Lingyao Li","Libby Hemphill","Gregory B. Baecher"],"pdf_url":"https://arxiv.org/pdf/2308.05281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14527v2","updated":"2023-08-10T01:46:11Z","published":"2023-07-26T22:09:29Z","title":"Open Problems in Computer Vision for Wilderness SAR and The Search for\n Patricia Wu-Murad","summary":" This paper details the challenges in applying two computer vision systems, an\nEfficientDET supervised learning model and the unsupervised RX spectral\nclassifier, to 98.9 GB of drone imagery from the Wu-Murad wilderness search and\nrescue (WSAR) effort in Japan and identifies 3 directions for future research.\nThere have been at least 19 proposed approaches and 3 datasets aimed at\nlocating missing persons in drone imagery, but only 3 approaches (2\nunsupervised and 1 of an unknown structure) are referenced in the literature as\nhaving been used in an actual WSAR operation. Of these proposed approaches, the\nEfficientDET architecture and the unsupervised spectral RX classifier were\nselected as the most appropriate for this setting. The EfficientDET model was\napplied to the HERIDAL dataset and despite achieving performance that is\nstatistically equivalent to the state-of-the-art, the model fails to translate\nto the real world in terms of false positives (e.g., identifying tree limbs and\nrocks as people), and false negatives (e.g., failing to identify members of the\nsearch team). The poor results in practice for algorithms that showed good\nresults on datasets suggest 3 areas of future research: more realistic datasets\nfor wilderness SAR, computer vision models that are capable of seamlessly\nhandling the variety of imagery that can be collected during actual WSAR\noperations, and better alignment on performance measures.\n","authors":["Thomas Manzini","Robin Murphy"],"pdf_url":"https://arxiv.org/pdf/2307.14527v2.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.05275v1","updated":"2023-08-10T01:25:28Z","published":"2023-08-10T01:25:28Z","title":"Cross-heterogeneity Graph Few-shot Learning","summary":" In recent years, heterogeneous graph few-shot learning has been proposed to\naddress the label sparsity issue in heterogeneous graphs (HGs), which contain\nvarious types of nodes and edges. The existing methods have achieved good\nperformance by transferring generalized knowledge extracted from rich-labeled\nclasses in source HG(s) to few-labeled classes in a target HG. 
However, these\nmethods only consider the single-heterogeneity scenario where the source and\ntarget HGs share a fixed set of node/edge types, ignoring the more general\nscenario of cross-heterogeneity, where each HG can have a different and\nnon-fixed set of node/edge types. To this end, we focus on the unexplored\ncross-heterogeneity scenario and propose a novel model for Cross-heterogeneity\nGraph Few-shot Learning, namely CGFL. In CGFL, we first extract meta-patterns\nto capture heterogeneous information and propose a multi-view heterogeneous\ngraph neural network (MHGN) to learn meta-patterns across HGs. Then, we propose\na score module to measure the informativeness of labeled samples and determine\nthe transferability of each source HG. Finally, by integrating MHGN and the\nscore module into a meta-learning mechanism, CGFL can effectively transfer\ngeneralized knowledge to predict new classes with few-labeled data. Extensive\nexperiments on four real-world datasets have demonstrated the superior\nperformance of CGFL over the state-of-the-art methods.\n","authors":["Pengfei Ding","Yan Wang","Guanfeng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.05275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05882v1","updated":"2023-08-10T23:54:12Z","published":"2023-08-10T23:54:12Z","title":"GPLaSDI: Gaussian Process-based Interpretable Latent Space Dynamics\n Identification through Deep Autoencoder","summary":" Numerically solving partial differential equations (PDEs) can be challenging\nand computationally expensive. This has led to the development of reduced-order\nmodels (ROMs) that are accurate but faster than full order models (FOMs).\nRecently, machine learning advances have enabled the creation of non-linear\nprojection methods, such as Latent Space Dynamics Identification (LaSDI). LaSDI\nmaps full-order PDE solutions to a latent space using autoencoders and learns\nthe system of ODEs governing the latent space dynamics. By interpolating and\nsolving the ODE system in the reduced latent space, fast and accurate ROM\npredictions can be made by feeding the predicted latent space dynamics into the\ndecoder. In this paper, we introduce GPLaSDI, a novel LaSDI-based framework\nthat relies on Gaussian process (GP) for latent space ODE interpolations. Using\nGPs offers two significant advantages. First, it enables the quantification of\nuncertainty over the ROM predictions. Second, leveraging this prediction\nuncertainty allows for efficient adaptive training through a greedy selection\nof additional training data points. This approach does not require prior\nknowledge of the underlying PDEs. Consequently, GPLaSDI is inherently\nnon-intrusive and can be applied to problems without a known PDE or its\nresidual. We demonstrate the effectiveness of our approach on the Burgers\nequation, Vlasov equation for plasma physics, and a rising thermal bubble\nproblem. Our proposed method achieves between 200 and 100,000 times speed-up,\nwith up to 7% relative error.\n","authors":["Christophe Bonneville","Youngsoo Choi","Debojyoti Ghosh","Jonathan L. Belof"],"pdf_url":"https://arxiv.org/pdf/2308.05882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05881v1","updated":"2023-08-10T23:53:07Z","published":"2023-08-10T23:53:07Z","title":"Aphid Cluster Recognition and Detection in the Wild Using Deep Learning\n Models","summary":" Aphid infestation poses a significant threat to crop production, rural\ncommunities, and global food security. While chemical pest control is crucial\nfor maximizing yields, applying chemicals across entire fields is both\nenvironmentally unsustainable and costly. Hence, precise localization and\nmanagement of aphids are essential for targeted pesticide application. The\npaper primarily focuses on using deep learning models for detecting aphid\nclusters. We propose a novel approach for estimating infection levels by\ndetecting aphid clusters. To facilitate this research, we have captured a\nlarge-scale dataset from sorghum fields, manually selected 5,447 images\ncontaining aphids, and annotated each individual aphid cluster within these\nimages. To facilitate the use of machine learning models, we further process\nthe images by cropping them into patches, resulting in a labeled dataset\ncomprising 151,380 image patches. 
Then, we implemented and compared the\nperformance of four state-of-the-art object detection models (VFNet, GFLV2,\nPAA, and ATSS) on the aphid dataset. Extensive experimental results show that\nall models yield stable similar performance in terms of average precision and\nrecall. We then propose to merge close neighboring clusters and remove tiny\nclusters caused by cropping, and the performance is further boosted by around\n17%. The study demonstrates the feasibility of automatically detecting and\nmanaging insects using machine learning models. The labeled dataset will be\nmade openly available to the research community.\n","authors":["Tianxiao Zhang","Kaidong Li","Xiangyu Chen","Cuncong Zhong","Bo Luo","Ivan Grijalva","Brian McCornack","Daniel Flippo","Ajay Sharda","Guanghui Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05878v1","updated":"2023-08-10T23:24:51Z","published":"2023-08-10T23:24:51Z","title":"Composable Core-sets for Diversity Approximation on Multi-Dataset\n Streams","summary":" Core-sets refer to subsets of data that maximize some function that is\ncommonly a diversity or group requirement. These subsets are used in place of\nthe original data to accomplish a given task with comparable or even enhanced\nperformance if biases are removed. Composable core-sets are core-sets with the\nproperty that subsets of the core set can be unioned together to obtain an\napproximation for the original data; lending themselves to be used for streamed\nor distributed data. Recent work has focused on the use of core-sets for\ntraining machine learning models. Preceding solutions such as CRAIG have been\nproven to approximate gradient descent while providing a reduced training time.\nIn this paper, we introduce a core-set construction algorithm for constructing\ncomposable core-sets to summarize streamed data for use in active learning\nenvironments. If combined with techniques such as CRAIG and heuristics to\nenhance construction speed, composable core-sets could be used for real time\ntraining of models when the amount of sensor data is large. We provide\nempirical analysis by considering extrapolated data for the runtime of such a\nbrute force algorithm. This algorithm is then analyzed for efficiency through\naveraged empirical regression and key results and improvements are suggested\nfor further research on the topic.\n","authors":["Stephanie Wang","Michael Flynn","Fangyu Luo"],"pdf_url":"https://arxiv.org/pdf/2308.05878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05877v1","updated":"2023-08-10T23:22:41Z","published":"2023-08-10T23:22:41Z","title":"Revisiting N-CNN for Clinical Practice","summary":" This paper revisits the Neonatal Convolutional Neural Network (N-CNN) by\noptimizing its hyperparameters and evaluating how they affect its\nclassification metrics, explainability and reliability, discussing their\npotential impact in clinical practice. We have chosen hyperparameters that do\nnot modify the original N-CNN architecture, but mainly modify its learning rate\nand training regularization. The optimization was done by evaluating the\nimprovement in F1 Score for each hyperparameter individually, and the best\nhyperparameters were chosen to create a Tuned N-CNN. We also applied soft\nlabels derived from the Neonatal Facial Coding System, proposing a novel\napproach for training facial expression classification models for neonatal pain\nassessment. 
Interestingly, while the Tuned N-CNN results point towards\nimprovements in classification metrics and explainability, these improvements\ndid not directly translate to calibration performance. We believe that such\ninsights might have the potential to contribute to the development of more\nreliable pain evaluation tools for newborns, aiding healthcare professionals in\ndelivering appropriate interventions and improving patient outcomes.\n","authors":["Leonardo Antunes Ferreira","Lucas Pereira Carlini","Gabriel de Almeida Sá Coutrin","Tatiany Marcondes Heideirich","Marina Carvalho de Moraes Barros","Ruth Guinsburg","Carlos Eduardo Thomaz"],"pdf_url":"https://arxiv.org/pdf/2308.05877v1.pdf","comment":"AICAI 2023 in conjuction with MICCAI"},{"id":"http://arxiv.org/abs/2308.05870v1","updated":"2023-08-10T22:52:13Z","published":"2023-08-10T22:52:13Z","title":"UFed-GAN: A Secure Federated Learning Framework with Constrained\n Computation and Unlabeled Data","summary":" To satisfy the broad applications and insatiable hunger for deploying low\nlatency multimedia data classification and data privacy in a cloud-based\nsetting, federated learning (FL) has emerged as an important learning paradigm.\nFor the practical cases involving limited computational power and only\nunlabeled data in many wireless communications applications, this work\ninvestigates FL paradigm in a resource-constrained and label-missing\nenvironment. Specifically, we propose a novel framework of UFed-GAN:\nUnsupervised Federated Generative Adversarial Network, which can capture\nuser-side data distribution without local classification training. We also\nanalyze the convergence and privacy of the proposed UFed-GAN. Our experimental\nresults demonstrate the strong potential of UFed-GAN in addressing limited\ncomputational resources and unlabeled data while preserving privacy.\n","authors":["Achintha Wijesinghe","Songyang Zhang","Siyu Qi","Zhi Ding"],"pdf_url":"https://arxiv.org/pdf/2308.05870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05866v1","updated":"2023-08-10T22:30:24Z","published":"2023-08-10T22:30:24Z","title":"Using Twitter Data to Determine Hurricane Category: An Experiment","summary":" Social media posts contain an abundant amount of information about public\nopinion on major events, especially natural disasters such as hurricanes. Posts\nrelated to an event, are usually published by the users who live near the place\nof the event at the time of the event. Special correlation between the social\nmedia data and the events can be obtained using data mining approaches. This\npaper presents research work to find the mappings between social media data and\nthe severity level of a disaster. Specifically, we have investigated the\nTwitter data posted during hurricanes Harvey and Irma, and attempted to find\nthe correlation between the Twitter data of a specific area and the hurricane\nlevel in that area. Our experimental results indicate a positive correlation\nbetween them. We also present a method to predict the hurricane category for a\nspecific area using relevant Twitter data.\n","authors":["Songhui Yue","Jyothsna Kondari","Aibek Musaev","Randy K. 
Smith","Songqing Yue"],"pdf_url":"https://arxiv.org/pdf/2308.05866v1.pdf","comment":"9 Pages, 6 Figures, in Proceedings of the 15th ISCRAM Conference\n Rochester, NY, USA May 2018"},{"id":"http://arxiv.org/abs/2307.16104v3","updated":"2023-08-10T22:25:20Z","published":"2023-07-30T01:49:21Z","title":"AI Increases Global Access to Reliable Flood Forecasts","summary":" Floods are one of the most common and impactful natural disasters, with a\ndisproportionate impact in developing countries that often lack dense\nstreamflow monitoring networks. Accurate and timely warnings are critical for\nmitigating flood risks, but accurate hydrological simulation models typically\nmust be calibrated to long data records in each watershed where they are\napplied. We developed an Artificial Intelligence (AI) model to predict extreme\nhydrological events at timescales up to 7 days in advance. This model\nsignificantly outperforms current state of the art global hydrology models (the\nCopernicus Emergency Management Service Global Flood Awareness System) across\nall continents, lead times, and return periods. AI is especially effective at\nforecasting in ungauged basins, which is important because only a few percent\nof the world's watersheds have stream gauges, with a disproportionate number of\nungauged basins in developing countries that are especially vulnerable to the\nhuman impacts of flooding. We produce forecasts of extreme events in South\nAmerica and Africa that achieve reliability approaching the current state of\nthe art in Europe and North America, and we achieve reliability at between 4\nand 6-day lead times that are similar to current state of the art nowcasts\n(0-day lead time). Additionally, we achieve accuracies over 10-year return\nperiod events that are similar to current accuracies over 2-year return period\nevents, meaning that AI can provide warnings earlier and over larger and more\nimpactful events. The model that we develop in this paper has been incorporated\ninto an operational early warning system that produces publicly available (free\nand open) forecasts in real time in over 80 countries. This work using AI and\nopen data highlights a need for increasing the availability of hydrological\ndata to continue to improve global access to reliable flood warnings.\n","authors":["Grey Nearing","Deborah Cohen","Vusumuzi Dube","Martin Gauch","Oren Gilon","Shaun Harrigan","Avinatan Hassidim","Frederik Kratzert","Asher Metzger","Sella Nevo","Florian Pappenberger","Christel Prudhomme","Guy Shalev","Shlomo Shenzis","Tadele Tekalign","Dana Weitzner","Yoss Matias"],"pdf_url":"https://arxiv.org/pdf/2307.16104v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.03894v4","updated":"2023-08-10T22:00:30Z","published":"2021-10-08T05:07:35Z","title":"Neural Model Reprogramming with Similarity Based Mapping for\n Low-Resource Spoken Command Classification","summary":" In this study, we propose a novel adversarial reprogramming (AR) approach for\nlow-resource spoken command recognition (SCR), and build an AR-SCR system. The\nAR procedure aims to modify the acoustic signals (from the target domain) to\nrepurpose a pretrained SCR model (from the source domain). To solve the label\nmismatches between source and target domains, and further improve the stability\nof AR, we propose a novel similarity-based label mapping technique to align\nclasses. In addition, the transfer learning (TL) technique is combined with the\noriginal AR process to improve the model adaptation capability. 
We evaluate the\nproposed AR-SCR system on three low-resource SCR datasets, including Arabic,\nLithuanian, and dysarthric Mandarin speech. Experimental results show that with\na pretrained AM trained on a large-scale English dataset, the proposed AR-SCR\nsystem outperforms the current state-of-the-art results on Arabic and\nLithuanian speech commands datasets, with only a limited amount of training\ndata.\n","authors":["Hao Yen","Pin-Jui Ku","Chao-Han Huck Yang","Hu Hu","Sabato Marco Siniscalchi","Pin-Yu Chen","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2110.03894v4.pdf","comment":"Accepted to Interspeech 2023. Code is available at:\n https://github.com/dodohow1011/SpeechAdvReprogram"},{"id":"http://arxiv.org/abs/2308.05864v1","updated":"2023-08-10T21:59:23Z","published":"2023-08-10T21:59:23Z","title":"The Multi-modality Cell Segmentation Challenge: Towards Universal\n Solutions","summary":" Cell segmentation is a critical step for quantitative single-cell analysis in\nmicroscopy images. Existing cell segmentation methods are often tailored to\nspecific modalities or require manual interventions to specify hyperparameters\nin different experimental settings. Here, we present a multi-modality cell\nsegmentation benchmark, comprising over 1500 labeled images derived from more\nthan 50 diverse biological experiments. The top participants developed a\nTransformer-based deep-learning algorithm that not only exceeds existing\nmethods, but can also be applied to diverse microscopy images across imaging\nplatforms and tissue types without manual parameter adjustments. This benchmark\nand the improved algorithm offer promising avenues for more accurate and\nversatile cell analysis in microscopy imaging.\n","authors":["Jun Ma","Ronald Xie","Shamini Ayyadhury","Cheng Ge","Anubha Gupta","Ritu Gupta","Song Gu","Yao Zhang","Gihun Lee","Joonkee Kim","Wei Lou","Haofeng Li","Eric Upschulte","Timo Dickscheid","José Guilherme de Almeida","Yixin Wang","Lin Han","Xin Yang","Marco Labagnara","Sahand Jamal Rahi","Carly Kempster","Alice Pollitt","Leon Espinosa","Tâm Mignot","Jan Moritz Middeke","Jan-Niklas Eckardt","Wangkai Li","Zhaoyang Li","Xiaochen Cai","Bizhe Bai","Noah F. Greenwald","David Van Valen","Erin Weisbart","Beth A. Cimini","Zhuoshi Li","Chao Zuo","Oscar Brück","Gary D. Bader","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05864v1.pdf","comment":"NeurIPS22 Cell Segmentation Challenge:\n https://neurips22-cellseg.grand-challenge.org/"},{"id":"http://arxiv.org/abs/2003.03229v5","updated":"2023-08-10T21:19:32Z","published":"2020-02-02T21:09:39Z","title":"Non-linear Neurons with Human-like Apical Dendrite Activations","summary":" In order to classify linearly non-separable data, neurons are typically\norganized into multi-layer neural networks that are equipped with at least one\nhidden layer. Inspired by some recent discoveries in neuroscience, we propose a\nnew model of artificial neuron along with a novel activation function enabling\nthe learning of nonlinear decision boundaries using a single neuron. We show\nthat a standard neuron followed by our novel apical dendrite activation (ADA)\ncan learn the XOR logical function with 100% accuracy. Furthermore, we conduct\nexperiments on six benchmark data sets from computer vision, signal processing\nand natural language processing, i.e. 
MOROCO, UTKFace, CREMA-D, Fashion-MNIST,\nTiny ImageNet and ImageNet, showing that the ADA and the leaky ADA functions\nprovide superior results to Rectified Linear Units (ReLU), leaky ReLU, RBF and\nSwish, for various neural network architectures, e.g. one-hidden-layer or\ntwo-hidden-layer multi-layer perceptrons (MLPs) and convolutional neural\nnetworks (CNNs) such as LeNet, VGG, ResNet and Character-level CNN. We obtain\nfurther performance improvements when we change the standard model of the\nneuron with our pyramidal neuron with apical dendrite activations (PyNADA). Our\ncode is available at: https://github.com/raduionescu/pynada.\n","authors":["Mariana-Iuliana Georgescu","Radu Tudor Ionescu","Nicolae-Catalin Ristea","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2003.03229v5.pdf","comment":"Accepted for publication in Applied Intelligence"},{"id":"http://arxiv.org/abs/2308.05857v1","updated":"2023-08-10T21:06:18Z","published":"2023-08-10T21:06:18Z","title":"Knowledge Propagation over Conditional Independence Graphs","summary":" Conditional Independence (CI) graph is a special type of a Probabilistic\nGraphical Model (PGM) where the feature connections are modeled using an\nundirected graph and the edge weights show the partial correlation strength\nbetween the features. Since the CI graphs capture direct dependence between\nfeatures, they have been garnering increasing interest within the research\ncommunity for gaining insights into the systems from various domains, in\nparticular discovering the domain topology. In this work, we propose algorithms\nfor performing knowledge propagation over the CI graphs. Our experiments\ndemonstrate that our techniques improve upon the state-of-the-art on the\npublicly available Cora and PubMed datasets.\n","authors":["Urszula Chajewska","Harsh Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2308.05857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08427v2","updated":"2023-08-10T20:37:20Z","published":"2023-01-20T05:39:26Z","title":"Which Features are Learned by CodeBert: An Empirical Study of the\n BERT-based Source Code Representation Learning","summary":" The Bidirectional Encoder Representations from Transformers (BERT) were\nproposed in the natural language process (NLP) and shows promising results.\nRecently researchers applied the BERT to source-code representation learning\nand reported some good news on several downstream tasks. However, in this\npaper, we illustrated that current methods cannot effectively understand the\nlogic of source codes. The representation of source code heavily relies on the\nprogrammer-defined variable and function names. We design and implement a set\nof experiments to demonstrate our conjecture and provide some insights for\nfuture works.\n","authors":["Lan Zhang","Chen Cao","Zhilong Wang","Peng Liu"],"pdf_url":"https://arxiv.org/pdf/2301.08427v2.pdf","comment":"1 table, 2 figures"},{"id":"http://arxiv.org/abs/2308.05843v1","updated":"2023-08-10T19:51:00Z","published":"2023-08-10T19:51:00Z","title":"GaborPINN: Efficient physics informed neural networks using\n multiplicative filtered networks","summary":" The computation of the seismic wavefield by solving the Helmholtz equation is\ncrucial to many practical applications, e.g., full waveform inversion.\nPhysics-informed neural networks (PINNs) provide functional wavefield solutions\nrepresented by neural networks (NNs), but their convergence is slow. 
To address\nthis problem, we propose a modified PINN using multiplicative filtered\nnetworks, which embeds some of the known characteristics of the wavefield in\ntraining, e.g., frequency, to achieve much faster convergence. Specifically, we\nuse the Gabor basis function due to its proven ability to represent wavefields\naccurately and refer to the implementation as GaborPINN. Meanwhile, we\nincorporate prior information on the frequency of the wavefield into the design\nof the method to mitigate the influence of the discontinuity of the represented\nwavefield by GaborPINN. The proposed method achieves up to a two-magnitude\nincrease in the speed of convergence as compared with conventional PINNs.\n","authors":["Xinquan Huang","Tariq Alkhalifah"],"pdf_url":"https://arxiv.org/pdf/2308.05843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05832v1","updated":"2023-08-10T19:29:44Z","published":"2023-08-10T19:29:44Z","title":"FLShield: A Validation Based Federated Learning Framework to Defend\n Against Poisoning Attacks","summary":" Federated learning (FL) is revolutionizing how we learn from data. With its\ngrowing popularity, it is now being used in many safety-critical domains such\nas autonomous vehicles and healthcare. Since thousands of participants can\ncontribute in this collaborative setting, it is, however, challenging to ensure\nsecurity and reliability of such systems. This highlights the need to design FL\nsystems that are secure and robust against malicious participants' actions\nwhile also ensuring high utility, privacy of local data, and efficiency. In\nthis paper, we propose a novel FL framework dubbed as FLShield that utilizes\nbenign data from FL participants to validate the local models before taking\nthem into account for generating the global model. This is in stark contrast\nwith existing defenses relying on server's access to clean datasets -- an\nassumption often impractical in real-life scenarios and conflicting with the\nfundamentals of FL. We conduct extensive experiments to evaluate our FLShield\nframework in different settings and demonstrate its effectiveness in thwarting\nvarious types of poisoning and backdoor attacks including a defense-aware one.\nFLShield also preserves privacy of local data against gradient inversion\nattacks.\n","authors":["Ehsanul Kabir","Zeyu Song","Md Rafi Ur Rashid","Shagufta Mehnaz"],"pdf_url":"https://arxiv.org/pdf/2308.05832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02266v2","updated":"2023-08-10T18:55:20Z","published":"2023-03-03T23:46:25Z","title":"Collaborative Learning with a Drone Orchestrator","summary":" In this paper, the problem of drone-assisted collaborative learning is\nconsidered. In this scenario, swarm of intelligent wireless devices train a\nshared neural network (NN) model with the help of a drone. Using its sensors,\neach device records samples from its environment to gather a local dataset for\ntraining. The training data is severely heterogeneous as various devices have\ndifferent amount of data and sensor noise level. The intelligent devices\niteratively train the NN on their local datasets and exchange the model\nparameters with the drone for aggregation. For this system, the convergence\nrate of collaborative learning is derived while considering data heterogeneity,\nsensor noise levels, and communication errors, then, the drone trajectory that\nmaximizes the final accuracy of the trained NN is obtained. 
The proposed\ntrajectory optimization approach is aware of both the devices data\ncharacteristics (i.e., local dataset size and noise level) and their wireless\nchannel conditions, and significantly improves the convergence rate and final\naccuracy in comparison with baselines that only consider data characteristics\nor channel conditions. Compared to state-of-the-art baselines, the proposed\napproach achieves an average 3.85% and 3.54% improvement in the final accuracy\nof the trained NN on benchmark datasets for image recognition and semantic\nsegmentation tasks, respectively. Moreover, the proposed framework achieves a\nsignificant speedup in training, leading to an average 24% and 87% saving in\nthe drone hovering time, communication overhead, and battery usage,\nrespectively for these tasks.\n","authors":["Mahdi Boloursaz Mashhadi","Mahnoosh Mahdavimoghadam","Rahim Tafazolli","Walid Saad"],"pdf_url":"https://arxiv.org/pdf/2303.02266v2.pdf","comment":"Accepted at the IEEE"},{"id":"http://arxiv.org/abs/2308.02080v2","updated":"2023-08-10T18:32:56Z","published":"2023-08-03T23:39:03Z","title":"Causality Guided Disentanglement for Cross-Platform Hate Speech\n Detection","summary":" Social media platforms, despite their value in promoting open discourse, are\noften exploited to spread harmful content. Current deep learning and natural\nlanguage processing models used for detecting this harmful content overly rely\non domain-specific terms affecting their capabilities to adapt to generalizable\nhate speech detection. This is because they tend to focus too narrowly on\nparticular linguistic signals or the use of certain categories of words.\nAnother significant challenge arises when platforms lack high-quality annotated\ndata for training, leading to a need for cross-platform models that can adapt\nto different distribution shifts. Our research introduces a cross-platform hate\nspeech detection model capable of being trained on one platform's data and\ngeneralizing to multiple unseen platforms. To achieve good generalizability\nacross platforms, one way is to disentangle the input representations into\ninvariant and platform-dependent features. We also argue that learning causal\nrelationships, which remain constant across diverse environments, can\nsignificantly aid in understanding invariant representations in hate speech. By\ndisentangling input into platform-dependent features (useful for predicting\nhate targets) and platform-independent features (used to predict the presence\nof hate), we learn invariant representations resistant to distribution shifts.\nThese features are then used to predict hate speech across unseen platforms.\nOur extensive experiments across four platforms highlight our model's enhanced\nefficacy compared to existing state-of-the-art methods in detecting generalized\nhate speech.\n","authors":["Paras Sheth","Tharindu Kumarage","Raha Moraffah","Aman Chadha","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.17316v2","updated":"2023-08-10T18:32:07Z","published":"2022-10-26T21:03:17Z","title":"There is more than one kind of robustness: Fooling Whisper with\n adversarial examples","summary":" Whisper is a recent Automatic Speech Recognition (ASR) model displaying\nimpressive robustness to both out-of-distribution inputs and random noise. In\nthis work, we show that this robustness does not carry over to adversarial\nnoise. 
We show that we can degrade Whisper performance dramatically, or even\ntranscribe a target sentence of our choice, by generating very small input\nperturbations with Signal Noise Ratio of 35-45dB. We also show that by fooling\nthe Whisper language detector we can very easily degrade the performance of\nmultilingual models. These vulnerabilities of a widely popular open-source\nmodel have practical security implications and emphasize the need for\nadversarially robust ASR.\n","authors":["Raphael Olivier","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2210.17316v2.pdf","comment":"Accepted at InterSpeech 2023"},{"id":"http://arxiv.org/abs/2207.04129v3","updated":"2023-08-10T18:28:08Z","published":"2022-07-08T21:25:17Z","title":"How many perturbations break this model? Evaluating robustness beyond\n adversarial accuracy","summary":" Robustness to adversarial attacks is typically evaluated with adversarial\naccuracy. While essential, this metric does not capture all aspects of\nrobustness and in particular leaves out the question of how many perturbations\ncan be found for each point. In this work, we introduce an alternative\napproach, adversarial sparsity, which quantifies how difficult it is to find a\nsuccessful perturbation given both an input point and a constraint on the\ndirection of the perturbation. We show that sparsity provides valuable insight\ninto neural networks in multiple ways: for instance, it illustrates important\ndifferences between current state-of-the-art robust models them that accuracy\nanalysis does not, and suggests approaches for improving their robustness. When\napplying broken defenses effective against weak attacks but not strong ones,\nsparsity can discriminate between the totally ineffective and the partially\neffective defenses. Finally, with sparsity we can measure increases in\nrobustness that do not affect accuracy: we show for example that data\naugmentation can by itself increase adversarial robustness, without using\nadversarial training.\n","authors":["Raphael Olivier","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2207.04129v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02916v2","updated":"2023-08-10T18:10:13Z","published":"2023-08-05T16:21:12Z","title":"Adversarial Erasing with Pruned Elements: Towards Better Graph Lottery\n Ticket","summary":" Graph Lottery Ticket (GLT), a combination of core subgraph and sparse\nsubnetwork, has been proposed to mitigate the computational cost of deep Graph\nNeural Networks (GNNs) on large input graphs while preserving original\nperformance. However, the winning GLTs in exisiting studies are obtained by\napplying iterative magnitude-based pruning (IMP) without re-evaluating and\nre-considering the pruned information, which disregards the dynamic changes in\nthe significance of edges/weights during graph/model structure pruning, and\nthus limits the appeal of the winning tickets. In this paper, we formulate a\nconjecture, i.e., existing overlooked valuable information in the pruned graph\nconnections and model parameters which can be re-grouped into GLT to enhance\nthe final performance. 
Specifically, we propose an adversarial complementary\nerasing (ACE) framework to explore the valuable information from the pruned\ncomponents, thereby developing a more powerful GLT, referred to as the ACE-GLT.\nThe main idea is to mine valuable information from pruned edges/weights after\neach round of IMP, and employ the ACE technique to refine the GLT processing.\nFinally, experimental results demonstrate that our ACE-GLT outperforms existing\nmethods for searching GLT in diverse tasks. Our code will be made publicly\navailable.\n","authors":["Yuwen Wang","Shunyu Liu","Kaixuan Chen","Tongtian Zhu","Ji Qiao","Mengjie Shi","Yuanyu Wan","Mingli Song"],"pdf_url":"https://arxiv.org/pdf/2308.02916v2.pdf","comment":"17 pages, 10 figures, Accept by ECAI2023"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.05734v1","updated":"2023-08-10T17:55:13Z","published":"2023-08-10T17:55:13Z","title":"AudioLDM 2: Learning Holistic Audio Generation with Self-supervised\n Pretraining","summary":" Although audio generation shares commonalities across different types of\naudio, such as speech, music, and sound effects, designing models for each type\nrequires careful consideration of specific objectives and biases that can\nsignificantly differ from those of other types. To bring us closer to a unified\nperspective of audio generation, this paper proposes a framework that utilizes\nthe same learning method for speech, music, and sound effect generation. Our\nframework introduces a general representation of audio, called language of\naudio (LOA). Any audio can be translated into LOA based on AudioMAE, a\nself-supervised pre-trained representation learning model. In the generation\nprocess, we translate any modalities into LOA by using a GPT-2 model, and we\nperform self-supervised audio generation learning with a latent diffusion model\nconditioned on LOA. The proposed framework naturally brings advantages such as\nin-context learning abilities and reusable self-supervised pretrained AudioMAE\nand latent diffusion models. Experiments on the major benchmarks of\ntext-to-audio, text-to-music, and text-to-speech demonstrate new\nstate-of-the-art or competitive performance to previous approaches. Our demo\nand code are available at https://audioldm.github.io/audioldm2.\n","authors":["Haohe Liu","Qiao Tian","Yi Yuan","Xubo Liu","Xinhao Mei","Qiuqiang Kong","Yuping Wang","Wenwu Wang","Yuxuan Wang","Mark D. Plumbley"],"pdf_url":"https://arxiv.org/pdf/2308.05734v1.pdf","comment":"AudioLDM 2 project page is https://audioldm.github.io/audioldm2"},{"id":"http://arxiv.org/abs/2308.04132v2","updated":"2023-08-10T17:00:15Z","published":"2023-08-08T08:43:18Z","title":"Optimizing Adaptive Video Streaming with Human Feedback","summary":" Quality of Experience~(QoE)-driven adaptive bitrate (ABR) algorithms are\ntypically optimized using QoE models that are based on the mean opinion\nscore~(MOS), while such principles may not account for user heterogeneity on\nrating scales, resulting in unexpected behaviors. In this paper, we propose\nJade, which leverages reinforcement learning with human feedback~(RLHF)\ntechnologies to better align the users' opinion scores. Jade's rank-based QoE\nmodel considers relative values of user ratings to interpret the subjective\nperception of video sessions. We implement linear-based and Deep Neural Network\n(DNN)-based architectures for satisfying both accuracy and generalization\nability. 
We further propose entropy-aware reinforced mechanisms for training\npolicies with the integration of the proposed QoE models. Experimental results\ndemonstrate that Jade performs favorably on conventional metrics, such as\nquality and stall ratio, and improves QoE by 8.09%-38.13% in different network\nconditions, emphasizing the importance of user heterogeneity in QoE modeling\nand the potential of combining linear-based and DNN-based models for\nperformance improvement.\n","authors":["Tianchi Huang","Rui-Xiao Zhang","Chenglei Wu","Lifeng Sun"],"pdf_url":"https://arxiv.org/pdf/2308.04132v2.pdf","comment":"ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.05428v1","updated":"2023-08-10T08:42:20Z","published":"2023-08-10T08:42:20Z","title":"Speech-Driven 3D Face Animation with Composite and Regional Facial\n Movements","summary":" Speech-driven 3D face animation poses significant challenges due to the\nintricacy and variability inherent in human facial movements. This paper\nemphasizes the importance of considering both the composite and regional\nnatures of facial movements in speech-driven 3D face animation. The composite\nnature pertains to how speech-independent factors globally modulate\nspeech-driven facial movements along the temporal dimension. Meanwhile, the\nregional nature alludes to the notion that facial movements are not globally\ncorrelated but are actuated by local musculature along the spatial dimension.\nIt is thus indispensable to incorporate both natures for engendering vivid\nanimation. To address the composite nature, we introduce an adaptive modulation\nmodule that employs arbitrary facial movements to dynamically adjust\nspeech-driven facial movements across frames on a global scale. To accommodate\nthe regional nature, our approach ensures that each constituent of the facial\nfeatures for every frame focuses on the local spatial movements of 3D faces.\nMoreover, we present a non-autoregressive backbone for translating audio to 3D\nfacial movements, which maintains high-frequency nuances of facial movements\nand facilitates efficient inference. Comprehensive experiments and user studies\ndemonstrate that our method surpasses contemporary state-of-the-art approaches\nboth qualitatively and quantitatively.\n","authors":["Haozhe Wu","Songtao Zhou","Jia Jia","Junliang Xing","Qi Wen","Xiang Wen"],"pdf_url":"https://arxiv.org/pdf/2308.05428v1.pdf","comment":"Accepted by MM 2023, 9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.05421v1","updated":"2023-08-10T08:29:36Z","published":"2023-08-10T08:29:36Z","title":"Progressive Spatio-temporal Perception for Audio-Visual Question\n Answering","summary":" Audio-Visual Question Answering (AVQA) task aims to answer questions about\ndifferent visual objects, sounds, and their associations in videos. Such\nnaturally multi-modal videos are composed of rich and complex dynamic\naudio-visual components, where most of which could be unrelated to the given\nquestions, or even play as interference in answering the content of interest.\nOppositely, only focusing on the question-aware audio-visual content could get\nrid of influence, meanwhile enabling the model to answer more efficiently. In\nthis paper, we propose a Progressive Spatio-Temporal Perception Network\n(PSTP-Net), which contains three modules that progressively identify key\nspatio-temporal regions w.r.t. questions. Specifically, a temporal segment\nselection module is first introduced to select the most relevant audio-visual\nsegments related to the given question. 
Then, a spatial region selection module\nis utilized to choose the most relevant regions associated with the question\nfrom the selected temporal segments. To further refine the selection of\nfeatures, an audio-guided visual attention module is employed to perceive the\nassociation between auido and selected spatial regions. Finally, the\nspatio-temporal features from these modules are integrated for answering the\nquestion. Extensive experimental results on the public MUSIC-AVQA and AVQA\ndatasets provide compelling evidence of the effectiveness and efficiency of\nPSTP-Net. Code is available at:\n\\href{https://github.com/GeWu-Lab/PSTP-Net}{https://github.com/GeWu-Lab/PSTP-Net}\n","authors":["Guangyao Li","Wenxuan Hou","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2308.05421v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2207.03190v2","updated":"2023-08-10T08:06:05Z","published":"2022-07-07T09:44:44Z","title":"Learning Music-Dance Representations through Explicit-Implicit Rhythm\n Synchronization","summary":" Although audio-visual representation has been proved to be applicable in many\ndownstream tasks, the representation of dancing videos, which is more specific\nand always accompanied by music with complex auditory contents, remains\nchallenging and uninvestigated. Considering the intrinsic alignment between the\ncadent movement of dancer and music rhythm, we introduce MuDaR, a novel\nMusic-Dance Representation learning framework to perform the synchronization of\nmusic and dance rhythms both in explicit and implicit ways. Specifically, we\nderive the dance rhythms based on visual appearance and motion cues inspired by\nthe music rhythm analysis. Then the visual rhythms are temporally aligned with\nthe music counterparts, which are extracted by the amplitude of sound\nintensity. Meanwhile, we exploit the implicit coherence of rhythms implied in\naudio and visual streams by contrastive learning. The model learns the joint\nembedding by predicting the temporal consistency between audio-visual pairs.\nThe music-dance representation, together with the capability of detecting audio\nand visual rhythms, can further be applied to three downstream tasks: (a) dance\nclassification, (b) music-dance retrieval, and (c) music-dance retargeting.\nExtensive experiments demonstrate that our proposed framework outperforms other\nself-supervised methods by a large margin.\n","authors":["Jiashuo Yu","Junfu Pu","Ying Cheng","Rui Feng","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2207.03190v2.pdf","comment":"Accepted for publication in IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2304.02970v3","updated":"2023-08-10T04:08:44Z","published":"2023-04-06T09:54:06Z","title":"A Closer Look at Audio-Visual Semantic Segmentation","summary":" Audio-visual segmentation (AVS) is a complex task that involves accurately\nsegmenting the corresponding sounding object based on audio-visual queries.\nSuccessful audio-visual learning requires two essential components: 1) an\nunbiased dataset with high-quality pixel-level multi-class labels, and 2) a\nmodel capable of effectively linking audio information with its corresponding\nvisual object. However, these two requirements are only partially addressed by\ncurrent methods, with training sets containing biased audio-visual data, and\nmodels that generalise poorly beyond this biased training set. In this work, we\npropose a new strategy to build cost-effective and relatively unbiased\naudio-visual semantic segmentation benchmarks. 
Our strategy, called Visual\nPost-production (VPO), explores the observation that it is not necessary to\nhave explicit audio-visual pairs extracted from single video sources to build\nsuch benchmarks. We also refine the previously proposed AVSBench to transform\nit into the audio-visual semantic segmentation benchmark AVSBench-Single+.\nFurthermore, this paper introduces a new pixel-wise audio-visual contrastive\nlearning method to enable a better generalisation of the model beyond the\ntraining set. We verify the validity of the VPO strategy by showing that\nstate-of-the-art (SOTA) models trained with datasets built by matching audio\nand visual data from different sources or with datasets containing audio and\nvisual data from the same video source produce almost the same accuracy. Then,\nusing the proposed VPO benchmarks and AVSBench-Single+, we show that our method\nproduces more accurate audio-visual semantic segmentation than SOTA models.\nCode and dataset will be available.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03463v3","updated":"2023-08-10T02:26:16Z","published":"2023-08-07T10:41:52Z","title":"DiffSynth: Latent In-Iteration Deflickering for Realistic Video\n Synthesis","summary":" In recent years, diffusion models have emerged as the most powerful approach\nin image synthesis. However, applying these models directly to video synthesis\npresents challenges, as it often leads to noticeable flickering contents.\nAlthough recently proposed zero-shot methods can alleviate flicker to some\nextent, we still struggle to generate coherent videos. In this paper, we\npropose DiffSynth, a novel approach that aims to convert image synthesis\npipelines to video synthesis pipelines. DiffSynth consists of two key\ncomponents: a latent in-iteration deflickering framework and a video\ndeflickering algorithm. The latent in-iteration deflickering framework applies\nvideo deflickering to the latent space of diffusion models, effectively\npreventing flicker accumulation in intermediate steps. Additionally, we propose\na video deflickering algorithm, named patch blending algorithm, that remaps\nobjects in different frames and blends them together to enhance video\nconsistency. One of the notable advantages of DiffSynth is its general\napplicability to various video synthesis tasks, including text-guided video\nstylization, fashion video synthesis, image-guided video stylization, video\nrestoring, and 3D rendering. In the task of text-guided video stylization, we\nmake it possible to synthesize high-quality videos without cherry-picking. The\nexperimental results demonstrate the effectiveness of DiffSynth. All videos can\nbe viewed on our project page. Source codes will also be released.\n","authors":["Zhongjie Duan","Lizhou You","Chengyu Wang","Cen Chen","Ziheng Wu","Weining Qian","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2308.03463v3.pdf","comment":"9 pages, 6 figures"}]},"2023-08-11T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.06259v1","updated":"2023-08-11T17:47:54Z","published":"2023-08-11T17:47:54Z","title":"Self-Alignment with Instruction Backtranslation","summary":" We present a scalable method to build a high quality instruction following\nlanguage model by automatically labelling human-written text with corresponding\ninstructions. 
Our approach, named instruction backtranslation, starts with a\nlanguage model finetuned on a small amount of seed data, and a given web\ncorpus. The seed model is used to construct training examples by generating\ninstruction prompts for web documents (self-augmentation), and then selecting\nhigh quality examples from among these candidates (self-curation). This data is\nthen used to finetune a stronger model. Finetuning LLaMa on two iterations of\nour approach yields a model that outperforms all other LLaMa-based models on\nthe Alpaca leaderboard not relying on distillation data, demonstrating highly\neffective self-alignment.\n","authors":["Xian Li","Ping Yu","Chunting Zhou","Timo Schick","Luke Zettlemoyer","Omer Levy","Jason Weston","Mike Lewis"],"pdf_url":"https://arxiv.org/pdf/2308.06259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.06817v2","updated":"2023-08-11T17:45:27Z","published":"2022-12-13T18:55:15Z","title":"RT-1: Robotics Transformer for Real-World Control at Scale","summary":" By transferring knowledge from large, diverse, task-agnostic datasets, modern\nmachine learning models can solve specific downstream tasks either zero-shot or\nwith small task-specific datasets to a high level of performance. While this\ncapability has been demonstrated in other fields such as computer vision,\nnatural language processing or speech recognition, it remains to be shown in\nrobotics, where the generalization capabilities of the models are particularly\ncritical due to the difficulty of collecting real-world robotic data. We argue\nthat one of the keys to the success of such general robotic models lies with\nopen-ended task-agnostic training, combined with high-capacity architectures\nthat can absorb all of the diverse, robotic data. In this paper, we present a\nmodel class, dubbed Robotics Transformer, that exhibits promising scalable\nmodel properties. We verify our conclusions in a study of different model\nclasses and their ability to generalize as a function of the data size, model\nsize, and data diversity based on a large-scale data collection on real robots\nperforming real-world tasks. The project's website and videos can be found at\nrobotics-transformer1.github.io\n","authors":["Anthony Brohan","Noah Brown","Justice Carbajal","Yevgen Chebotar","Joseph Dabis","Chelsea Finn","Keerthana Gopalakrishnan","Karol Hausman","Alex Herzog","Jasmine Hsu","Julian Ibarz","Brian Ichter","Alex Irpan","Tomas Jackson","Sally Jesmonth","Nikhil J Joshi","Ryan Julian","Dmitry Kalashnikov","Yuheng Kuang","Isabel Leal","Kuang-Huei Lee","Sergey Levine","Yao Lu","Utsav Malla","Deeksha Manjunath","Igor Mordatch","Ofir Nachum","Carolina Parada","Jodilyn Peralta","Emily Perez","Karl Pertsch","Jornell Quiambao","Kanishka Rao","Michael Ryoo","Grecia Salazar","Pannag Sanketi","Kevin Sayed","Jaspiar Singh","Sumedh Sontakke","Austin Stone","Clayton Tan","Huong Tran","Vincent Vanhoucke","Steve Vega","Quan Vuong","Fei Xia","Ted Xiao","Peng Xu","Sichun Xu","Tianhe Yu","Brianna Zitkovich"],"pdf_url":"https://arxiv.org/pdf/2212.06817v2.pdf","comment":"See website at robotics-transformer1.github.io"},{"id":"http://arxiv.org/abs/2305.10615v2","updated":"2023-08-11T17:39:21Z","published":"2023-05-18T00:01:27Z","title":"ML-SUPERB: Multilingual Speech Universal PERformance Benchmark","summary":" Speech processing Universal PERformance Benchmark (SUPERB) is a leaderboard\nto benchmark the performance of Self-Supervised Learning (SSL) models on\nvarious speech processing tasks. 
However, SUPERB largely considers English\nspeech in its evaluation. This paper presents multilingual SUPERB (ML-SUPERB),\ncovering 143 languages (ranging from high-resource to endangered), and\nconsidering both automatic speech recognition and language identification.\nFollowing the concept of SUPERB, ML-SUPERB utilizes frozen SSL features and\nemploys a simple framework for multilingual tasks by learning a shallow\ndownstream model. Similar to the SUPERB benchmark, we find speech SSL models\ncan significantly improve performance compared to FBANK features. Furthermore,\nwe find that multilingual models do not always perform better than their\nmonolingual counterparts. We will release ML-SUPERB as a challenge with\norganized datasets and reproducible training scripts for future multilingual\nrepresentation research.\n","authors":["Jiatong Shi","Dan Berrebbi","William Chen","Ho-Lam Chung","En-Pei Hu","Wei Ping Huang","Xuankai Chang","Shang-Wen Li","Abdelrahman Mohamed","Hung-yi Lee","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2305.10615v2.pdf","comment":"Accepted by Interspeech"},{"id":"http://arxiv.org/abs/2308.06235v1","updated":"2023-08-11T17:08:14Z","published":"2023-08-11T17:08:14Z","title":"KETM:A Knowledge-Enhanced Text Matching method","summary":" Text matching is the task of matching two texts and determining the\nrelationship between them, which has extensive applications in natural language\nprocessing tasks such as reading comprehension, and Question-Answering systems.\nThe mainstream approach is to compute text representations or to interact with\nthe text through attention mechanism, which is effective in text matching\ntasks. However, the performance of these models is insufficient for texts that\nrequire commonsense knowledge-based reasoning. To this end, in this paper, We\nintroduce a new model for text matching called the Knowledge Enhanced Text\nMatching model (KETM), to enrich contextual representations with real-world\ncommon-sense knowledge from external knowledge sources to enhance our model\nunderstanding and reasoning. First, we use Wiktionary to retrieve the text word\ndefinitions as our external knowledge. Secondly, we feed text and knowledge to\nthe text matching module to extract their feature vectors. The text matching\nmodule is used as an interaction module by integrating the encoder layer, the\nco-attention layer, and the aggregation layer. Specifically, the interaction\nprocess is iterated several times to obtain in-depth interaction information\nand extract the feature vectors of text and knowledge by multi-angle pooling.\nThen, we fuse text and knowledge using a gating mechanism to learn the ratio of\ntext and knowledge fusion by a neural network that prevents noise generated by\nknowledge. After that, experimental validation on four datasets are carried\nout, and the experimental results show that our proposed model performs well on\nall four datasets, and the performance of our method is improved compared to\nthe base model without adding external knowledge, which validates the\neffectiveness of our proposed method. 
The code is available at\nhttps://github.com/1094701018/KETM\n","authors":["Kexin Jiang","Yahui Zhao","Guozhe Jin","Zhenguo Zhang","Rongyi Cui"],"pdf_url":"https://arxiv.org/pdf/2308.06235v1.pdf","comment":"Accepted to IJCNN 2023"},{"id":"http://arxiv.org/abs/2308.06212v1","updated":"2023-08-11T16:30:44Z","published":"2023-08-11T16:30:44Z","title":"A Large Language Model Enhanced Conversational Recommender System","summary":" Conversational recommender systems (CRSs) aim to recommend high-quality items\nto users through a dialogue interface. It usually contains multiple sub-tasks,\nsuch as user preference elicitation, recommendation, explanation, and item\ninformation search. To develop effective CRSs, there are some challenges: 1)\nhow to properly manage sub-tasks; 2) how to effectively solve different\nsub-tasks; and 3) how to correctly generate responses that interact with users.\nRecently, Large Language Models (LLMs) have exhibited an unprecedented ability\nto reason and generate, presenting a new opportunity to develop more powerful\nCRSs. In this work, we propose a new LLM-based CRS, referred to as LLMCRS, to\naddress the above challenges. For sub-task management, we leverage the\nreasoning ability of LLM to effectively manage sub-task. For sub-task solving,\nwe collaborate LLM with expert models of different sub-tasks to achieve the\nenhanced performance. For response generation, we utilize the generation\nability of LLM as a language interface to better interact with users.\nSpecifically, LLMCRS divides the workflow into four stages: sub-task detection,\nmodel matching, sub-task execution, and response generation. LLMCRS also\ndesigns schema-based instruction, demonstration-based instruction, dynamic\nsub-task and model matching, and summary-based generation to instruct LLM to\ngenerate desired results in the workflow. Finally, to adapt LLM to\nconversational recommendations, we also propose to fine-tune LLM with\nreinforcement learning from CRSs performance feedback, referred to as RLPF.\nExperimental results on benchmark datasets show that LLMCRS with RLPF\noutperforms the existing methods.\n","authors":["Yue Feng","Shuchang Liu","Zhenghai Xue","Qingpeng Cai","Lantao Hu","Peng Jiang","Kun Gai","Fei Sun"],"pdf_url":"https://arxiv.org/pdf/2308.06212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06207v1","updated":"2023-08-11T16:13:04Z","published":"2023-08-11T16:13:04Z","title":"Thinking Like an Expert:Multimodal Hypergraph-of-Thought (HoT) Reasoning\n to boost Foundation Modals","summary":" Reasoning ability is one of the most crucial capabilities of a foundation\nmodel, signifying its capacity to address complex reasoning tasks.\nChain-of-Thought (CoT) technique is widely regarded as one of the effective\nmethods for enhancing the reasoning ability of foundation models and has\ngarnered significant attention. However, the reasoning process of CoT is\nlinear, step-by-step, similar to personal logical reasoning, suitable for\nsolving general and slightly complicated problems. On the contrary, the\nthinking pattern of an expert owns two prominent characteristics that cannot be\nhandled appropriately in CoT, i.e., high-order multi-hop reasoning and\nmultimodal comparative judgement. Therefore, the core motivation of this paper\nis transcending CoT to construct a reasoning paradigm that can think like an\nexpert. The hyperedge of a hypergraph could connect various vertices, making it\nnaturally suitable for modelling high-order relationships. 
Inspired by this,\nthis paper innovatively proposes a multimodal Hypergraph-of-Thought (HoT)\nreasoning paradigm, which enables the foundation models to possess the\nexpert-level ability of high-order multi-hop reasoning and multimodal\ncomparative judgement. Specifically, a textual hypergraph-of-thought is\nconstructed utilizing triple as the primary thought to model higher-order\nrelationships, and a hyperedge-of-thought is generated through multi-hop\nwalking paths to achieve multi-hop inference. Furthermore, we devise a visual\nhypergraph-of-thought to interact with the textual hypergraph-of-thought via\nCross-modal Co-Attention Graph Learning for multimodal comparative\nverification. Experimentations on the ScienceQA benchmark demonstrate the\nproposed HoT-based T5 outperforms CoT-based GPT3.5 and chatGPT, which is on par\nwith CoT-based GPT4 with a lower model size.\n","authors":["Fanglong Yao","Changyuan Tian","Jintao Liu","Zequn Zhang","Qing Liu","Li Jin","Shuchao Li","Xiaoyu Li","Xian Sun"],"pdf_url":"https://arxiv.org/pdf/2308.06207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06199v1","updated":"2023-08-11T15:47:49Z","published":"2023-08-11T15:47:49Z","title":"Weakly Supervised Text Classification on Free Text Comments in\n Patient-Reported Outcome Measures","summary":" Free text comments (FTC) in patient-reported outcome measures (PROMs) data\nare typically analysed using manual methods, such as content analysis, which is\nlabour-intensive and time-consuming. Machine learning analysis methods are\nlargely unsupervised, necessitating post-analysis interpretation. Weakly\nsupervised text classification (WSTC) can be a valuable method of analysis to\nclassify domain-specific text data in which there is limited labelled data. In\nthis paper, we apply five WSTC techniques to FTC in PROMs data to identify\nhealth-related quality of life (HRQoL) themes reported by colorectal cancer\npatients. The WSTC methods label all the themes mentioned in the FTC. The\nresults showed moderate performance on the PROMs data, mainly due to the\nprecision of the models, and variation between themes. Evaluation of the\nclassification performance illustrated the potential and limitations of keyword\nbased WSTC to label PROMs FTC when labelled data is limited.\n","authors":["Anna-Grace Linton","Vania Dimitrova","Amy Downing","Richard Wagland","Adam Glaser"],"pdf_url":"https://arxiv.org/pdf/2308.06199v1.pdf","comment":"Accepted and presented at Health Text Analytics conference 2023 (UK)"},{"id":"http://arxiv.org/abs/2308.04255v2","updated":"2023-08-11T15:24:37Z","published":"2023-08-08T13:41:41Z","title":"CLASSLA-Stanza: The Next Step for Linguistic Processing of South Slavic\n Languages","summary":" We present CLASSLA-Stanza, a pipeline for automatic linguistic annotation of\nthe South Slavic languages, which is based on the Stanza natural language\nprocessing pipeline. We describe the main improvements in CLASSLA-Stanza with\nrespect to Stanza, and give a detailed description of the model training\nprocess for the latest 2.1 release of the pipeline. We also report performance\nscores produced by the pipeline for different languages and varieties.\nCLASSLA-Stanza exhibits consistently high performance across all the supported\nlanguages and outperforms or expands its parent pipeline Stanza at all the\nsupported tasks. 
We also present the pipeline's new functionality enabling\nefficient processing of web data and the reasons that led to its\nimplementation.\n","authors":["Luka Terčon","Nikola Ljubešić"],"pdf_url":"https://arxiv.org/pdf/2308.04255v2.pdf","comment":"17 pages, 14 tables, 1 figure; Typos corrected"},{"id":"http://arxiv.org/abs/2308.06175v1","updated":"2023-08-11T15:04:34Z","published":"2023-08-11T15:04:34Z","title":"Assessing Guest Nationality Composition from Hotel Reviews","summary":" Many hotels target guest acquisition efforts to specific markets in order to\nbest anticipate individual preferences and needs of their guests. Likewise,\nsuch strategic positioning is a prerequisite for efficient marketing budget\nallocation. Official statistics report on the number of visitors from different\ncountries, but no fine-grained information on the guest composition of\nindividual businesses exists. There is, however, growing interest in such data\nfrom competitors, suppliers, researchers and the general public. We demonstrate\nhow machine learning can be leveraged to extract references to guest\nnationalities from unstructured text reviews in order to dynamically assess and\nmonitor the dynamics of guest composition of individual businesses. In\nparticular, we show that a rather simple architecture of pre-trained embeddings\nand stacked LSTM layers provides a better performance-runtime tradeoff than\nmore complex state-of-the-art language models.\n","authors":["Fabian Gröger","Marc Pouly","Flavia Tinner","Leif Brandes"],"pdf_url":"https://arxiv.org/pdf/2308.06175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06165v1","updated":"2023-08-11T14:47:27Z","published":"2023-08-11T14:47:27Z","title":"Task Conditioned BERT for Joint Intent Detection and Slot-filling","summary":" Dialogue systems need to deal with the unpredictability of user intents to\ntrack dialogue state and the heterogeneity of slots to understand user\npreferences. In this paper we investigate the hypothesis that solving these\nchallenges as one unified model will allow the transfer of parameter support\ndata across the different tasks. The proposed principled model is based on a\nTransformer encoder, trained on multiple tasks, and leveraged by a rich input\nthat conditions the model on the target inferences. Conditioning the\nTransformer encoder on multiple target inferences over the same corpus, i.e.,\nintent and multiple slot types, allows learning richer language interactions\nthan a single-task model would be able to. In fact, experimental results\ndemonstrate that conditioning the model on an increasing number of dialogue\ninference tasks leads to improved results: on the MultiWOZ dataset, the joint\nintent and slot detection can be improved by 3.2\\% by conditioning on intent,\n10.8\\% by conditioning on slot and 14.4\\% by conditioning on both intent and\nslots. 
Moreover, on real conversations with Farfetch costumers, the proposed\nconditioned BERT can achieve high joint-goal and intent detection performance\nthroughout a dialogue.\n","authors":["Diogo Tavares","Pedro Azevedo","David Semedo","Ricardo Sousa","João Magalhães"],"pdf_url":"https://arxiv.org/pdf/2308.06165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14704v2","updated":"2023-08-11T14:17:56Z","published":"2023-06-26T13:54:47Z","title":"Ontology Enrichment from Texts: A Biomedical Dataset for Concept\n Discovery and Placement","summary":" Mentions of new concepts appear regularly in texts and require automated\napproaches to harvest and place them into Knowledge Bases (KB), e.g.,\nontologies and taxonomies. Existing datasets suffer from three issues, (i)\nmostly assuming that a new concept is pre-discovered and cannot support\nout-of-KB mention discovery; (ii) only using the concept label as the input\nalong with the KB and thus lacking the contexts of a concept label; and (iii)\nmostly focusing on concept placement w.r.t a taxonomy of atomic concepts,\ninstead of complex concepts, i.e., with logical operators. To address these\nissues, we propose a new benchmark, adapting MedMentions dataset (PubMed\nabstracts) with SNOMED CT versions in 2014 and 2017 under the Diseases\nsub-category and the broader categories of Clinical finding, Procedure, and\nPharmaceutical / biologic product. We provide usage on the evaluation with the\ndataset for out-of-KB mention discovery and concept placement, adapting recent\nLarge Language Model based methods.\n","authors":["Hang Dong","Jiaoyan Chen","Yuan He","Ian Horrocks"],"pdf_url":"https://arxiv.org/pdf/2306.14704v2.pdf","comment":"5 pages, 1 figure, accepted for CIKM 2023. The dataset, data\n construction scripts, and baseline implementation are available at\n https://zenodo.org/record/8228005 (Zenodo) and\n https://github.com/KRR-Oxford/OET (GitHub)"},{"id":"http://arxiv.org/abs/2308.06144v1","updated":"2023-08-11T14:06:41Z","published":"2023-08-11T14:06:41Z","title":"Identification of the Relevance of Comments in Codes Using Bag of Words\n and Transformer Based Models","summary":" The Forum for Information Retrieval (FIRE) started a shared task this year\nfor classification of comments of different code segments. This is binary text\nclassification task where the objective is to identify whether comments given\nfor certain code segments are relevant or not. The BioNLP-IISERB group at the\nIndian Institute of Science Education and Research Bhopal (IISERB) participated\nin this task and submitted five runs for five different models. The paper\npresents the overview of the models and other significant findings on the\ntraining corpus. The methods involve different feature engineering schemes and\ntext classification techniques. The performance of the classical bag of words\nmodel and transformer-based models were explored to identify significant\nfeatures from the given training corpus. We have explored different classifiers\nviz., random forest, support vector machine and logistic regression using the\nbag of words model. Furthermore, the pre-trained transformer based models like\nBERT, RoBERT and ALBERT were also used by fine-tuning them on the given\ntraining corpus. The performance of different such models over the training\ncorpus were reported and the best five models were implemented on the given\ntest corpus. 
The empirical results show that the bag of words model outperforms\nthe transformer based models, however, the performance of our runs are not\nreasonably well in both training and test corpus. This paper also addresses the\nlimitations of the models and scope for further improvement.\n","authors":["Sruthi S","Tanmay Basu"],"pdf_url":"https://arxiv.org/pdf/2308.06144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07189v3","updated":"2023-08-11T13:57:42Z","published":"2023-02-14T17:00:06Z","title":"Reveal the Unknown: Out-of-Knowledge-Base Mention Discovery with Entity\n Linking","summary":" Discovering entity mentions that are out of a Knowledge Base (KB) from texts\nplays a critical role in KB maintenance, but has not yet been fully explored.\nThe current methods are mostly limited to the simple threshold-based approach\nand feature-based classification, and the datasets for evaluation are\nrelatively rare. We propose BLINKout, a new BERT-based Entity Linking (EL)\nmethod which can identify mentions that do not have corresponding KB entities\nby matching them to a special NIL entity. To better utilize BERT, we propose\nnew techniques including NIL entity representation and classification, with\nsynonym enhancement. We also apply KB Pruning and Versioning strategies to\nautomatically construct out-of-KB datasets from common in-KB EL datasets.\nResults on five datasets of clinical notes, biomedical publications, and\nWikipedia articles in various domains show the advantages of BLINKout over\nexisting methods to identify out-of-KB mentions for the medical ontologies,\nUMLS, SNOMED CT, and the general KB, WikiData.\n","authors":["Hang Dong","Jiaoyan Chen","Yuan He","Yinan Liu","Ian Horrocks"],"pdf_url":"https://arxiv.org/pdf/2302.07189v3.pdf","comment":"11 pages, 3 figures, accepted for CIKM 2023"},{"id":"http://arxiv.org/abs/2302.14057v2","updated":"2023-08-11T13:48:44Z","published":"2023-02-25T10:12:34Z","title":"Cross-modal Contrastive Learning for Multimodal Fake News Detection","summary":" Automatic detection of multimodal fake news has gained a widespread attention\nrecently. Many existing approaches seek to fuse unimodal features to produce\nmultimodal news representations. However, the potential of powerful cross-modal\ncontrastive learning methods for fake news detection has not been well\nexploited. Besides, how to aggregate features from different modalities to\nboost the performance of the decision-making process is still an open question.\nTo address that, we propose COOLANT, a cross-modal contrastive learning\nframework for multimodal fake news detection, aiming to achieve more accurate\nimage-text alignment. To further improve the alignment precision, we leverage\nan auxiliary task to soften the loss term of negative samples during the\ncontrast process. A cross-modal fusion module is developed to learn the\ncross-modality correlations. An attention mechanism with an attention guidance\nmodule is implemented to help effectively and interpretably aggregate the\naligned unimodal representations and the cross-modality correlations. Finally,\nwe evaluate the COOLANT and conduct a comparative study on two widely used\ndatasets, Twitter and Weibo. 
The experimental results demonstrate that our\nCOOLANT outperforms previous approaches by a large margin and achieves new\nstate-of-the-art results on the two datasets.\n","authors":["Longzheng Wang","Chuang Zhang","Hongbo Xu","Yongxiu Xu","Xiaohan Xu","Siqi Wang"],"pdf_url":"https://arxiv.org/pdf/2302.14057v2.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.06125v1","updated":"2023-08-11T13:28:48Z","published":"2023-08-11T13:28:48Z","title":"Improving Joint Speech-Text Representations Without Alignment","summary":" The last year has seen astonishing progress in text-prompted image generation\npremised on the idea of a cross-modal representation space in which the text\nand image domains are represented jointly. In ASR, this idea has found\napplication as joint speech-text encoders that can scale to the capacities of\nvery large parameter models by being trained on both unpaired speech and text.\nWhile these methods show promise, they have required special treatment of the\nsequence-length mismatch inherent in speech and text, either by up-sampling\nheuristics or an explicit alignment model. In this work, we offer evidence that\njoint speech-text encoders naturally achieve consistent representations across\nmodalities by disregarding sequence length, and argue that consistency losses\ncould forgive length differences and simply assume the best alignment. We show\nthat such a loss improves downstream WER in both a large-parameter monolingual\nand multilingual system.\n","authors":["Cal Peyser","Zhong Meng","Ke Hu","Rohit Prabhavalkar","Andrew Rosenberg","Tara N. Sainath","Michael Picheny","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2308.06125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06112v1","updated":"2023-08-11T12:59:02Z","published":"2023-08-11T12:59:02Z","title":"Lip2Vec: Efficient and Robust Visual Speech Recognition via\n Latent-to-Latent Visual to Audio Representation Mapping","summary":" Visual Speech Recognition (VSR) differs from the common perception tasks as\nit requires deeper reasoning over the video sequence, even by human experts.\nDespite the recent advances in VSR, current approaches rely on labeled data to\nfully train or finetune their models predicting the target speech. This hinders\ntheir ability to generalize well beyond the training set and leads to\nperformance degeneration under out-of-distribution challenging scenarios.\nUnlike previous works that involve auxiliary losses or complex training\nprocedures and architectures, we propose a simple approach, named Lip2Vec that\nis based on learning a prior model. Given a robust visual speech encoder, this\nnetwork maps the encoded latent representations of the lip sequence to their\ncorresponding latents from the audio pair, which are sufficiently invariant for\neffective text decoding. The generated audio representation is then decoded to\ntext using an off-the-shelf Audio Speech Recognition (ASR) model. The proposed\nmodel compares favorably with fully-supervised learning methods on the LRS3\ndataset achieving 26 WER. Unlike SoTA approaches, our model keeps a reasonable\nperformance on the VoxCeleb test set. 
We believe that reprogramming the VSR as\nan ASR task narrows the performance gap between the two and paves the way for\nmore flexible formulations of lip reading.\n","authors":["Yasser Abdelaziz Dahou Djilali","Sanath Narayan","Haithem Boussaid","Ebtessam Almazrouei","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2308.06112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06111v1","updated":"2023-08-11T12:55:09Z","published":"2023-08-11T12:55:09Z","title":"Improving Zero-Shot Text Matching for Financial Auditing with Large\n Language Models","summary":" Auditing financial documents is a very tedious and time-consuming process. As\nof today, it can already be simplified by employing AI-based solutions to\nrecommend relevant text passages from a report for each legal requirement of\nrigorous accounting standards. However, these methods need to be fine-tuned\nregularly, and they require abundant annotated data, which is often lacking in\nindustrial environments. Hence, we present ZeroShotALI, a novel recommender\nsystem that leverages a state-of-the-art large language model (LLM) in\nconjunction with a domain-specifically optimized transformer-based\ntext-matching solution. We find that a two-step approach of first retrieving a\nnumber of best matching document sections per legal requirement with a custom\nBERT-based model and second filtering these selections using an LLM yields\nsignificant performance improvements over existing approaches.\n","authors":["Lars Hillebrand","Armin Berger","Tobias Deußer","Tim Dilmaghani","Mohamed Khaled","Bernd Kliem","Rüdiger Loitz","Maren Pielka","David Leonhard","Christian Bauckhage","Rafet Sifa"],"pdf_url":"https://arxiv.org/pdf/2308.06111v1.pdf","comment":"4 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2308.03043v2","updated":"2023-08-11T12:07:52Z","published":"2023-08-06T07:59:12Z","title":"3D-EX : A Unified Dataset of Definitions and Dictionary Examples","summary":" Definitions are a fundamental building block in lexicography, linguistics and\ncomputational semantics. In NLP, they have been used for retrofitting word\nembeddings or augmenting contextual representations in language models.\nHowever, lexical resources containing definitions exhibit a wide range of\nproperties, which has implications in the behaviour of models trained and\nevaluated on them. In this paper, we introduce 3D- EX , a dataset that aims to\nfill this gap by combining well-known English resources into one centralized\nknowledge repository in the form of triples. 3D- EX\nis a unified evaluation framework with carefully pre-computed\ntrain/validation/test splits to prevent memorization. We report experimental\nresults that suggest that this dataset could be effectively leveraged in\ndownstream NLP tasks. Code and data are available at\nhttps://github.com/F-Almeman/3D-EX .\n","authors":["Fatemah Almeman","Hadi Sheikhi","Luis Espinosa-Anke"],"pdf_url":"https://arxiv.org/pdf/2308.03043v2.pdf","comment":"11 pages (including references pages), 9 tables, and 1 figure. This\n paper is submitted to RANLP2023"},{"id":"http://arxiv.org/abs/2308.06095v1","updated":"2023-08-11T12:07:45Z","published":"2023-08-11T12:07:45Z","title":"Neural Conversation Models and How to Rein Them in: A Survey of Failures\n and Fixes","summary":" Recent conditional language models are able to continue any kind of text\nsource in an often seemingly fluent way. 
This fact encouraged research in the\narea of open-domain conversational systems that are based on powerful language\nmodels and aim to imitate an interlocutor by generating appropriate\ncontributions to a written dialogue. From a linguistic perspective, however,\nthe complexity of contributing to a conversation is high. In this survey, we\ninterpret Grice's maxims of cooperative conversation from the perspective of\nthis specific research area and systematize the literature under the aspect of\nwhat makes a contribution appropriate: A neural conversation model has to be\nfluent, informative, consistent, coherent, and follow social norms. In order to\nensure these qualities, recent approaches try to tame the underlying language\nmodels at various intervention points, such as data, training regime or\ndecoding. Sorted by these categories and intervention points, we discuss\npromising attempts and suggest novel ways for future research.\n","authors":["Fabian Galetzka","Anne Beyer","David Schlangen"],"pdf_url":"https://arxiv.org/pdf/2308.06095v1.pdf","comment":"Represents the state of the field in 2022; partially based on the\n first authors 2022 PhD thesis"},{"id":"http://arxiv.org/abs/2308.06077v1","updated":"2023-08-11T11:29:51Z","published":"2023-08-11T11:29:51Z","title":"Fly-Swat or Cannon? Cost-Effective Language Model Choice via\n Meta-Modeling","summary":" Generative language models (LMs) have become omnipresent across data science.\nFor a wide variety of tasks, inputs can be phrased as natural language prompts\nfor an LM, from whose output the solution can then be extracted. LM performance\nhas consistently been increasing with model size - but so has the monetary cost\nof querying the ever larger models. Importantly, however, not all inputs are\nequally hard: some require larger LMs for obtaining a satisfactory solution,\nwhereas for others smaller LMs suffice. Based on this fact, we design a\nframework for Cost-Effective Language Model Choice (CELMOC). Given a set of\ninputs and a set of candidate LMs, CELMOC judiciously assigns each input to an\nLM predicted to do well on the input according to a so-called meta-model,\naiming to achieve high overall performance at low cost. The cost-performance\ntrade-off can be flexibly tuned by the user. Options include, among others,\nmaximizing total expected performance (or the number of processed inputs) while\nstaying within a given cost budget, or minimizing total cost while processing\nall inputs. We evaluate CELMOC on 14 datasets covering five natural language\ntasks, using four candidate LMs of vastly different size and cost. With CELMOC,\nwe match the performance of the largest available LM while achieving a cost\nreduction of 63%. Via our publicly available library, researchers as well as\npractitioners can thus save large amounts of money without sacrificing\nperformance.\n","authors":["Marija Šakota","Maxime Peyrard","Robert West"],"pdf_url":"https://arxiv.org/pdf/2308.06077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16878v3","updated":"2023-08-11T11:25:49Z","published":"2022-11-30T10:25:24Z","title":"Transformers are Short Text Classifiers: A Study of Inductive Short Text\n Classifiers on Benchmarks and Real-world Datasets","summary":" Short text classification is a crucial and challenging aspect of Natural\nLanguage Processing. For this reason, there are numerous highly specialized\nshort text classifiers. 
However, in recent short text research, State of the\nArt (SOTA) methods for traditional text classification, particularly the pure\nuse of Transformers, have been unexploited. In this work, we examine the\nperformance of a variety of short text classifiers as well as the top\nperforming traditional text classifier. We further investigate the effects on\ntwo new real-world short text datasets in an effort to address the issue of\nbecoming overly dependent on benchmark datasets with a limited number of\ncharacteristics. Our experiments unambiguously demonstrate that Transformers\nachieve SOTA accuracy on short text classification tasks, raising the question\nof whether specialized short text techniques are necessary.\n","authors":["Fabian Karl","Ansgar Scherp"],"pdf_url":"https://arxiv.org/pdf/2211.16878v3.pdf","comment":"Accepted at CD-MAKE 2023"},{"id":"http://arxiv.org/abs/2106.07306v6","updated":"2023-08-11T10:46:29Z","published":"2021-06-14T11:23:59Z","title":"Constraining Linear-chain CRFs to Regular Languages","summary":" A major challenge in structured prediction is to represent the\ninterdependencies within output structures. When outputs are structured as\nsequences, linear-chain conditional random fields (CRFs) are a widely used\nmodel class which can learn \\textit{local} dependencies in the output. However,\nthe CRF's Markov assumption makes it impossible for CRFs to represent\ndistributions with \\textit{nonlocal} dependencies, and standard CRFs are unable\nto respect nonlocal constraints of the data (such as global arity constraints\non output labels). We present a generalization of CRFs that can enforce a broad\nclass of constraints, including nonlocal ones, by specifying the space of\npossible output structures as a regular language $\\mathcal{L}$. The resulting\nregular-constrained CRF (RegCCRF) has the same formal properties as a standard\nCRF, but assigns zero probability to all label sequences not in $\\mathcal{L}$.\nNotably, RegCCRFs can incorporate their constraints during training, while\nrelated models only enforce constraints during decoding. We prove that\nconstrained training is never worse than constrained decoding, and show\nempirically that it can be substantially better in practice. Additionally, we\ndemonstrate a practical benefit on downstream tasks by incorporating a RegCCRF\ninto a deep neural model for semantic role labeling, exceeding state-of-the-art\nresults on a standard dataset.\n","authors":["Sean Papay","Roman Klinger","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2106.07306v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06063v1","updated":"2023-08-11T10:35:53Z","published":"2023-08-11T10:35:53Z","title":"A Case Study on Context Encoding in Multi-Encoder based Document-Level\n Neural Machine Translation","summary":" Recent studies have shown that the multi-encoder models are agnostic to the\nchoice of context, and the context encoder generates noise which helps improve\nthe models in terms of BLEU score. In this paper, we further explore this idea\nby evaluating with context-aware pronoun translation test set by training\nmulti-encoder models trained on three different context settings viz, previous\ntwo sentences, random two sentences, and a mix of both as context.\nSpecifically, we evaluate the models on the ContraPro test set to study how\ndifferent contexts affect pronoun translation accuracy. The results show that\nthe model can perform well on the ContraPro test set even when the context is\nrandom. 
We also analyze the source representations to study whether the context\nencoder generates noise. Our analysis shows that the context encoder provides\nsufficient information to learn discourse-level information. Additionally, we\nobserve that mixing the selected context (the previous two sentences in this\ncase) and the random context is generally better than the other settings.\n","authors":["Ramakrishna Appicharla","Baban Gain","Santanu Pal","Asif Ekbal"],"pdf_url":"https://arxiv.org/pdf/2308.06063v1.pdf","comment":"Accepted to MT Summit 2023 (oral)"},{"id":"http://arxiv.org/abs/2303.16618v2","updated":"2023-08-11T10:01:35Z","published":"2023-03-29T12:19:23Z","title":"Personalised Language Modelling of Screen Characters Using Rich Metadata\n Annotations","summary":" Language models that are sensitive to external context can more effectively\ncapture the speaking patterns of individuals with specific characteristics or\nin particular environments. However, obtaining and leveraging such annotations\ncan be challenging. In this work, we show how to leverage rich character and\nfilm annotations to personalise language models in a scalable manner. Our best\nmodel can reduce perplexity by up to 6.5% compared to a parameter-matched\nlanguage model. Our approach performs on par with speaker-specific fine-tuning\nwhen the fine-tuning data (i.e. past dialogue) for individual speakers is\navailable. On top of that, it also generalises well to a scenario with no such\ndata, relying on combinations of demographic characteristics expressed via\nmetadata. Our findings are consistent across two corpora, one of which is also\na contribution of this paper: Cornell-rich contains rich manual annotations for\n863 speaking characters from the Cornell Movie Dialog Corpus, including\nfeatures such as characteristic quotes and character descriptions, along with\nsix automatically extracted metadata features for over 95% of the featured\nfilms. Finally, we also present a cost-benefit analysis highlighting which\nannotations are most cost-effective in reducing perplexity.\n","authors":["Sebastian Vincent","Rowanne Sumner","Alice Dowek","Charlotte Blundell","Emily Preston","Chris Bayliss","Chris Oakley","Carolina Scarton"],"pdf_url":"https://arxiv.org/pdf/2303.16618v2.pdf","comment":"9 pages; 4 figures; 6 tables. Preprint"},{"id":"http://arxiv.org/abs/2303.08032v2","updated":"2023-08-11T09:59:07Z","published":"2023-03-14T16:11:47Z","title":"Verifying the Robustness of Automatic Credibility Assessment","summary":" Text classification methods have been widely investigated as a way to detect\ncontent of low credibility: fake news, social media bots, propaganda, etc.\nQuite accurate models (likely based on deep neural networks) help in moderating\npublic electronic platforms and often cause content creators to face rejection\nof their submissions or removal of already published texts. Having the\nincentive to evade further detection, content creators try to come up with a\nslightly modified version of the text (known as an attack with an adversarial\nexample) that exploit the weaknesses of classifiers and result in a different\noutput. Here we systematically test the robustness of popular text classifiers\nagainst available attacking techniques and discover that, indeed, in some cases\ninsignificant changes in input text can mislead the models. 
We also introduce\nBODEGA: a benchmark for testing both victim models and attack methods on four\nmisinformation detection tasks in an evaluation framework designed to simulate\nreal use-cases of content moderation. Finally, we manually analyse a subset of\nadversarial examples and check what kinds of modifications are used in\nsuccessful attacks. The BODEGA code and data are openly shared in the hope of\nenhancing the comparability and replicability of further research in this area.\n","authors":["Piotr Przybyła","Alexander Shvets","Horacio Saggion"],"pdf_url":"https://arxiv.org/pdf/2303.08032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06039v1","updated":"2023-08-11T09:36:33Z","published":"2023-08-11T09:36:33Z","title":"Learning to Guide Human Experts via Personalized Large Language Models","summary":" In learning to defer, a predictor identifies risky decisions and defers them\nto a human expert. One key issue with this setup is that the expert may end up\nover-relying on the machine's decisions, due to anchoring bias. At the same\ntime, whenever the machine chooses the deferral option the expert has to take\ndecisions entirely unassisted. As a remedy, we propose learning to guide (LTG),\nan alternative framework in which -- rather than suggesting ready-made\ndecisions -- the machine provides guidance useful for decision-making, and\nthe human is entirely responsible for coming up with a decision. We also\nintroduce SLOG, an LTG implementation that leverages (a small amount of) human\nsupervision to convert a generic large language model into a module capable of\ngenerating textual guidance, and present preliminary but promising results on a\nmedical diagnosis task.\n","authors":["Debodeep Banerjee","Stefano Teso","Andrea Passerini"],"pdf_url":"https://arxiv.org/pdf/2308.06039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06035v1","updated":"2023-08-11T09:30:07Z","published":"2023-08-11T09:30:07Z","title":"Evidence of Human-Like Visual-Linguistic Integration in Multimodal Large\n Language Models During Predictive Language Processing","summary":" The advanced language processing abilities of large language models (LLMs)\nhave stimulated debate over their capacity to replicate human-like cognitive\nprocesses. One differentiating factor between language processing in LLMs and\nhumans is that language input is often grounded in more than one perceptual\nmodality, whereas most LLMs process solely text-based information. Multimodal\ngrounding allows humans to integrate, e.g., visual context with linguistic\ninformation and thereby place constraints on the space of upcoming words,\nreducing cognitive load and improving perception and comprehension. Recent\nmultimodal LLMs (mLLMs) combine visual and linguistic embedding spaces with a\ntransformer-type attention mechanism for next-word prediction. To what extent\ndoes predictive language processing based on multimodal input align in mLLMs\nand humans? To answer this question, 200 human participants watched short\naudio-visual clips and estimated the predictability of an upcoming verb or\nnoun. The same clips were processed by the mLLM CLIP, with predictability\nscores based on a comparison of image and text feature vectors. Eye-tracking\nwas used to estimate what visual features participants attended to, and CLIP's\nvisual attention weights were recorded. We find that human estimates of\npredictability align significantly with CLIP scores, but not for a unimodal LLM\nof comparable parameter size. 
Further, alignment vanished when CLIP's visual\nattention weights were perturbed, and when the same input was fed to a\nmultimodal model without attention. Analysing attention patterns, we find a\nsignificant spatial overlap between CLIP's visual attention weights and human\neye-tracking data. Results suggest that comparable processes of integrating\nmultimodal information, guided by attention to relevant visual features,\nsupport predictive language processing in mLLMs and humans.\n","authors":["Viktor Kewenig","Christopher Edwards","Quitterie Lacome DEstalenx","Akilles Rechardt","Jeremy I Skipper","Gabriella Vigliocco"],"pdf_url":"https://arxiv.org/pdf/2308.06035v1.pdf","comment":"13 pages, 4 figures, submitted to journal"},{"id":"http://arxiv.org/abs/2308.06032v1","updated":"2023-08-11T09:23:11Z","published":"2023-08-11T09:23:11Z","title":"Large Language Models in Cryptocurrency Securities Cases: Can ChatGPT\n Replace Lawyers?","summary":" Large Language Models (LLMs) could enhance access to the legal system.\nHowever, empirical research on their effectiveness in conducting legal tasks is\nscant. We study securities cases involving cryptocurrencies as one of numerous\ncontexts where AI could support the legal process, studying LLMs' legal\nreasoning and drafting capabilities. We examine whether a) an LLM can\naccurately determine which laws are potentially being violated from a fact\npattern, and b) whether there is a difference in juror decision-making based on\ncomplaints written by a lawyer compared to an LLM. We feed fact patterns from\nreal-life cases to GPT-3.5 and evaluate its ability to determine correct\npotential violations from the scenario and exclude spurious violations. Second,\nwe had mock jurors assess complaints written by the LLM and lawyers. GPT-3.5's\nlegal reasoning skills proved weak, though we expect improvement in future\nmodels, particularly given the violations it suggested tended to be correct (it\nmerely missed additional, correct violations). GPT-3.5 performed better at\nlegal drafting, and jurors' decisions were not statistically significantly\nassociated with the author of the document upon which they based their\ndecisions. Because LLMs cannot satisfactorily conduct legal reasoning tasks,\nthey would be unable to replace lawyers at this stage. However, their drafting\nskills (though, perhaps, still inferior to lawyers') could provide access to\njustice for more individuals by reducing the cost of legal services. Our\nresearch is the first to systematically study LLMs' legal drafting and\nreasoning capabilities in litigation, as well as in securities law and\ncryptocurrency-related misconduct.\n","authors":["Arianna Trozze","Toby Davies","Bennett Kleinberg"],"pdf_url":"https://arxiv.org/pdf/2308.06032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12267v3","updated":"2023-08-11T09:18:51Z","published":"2023-07-23T08:47:51Z","title":"Towards Automatic Boundary Detection for Human-AI Collaborative Hybrid\n Essay in Education","summary":" The recent large language models (LLMs), e.g., ChatGPT, have been able to\ngenerate human-like and fluent responses when provided with specific\ninstructions. 
While admitting the convenience brought by technological\nadvancement, educators also have concerns that students might leverage LLMs to\ncomplete their writing assignments and pass them off as their original work.\nAlthough many AI content detection studies have been conducted as a result of\nsuch concerns, most of these prior studies modeled AI content detection as a\nclassification problem, assuming that a text is either entirely human-written\nor entirely AI-generated. In this study, we investigated AI content detection\nin a rarely explored yet realistic setting where the text to be detected is\ncollaboratively written by human and generative LLMs (i.e., hybrid text). We\nfirst formalized the detection task as identifying the transition points\nbetween human-written content and AI-generated content from a given hybrid text\n(boundary detection). Then we proposed a two-step approach where we (1)\nseparated AI-generated content from human-written content during the encoder\ntraining process; and (2) calculated the distances between every two adjacent\nprototypes and assumed that the boundaries exist between the two adjacent\nprototypes that have the furthest distance from each other. Through extensive\nexperiments, we observed the following main findings: (1) the proposed approach\nconsistently outperformed the baseline methods across different experiment\nsettings; (2) the encoder training process can significantly boost the\nperformance of the proposed approach; (3) when detecting boundaries for\nsingle-boundary hybrid essays, the proposed approach could be enhanced by\nadopting a relatively large prototype size, leading to a 22% improvement in the\nIn-Domain evaluation and an 18% improvement in the Out-of-Domain evaluation.\n","authors":["Zijie Zeng","Lele Sha","Yuheng Li","Kaixun Yang","Dragan Gašević","Guanliang Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12267v3.pdf","comment":"9 pages including references, 2 figures"},{"id":"http://arxiv.org/abs/2308.06017v1","updated":"2023-08-11T08:47:52Z","published":"2023-08-11T08:47:52Z","title":"Optimizing transformer-based machine translation model for single GPU\n training: a hyperparameter ablation study","summary":" In machine translation tasks, the relationship between model complexity and\nperformance is often presumed to be linear, driving an increase in the number\nof parameters and consequent demands for computational resources like multiple\nGPUs. To explore this assumption, this study systematically investigates the\neffects of hyperparameters through ablation on a sequence-to-sequence machine\ntranslation pipeline, utilizing a single NVIDIA A100 GPU. Contrary to\nexpectations, our experiments reveal that combinations with the most parameters\nwere not necessarily the most effective. This unexpected insight prompted a\ncareful reduction in parameter sizes, uncovering \"sweet spots\" that enable\ntraining sophisticated models on a single GPU without compromising translation\nquality. The findings demonstrate an intricate relationship between\nhyperparameter selection, model size, and computational resource needs. The\ninsights from this study contribute to the ongoing efforts to make machine\ntranslation more accessible and cost-effective, emphasizing the importance of\nprecise hyperparameter tuning over mere scaling.\n","authors":["Luv Verma","Ketaki N. 
Kolhatkar"],"pdf_url":"https://arxiv.org/pdf/2308.06017v1.pdf","comment":"12 pages, 15 figures, 1 Table"},{"id":"http://arxiv.org/abs/2307.03104v3","updated":"2023-08-11T08:12:50Z","published":"2023-07-06T16:26:34Z","title":"Efficient Domain Adaptation of Sentence Embeddings Using Adapters","summary":" Sentence embeddings enable us to capture the semantic similarity of short\ntexts. Most sentence embedding models are trained for general semantic textual\nsimilarity (STS) tasks. Therefore, to use sentence embeddings in a particular\ndomain, the model must be adapted to it in order to achieve good results.\nUsually, this is done by fine-tuning the entire sentence embedding model for\nthe domain of interest. While this approach yields state-of-the-art results,\nall of the model's weights are updated during fine-tuning, making this method\nresource-intensive. Therefore, instead of fine-tuning entire sentence embedding\nmodels for each target domain individually, we propose to train lightweight\nadapters. These domain-specific adapters do not require fine-tuning all\nunderlying sentence embedding model parameters. Instead, we only train a small\nnumber of additional parameters while keeping the weights of the underlying\nsentence embedding model fixed. Training domain-specific adapters allows always\nusing the same base model and only exchanging the domain-specific adapters to\nadapt sentence embeddings to a specific domain. We show that using adapters for\nparameter-efficient domain adaptation of sentence embeddings yields competitive\nperformance within 1% of a domain-adapted, entirely fine-tuned sentence\nembedding model while only training approximately 3.6% of the parameters.\n","authors":["Tim Schopf","Dennis N. Schneider","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.03104v3.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2308.05481v2","updated":"2023-08-11T07:55:19Z","published":"2023-08-10T10:12:43Z","title":"LLM As DBA","summary":" Database administrators (DBAs) play a crucial role in managing, maintaining\nand optimizing a database system to ensure data availability, performance, and\nreliability. However, it is hard and tedious for DBAs to manage a large number\nof database instances (e.g., millions of instances on the cloud databases).\nRecently large language models (LLMs) have shown great potential to understand\nvaluable documents and accordingly generate reasonable answers. Thus, we\npropose D-Bot, a LLM-based database administrator that can continuously acquire\ndatabase maintenance experience from textual sources, and provide reasonable,\nwell-founded, in-time diagnosis and optimization advice for target databases.\nThis paper presents a revolutionary LLM-centric framework for database\nmaintenance, including (i) database maintenance knowledge detection from\ndocuments and tools, (ii) tree of thought reasoning for root cause analysis,\nand (iii) collaborative diagnosis among multiple LLMs. 
Our preliminary\nexperimental results show that D-Bot can efficiently and effectively diagnose the\nroot causes, and our code is available at\ngithub.com/TsinghuaDatabaseGroup/DB-GPT.\n","authors":["Xuanhe Zhou","Guoliang Li","Zhiyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.05481v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05973v1","updated":"2023-08-11T07:16:49Z","published":"2023-08-11T07:16:49Z","title":"Tweet Sentiment Extraction using Viterbi Algorithm with Transfer\n Learning","summary":" Tweet sentiment extraction extracts the most significant portion of the\nsentence, determining whether the sentiment is positive or negative. This\nresearch aims to identify the part of tweet sentences that strikes any emotion.\nTo reach this objective, we continue improving the Viterbi algorithm previously\nmodified by the author to make it able to receive pre-trained model parameters.\nWe introduce the confidence score and vector as two indicators responsible for\nevaluating the model internally before assessing the final results. We then\npresent a method to fine-tune this nonparametric model. We found that the model\ngets highly explainable as the confidence score vector reveals precisely where\nthe least confident predicted states are and if the approved modifications\nameliorate the confidence score or if the tuning is going in the wrong\ndirection.\n","authors":["Zied Baklouti"],"pdf_url":"https://arxiv.org/pdf/2308.05973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05935v1","updated":"2023-08-11T04:36:26Z","published":"2023-08-11T04:36:26Z","title":"LittleMu: Deploying an Online Virtual Teaching Assistant via\n Heterogeneous Sources Integration and Chain of Teach Prompts","summary":" Teaching assistants have played essential roles in the long history of\neducation. However, few MOOC platforms are providing human or virtual teaching\nassistants to support learning for massive online students due to the\ncomplexity of real-world online education scenarios and the lack of training\ndata. In this paper, we present a virtual MOOC teaching assistant, LittleMu\nwith minimum labeled training data, to provide question answering and chit-chat\nservices. Consisting of two interactive modules of heterogeneous retrieval and\nlanguage model prompting, LittleMu first integrates structural, semi- and\nunstructured knowledge sources to support accurate answers for a wide range of\nquestions. Then, we design delicate demonstrations named \"Chain of Teach\"\nprompts to exploit the large-scale pre-trained model to handle complex\nuncollected questions. Besides question answering, we develop other\neducational services such as knowledge-grounded chit-chat. We test the system's\nperformance via both offline evaluation and online deployment. Since May 2020,\nour LittleMu system has served over 80,000 users with over 300,000 queries from\nover 500 courses on the XuetangX MOOC platform, which continuously contributes to a\nmore convenient and fair education. 
Our code, services, and dataset will be\navailable at https://github.com/THU-KEG/VTA.\n","authors":["Shangqing Tu","Zheyuan Zhang","Jifan Yu","Chunyang Li","Siyu Zhang","Zijun Yao","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2308.05935v1.pdf","comment":"7 pages, 3 figures, Accepted by CIKM 23"},{"id":"http://arxiv.org/abs/2308.04823v2","updated":"2023-08-11T04:17:33Z","published":"2023-08-09T09:22:56Z","title":"Evaluating the Generation Capabilities of Large Chinese Language Models","summary":" This paper presents CG-Eval, the first comprehensive evaluation of the\ngeneration capabilities of large Chinese language models across a wide range of\nacademic disciplines. The models' performance was assessed based on their\nability to generate accurate and relevant responses to different types of\nquestions in six disciplines, namely, Science and Engineering, Humanities and\nSocial Sciences, Mathematical Calculations, Medical Practitioner Qualification\nExamination, Judicial Examination, and Certified Public Accountant Examination.\nThis paper also presents Gscore, a composite index derived from the weighted\nsum of multiple metrics to measure the quality of a model's generation against a\nreference. The test data and test results can be found at\nhttp://cgeval.besteasy.com/.\n","authors":["Hui Zeng","Jingyuan Xue","Meng Hao","Chen Sun","Bin Ning","Na Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.04823v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.00732v3","updated":"2023-08-11T04:00:59Z","published":"2022-10-28T12:54:30Z","title":"Kuaipedia: a Large-scale Multi-modal Short-video Encyclopedia","summary":" Online encyclopedias, such as Wikipedia, have been well-developed and\nresearched in the last two decades. One can find any attributes or other\ninformation of a wiki item on a wiki page edited by a community of volunteers.\nHowever, the traditional text, images and tables can hardly express some\naspects of a wiki item. For example, when we talk about ``Shiba Inu'', one may\ncare more about ``How to feed it'' or ``How to train it not to protect its\nfood''. Currently, short-video platforms have become a hallmark in the online\nworld. Whether you're on TikTok, Instagram, Kuaishou, or YouTube Shorts,\nshort-video apps have changed how we consume and create content today. Besides\nproducing short videos for entertainment, we can find more and more authors\nsharing insightful knowledge widely across all walks of life. These short\nvideos, which we call knowledge videos, can easily express any aspects (e.g.\nhair or how-to-feed) consumers want to know about an item (e.g. Shiba Inu), and\nthey can be systematically analyzed and organized like an online encyclopedia.\nIn this paper, we propose Kuaipedia, a large-scale multi-modal encyclopedia\nconsisting of items, aspects, and short videos linked to them, which was\nextracted from billions of videos of Kuaishou (Kwai), a well-known short-video\nplatform in China. We first collected items from multiple sources and mined\nuser-centered aspects from millions of users' queries to build an item-aspect\ntree. Then we propose a new task called ``multi-modal item-aspect linking'' as\nan expansion of ``entity linking'' to link short videos into item-aspect pairs\nand build the whole short-video encyclopedia. Intrinsic evaluations show that\nour encyclopedia is of large scale and highly accurate. 
We also conduct\nsufficient extrinsic experiments to show how Kuaipedia can help fundamental\napplications such as entity typing and entity linking.\n","authors":["Haojie Pan","Zepeng Zhai","Yuzhou Zhang","Ruiji Fu","Ming Liu","Yangqiu Song","Zhongyuan Wang","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2211.00732v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05476v2","updated":"2023-08-11T02:50:00Z","published":"2023-08-10T10:07:00Z","title":"Exploring Machine Learning and Transformer-based Approaches for\n Deceptive Text Classification: A Comparative Analysis","summary":" Deceptive text classification is a critical task in natural language\nprocessing that aims to identify deceptive or fraudulent content. This study\npresents a comparative analysis of machine learning and transformer-based\napproaches for deceptive text classification. We investigate the effectiveness\nof traditional machine learning algorithms and state-of-the-art transformer\nmodels, such as BERT, XLNET, DistilBERT, and RoBERTa, in detecting deceptive\ntext. A labeled dataset consisting of deceptive and non-deceptive texts is used\nfor training and evaluation purposes. Through extensive experimentation, we\ncompare the performance metrics, including accuracy, precision, recall, and F1\nscore, of the different approaches. The results of this study shed light on the\nstrengths and limitations of machine learning and transformer-based methods for\ndeceptive text classification, enabling researchers and practitioners to make\ninformed decisions when dealing with deceptive content.\n","authors":["Anusuya Krishnan"],"pdf_url":"https://arxiv.org/pdf/2308.05476v2.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.02463v2","updated":"2023-08-11T02:19:33Z","published":"2023-08-04T17:00:38Z","title":"Towards Generalist Foundation Model for Radiology","summary":" In this study, we aim to initiate the development of a Radiology Foundation\nModel, termed RadFM. We consider the construction of foundational models from\nthe perspectives of data, model design, and evaluation thoroughly. Our\ncontributions can be summarized as follows: (i) we construct a large-scale\nMedical Multi-modal Dataset, MedMD, consisting of 16M 2D and 3D medical scans.\nTo the best of our knowledge, this is the first multi-modal dataset containing\n3D medical scans. (ii) We propose an architecture that enables visually\nconditioned generative pre-training, allowing for the integration of text input\ninterleaved with 2D or 3D medical scans to generate responses for diverse\nradiologic tasks. The model was initially pre-trained on MedMD and subsequently\ndomain-specifically fine-tuned on RadMD, a radiologic cleaned version of MedMD,\ncontaining 3M radiologic visual-language pairs. (iii) We propose a new\nevaluation benchmark that comprises five tasks, aiming to comprehensively\nassess the capability of foundation models in handling practical clinical\nproblems. Our experimental results confirm that RadFM significantly outperforms\nexisting multi-modal foundation models. 
The codes, data, and model checkpoint\nwill all be made publicly available to promote further research and development\nin the field.\n","authors":["Chaoyi Wu","Xiaoman Zhang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2308.02463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09927v2","updated":"2023-08-11T02:04:46Z","published":"2023-06-16T15:50:03Z","title":"Trained Transformers Learn Linear Models In-Context","summary":" Attention-based neural networks such as transformers have demonstrated a\nremarkable ability to exhibit in-context learning (ICL): Given a short prompt\nsequence of tokens from an unseen task, they can formulate relevant per-token\nand next-token predictions without any parameter updates. By embedding a\nsequence of labeled training data and unlabeled test data as a prompt, this\nallows for transformers to behave like supervised learning algorithms. Indeed,\nrecent work has shown that when training transformer architectures over random\ninstances of linear regression problems, these models' predictions mimic those\nof ordinary least squares.\n Towards understanding the mechanisms underlying this phenomenon, we\ninvestigate the dynamics of ICL in transformers with a single linear\nself-attention layer trained by gradient flow on linear regression tasks. We\nshow that despite non-convexity, gradient flow with a suitable random\ninitialization finds a global minimum of the objective function. At this global\nminimum, when given a test prompt of labeled examples from a new prediction\ntask, the transformer achieves prediction error competitive with the best\nlinear predictor over the test prompt distribution. We additionally\ncharacterize the robustness of the trained transformer to a variety of\ndistribution shifts and show that although a number of shifts are tolerated,\nshifts in the covariate distribution of the prompts are not. Motivated by this,\nwe consider a generalized ICL setting where the covariate distributions can\nvary across prompts. We show that although gradient flow succeeds at finding a\nglobal minimum in this setting, the trained transformer is still brittle under\nmild covariate shifts. We complement this finding with experiments on large,\nnonlinear transformer architectures which we show are more robust under\ncovariate shifts.\n","authors":["Ruiqi Zhang","Spencer Frei","Peter L. Bartlett"],"pdf_url":"https://arxiv.org/pdf/2306.09927v2.pdf","comment":"50 pages, experiments added, reference added, typo corrected"},{"id":"http://arxiv.org/abs/2308.05884v1","updated":"2023-08-11T00:33:26Z","published":"2023-08-11T00:33:26Z","title":"PIPPA: A Partially Synthetic Conversational Dataset","summary":" With the emergence of increasingly powerful large language models, there is a\nburgeoning interest in leveraging these models for casual conversation and\nrole-play applications. However, existing conversational and role-playing\ndatasets often fail to capture the diverse and nuanced interactions typically\nexhibited by real-world role-play participants. To address this limitation and\ncontribute to the rapidly growing field, we introduce a partially-synthetic\ndataset named PIPPA (Personal Interaction Pairs between People and AI). PIPPA\nis a result of a community-driven crowdsourcing effort involving a group of\nrole-play enthusiasts. 
The dataset comprises over 1 million utterances that are\ndistributed across 26,000 conversation sessions and provides a rich resource\nfor researchers and AI developers to explore and refine conversational AI\nsystems in the context of role-play scenarios.\n","authors":["Tear Gosling","Alpin Dale","Yinhe Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.05884v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.06391v1","updated":"2023-08-11T21:17:13Z","published":"2023-08-11T21:17:13Z","title":"Dynamic Planning with a LLM","summary":" While Large Language Models (LLMs) can solve many NLP tasks in zero-shot\nsettings, applications involving embodied agents remain problematic. In\nparticular, complex plans that require multi-step reasoning become difficult\nand too costly as the context window grows. Planning requires understanding the\nlikely effects of one's actions and identifying whether the current environment\nsatisfies the goal state. While symbolic planners find optimal solutions\nquickly, they require a complete and accurate representation of the planning\nproblem, severely limiting their use in practical scenarios. In contrast,\nmodern LLMs cope with noisy observations and high levels of uncertainty when\nreasoning about a task. Our work presents LLM Dynamic Planner (LLM-DP): a\nneuro-symbolic framework where an LLM works hand-in-hand with a traditional\nplanner to solve an embodied task. Given action-descriptions, LLM-DP solves\nAlfworld faster and more efficiently than a naive LLM ReAct baseline.\n","authors":["Gautier Dagan","Frank Keller","Alex Lascarides"],"pdf_url":"https://arxiv.org/pdf/2308.06391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06385v1","updated":"2023-08-11T20:59:31Z","published":"2023-08-11T20:59:31Z","title":"ZYN: Zero-Shot Reward Models with Yes-No Questions","summary":" In this work, we address the problem of directing the text generations of a\nLLM towards a desired behavior, aligning the generated text with the\npreferences of the human operator. We propose using another language model as a\ncritic, reward model in a zero-shot way thanks to the prompt of a Yes-No\nquestion that represents the user preferences, without requiring further\nlabeled data. This zero-shot reward model provides the learning signal to\nfurther fine-tune the base LLM using reinforcement learning, as in RLAIF; yet\nour approach is also compatible in other contexts such as quality-diversity\nsearch. Extensive evidence of the capabilities of the proposed ZYN framework is\nprovided through experiments in different domains related to text generation,\nincluding detoxification; optimizing sentiment of movie reviews, or any other\nattribute; steering the opinion about a particular topic the model may have;\nand personalizing prompt generators for text-to-image tasks. Code to be\nreleased at \\url{https://github.com/vicgalle/zero-shot-reward-models/}.\n","authors":["Victor Gallego"],"pdf_url":"https://arxiv.org/pdf/2308.06385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06374v1","updated":"2023-08-11T20:16:57Z","published":"2023-08-11T20:16:57Z","title":"Large Language Models and Knowledge Graphs: Opportunities and Challenges","summary":" Large Language Models (LLMs) have taken Knowledge Representation -- and the\nworld -- by storm. This inflection point marks a shift from explicit knowledge\nrepresentation to a renewed focus on the hybrid representation of both explicit\nknowledge and parametric knowledge. 
In this position paper, we will discuss\nsome of the common debate points within the community on LLMs (parametric\nknowledge) and Knowledge Graphs (explicit knowledge) and speculate on\nopportunities and visions that the renewed focus brings, as well as related\nresearch topics and challenges.\n","authors":["Jeff Z. Pan","Simon Razniewski","Jan-Christoph Kalo","Sneha Singhania","Jiaoyan Chen","Stefan Dietze","Hajira Jabeen","Janna Omeliyanenko","Wen Zhang","Matteo Lissandrini","Russa Biswas","Gerard de Melo","Angela Bonifati","Edlira Vakaj","Mauro Dragoni","Damien Graux"],"pdf_url":"https://arxiv.org/pdf/2308.06374v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2208.01066v3","updated":"2023-08-11T19:27:58Z","published":"2022-08-01T18:01:40Z","title":"What Can Transformers Learn In-Context? A Case Study of Simple Function\n Classes","summary":" In-context learning refers to the ability of a model to condition on a prompt\nsequence consisting of in-context examples (input-output pairs corresponding to\nsome task) along with a new query input, and generate the corresponding output.\nCrucially, in-context learning happens only at inference time without any\nparameter updates to the model. While large language models such as GPT-3\nexhibit some ability to perform in-context learning, it is unclear what the\nrelationship is between tasks on which this succeeds and what is present in the\ntraining data. To make progress towards understanding in-context learning, we\nconsider the well-defined problem of training a model to in-context learn a\nfunction class (e.g., linear functions): that is, given data derived from some\nfunctions in the class, can we train a model to in-context learn \"most\"\nfunctions from this class? We show empirically that standard Transformers can\nbe trained from scratch to perform in-context learning of linear functions --\nthat is, the trained model is able to learn unseen linear functions from\nin-context examples with performance comparable to the optimal least squares\nestimator. In fact, in-context learning is possible even under two forms of\ndistribution shift: (i) between the training data of the model and\ninference-time prompts, and (ii) between the in-context examples and the query\ninput during inference. We also show that we can train Transformers to\nin-context learn more complex function classes -- namely sparse linear\nfunctions, two-layer neural networks, and decision trees -- with performance\nthat matches or exceeds task-specific learning algorithms. Our code and models\nare available at https://github.com/dtsip/in-context-learning .\n","authors":["Shivam Garg","Dimitris Tsipras","Percy Liang","Gregory Valiant"],"pdf_url":"https://arxiv.org/pdf/2208.01066v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04566v2","updated":"2023-08-11T19:21:45Z","published":"2023-08-08T20:29:13Z","title":"Single-Sentence Reader: A Novel Approach for Addressing Answer Position\n Bias","summary":" Machine Reading Comprehension (MRC) models tend to take advantage of spurious\ncorrelations (also known as dataset bias or annotation artifacts in the\nresearch community). Consequently, these models may perform the MRC task\nwithout fully comprehending the given context and question, which is\nundesirable since it may result in low robustness against distribution shift.\nThis paper delves into the concept of answer-position bias, where a significant\npercentage of training questions have answers located solely in the first\nsentence of the context. 
We propose a Single-Sentence Reader as a new approach\nfor addressing answer position bias in MRC. We implement this approach using\nsix different models and thoroughly analyze their performance. Remarkably, our\nproposed Single-Sentence Readers achieve results that nearly match those of\nmodels trained on conventional training sets, proving their effectiveness. Our\nstudy also discusses several challenges our Single-Sentence Readers encounter\nand proposes a potential solution.\n","authors":["Son Quoc Tran","Matt Kretchmar"],"pdf_url":"https://arxiv.org/pdf/2308.04566v2.pdf","comment":"11 pages, 5 tables, 2 figures"},{"id":"http://arxiv.org/abs/2308.06354v1","updated":"2023-08-11T19:18:35Z","published":"2023-08-11T19:18:35Z","title":"Large Language Models to Identify Social Determinants of Health in\n Electronic Health Records","summary":" Social determinants of health (SDoH) have an important impact on patient\noutcomes but are incompletely collected from the electronic health records\n(EHR). This study researched the ability of large language models to extract\nSDoH from free text in EHRs, where they are most commonly documented, and\nexplored the role of synthetic clinical text for improving the extraction of\nthese scarcely documented, yet extremely valuable, clinical data. 800 patient\nnotes were annotated for SDoH categories, and several transformer-based models\nwere evaluated. The study also experimented with synthetic data generation and\nassessed for algorithmic bias. Our best-performing models were fine-tuned\nFlan-T5 XL (macro-F1 0.71) for any SDoH, and Flan-T5 XXL (macro-F1 0.70). The\nbenefit of augmenting fine-tuning with synthetic data varied across model\narchitecture and size, with smaller Flan-T5 models (base and large) showing the\ngreatest improvements in performance (delta F1 +0.12 to +0.23). Model\nperformance was similar on the in-hospital system dataset but worse on the\nMIMIC-III dataset. Our best-performing fine-tuned models outperformed zero- and\nfew-shot performance of ChatGPT-family models for both tasks. These fine-tuned\nmodels were less likely than ChatGPT to change their prediction when\nrace/ethnicity and gender descriptors were added to the text, suggesting less\nalgorithmic bias (p<0.05). At the patient level, our models identified 93.8% of\npatients with adverse SDoH, while ICD-10 codes captured 2.0%. Our method can\neffectively extract SDoH information from clinic notes, performing better\ncompared to GPT zero- and few-shot settings. These models could enhance\nreal-world evidence on SDoH and aid in identifying patients needing social\nsupport.\n","authors":["Marco Guevara","Shan Chen","Spencer Thomas","Tafadzwa L. Chaunzwa","Idalid Franco","Benjamin Kann","Shalini Moningi","Jack Qian","Madeleine Goldstein","Susan Harper","Hugo JWL Aerts","Guergana K. Savova","Raymond H. Mak","Danielle S. Bitterman"],"pdf_url":"https://arxiv.org/pdf/2308.06354v1.pdf","comment":"38 pages, 5 figures, 5 tables in main, submitted for review"},{"id":"http://arxiv.org/abs/2308.06327v1","updated":"2023-08-11T18:06:33Z","published":"2023-08-11T18:06:33Z","title":"Bilingual Streaming ASR with Grapheme units and Auxiliary Monolingual\n Loss","summary":" We introduce a bilingual solution to support English as a secondary locale for\nmost primary locales in hybrid automatic speech recognition (ASR) settings. 
Our\nkey developments constitute: (a) pronunciation lexicon with grapheme units\ninstead of phone units, (b) a fully bilingual alignment model and subsequently\nbilingual streaming transformer model, (c) a parallel encoder structure with\nlanguage identification (LID) loss, (d) parallel encoder with an auxiliary loss\nfor monolingual projections. We conclude that in comparison to LID loss, our\nproposed auxiliary loss is superior in specializing the parallel encoders to\nrespective monolingual locales, and that contributes to stronger bilingual\nlearning. We evaluate our work on large-scale training and test tasks for\nbilingual Spanish (ES) and bilingual Italian (IT) applications. Our bilingual\nmodels demonstrate strong English code-mixing capability. In particular, the\nbilingual IT model improves the word error rate (WER) for a code-mix IT task\nfrom 46.5% to 13.8%, while also achieving a close parity (9.6%) with the\nmonolingual IT model (9.5%) over IT tests.\n","authors":["Mohammad Soleymanpour","Mahmoud Al Ismail","Fahimeh Bahmaninezhad","Kshitiz Kumar","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2308.06327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06599v2","updated":"2023-08-11T08:18:50Z","published":"2023-05-11T06:43:37Z","title":"Structured Chain-of-Thought Prompting for Code Generation","summary":" Large Language Models (LLMs) (e.g., ChatGPT) have shown impressive\nperformance in code generation. LLMs take prompts as inputs, and\nChain-of-Thought (CoT) prompting is the state-of-the-art prompting technique.\nCoT prompting asks LLMs first to generate CoTs (i.e., intermediate natural\nlanguage reasoning steps) and then output the code. However, CoT prompting is\ndesigned for natural language generation and has low accuracy in code\ngeneration.\n In this paper, we propose Structured CoTs (SCoTs) and present a novel\nprompting technique for code generation, named SCoT prompting. Our motivation\nis source code contains rich structural information and any code can be\ncomposed of three program structures (i.e., sequence, branch, and loop\nstructures). Intuitively, structured intermediate reasoning steps make for\nstructured source code. Thus, we ask LLMs to use program structures to build\nCoTs, obtaining SCoTs. Then, LLMs generate the final code based on SCoTs.\nCompared to CoT prompting, SCoT prompting explicitly constrains LLMs to think\nabout how to solve requirements from the view of source code and further the\nperformance of LLMs in code generation. We apply SCoT prompting to two LLMs\n(i.e., ChatGPT and Codex) and evaluate it on three benchmarks (i.e., HumanEval,\nMBPP, and MBCPP). (1) SCoT prompting outperforms the state-of-the-art baseline\n- CoT prompting by up to 13.79% in Pass@1. (2) Human evaluation shows human\ndevelopers prefer programs from SCoT prompting. (3) SCoT prompting is robust to\nexamples and achieves substantial improvements.\n","authors":["Jia Allen Li","Ge Li","Yongmin Li","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2305.06599v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.17780"},{"id":"http://arxiv.org/abs/2308.07336v1","updated":"2023-08-11T13:15:35Z","published":"2023-08-11T13:15:35Z","title":"Learning Deductive Reasoning from Synthetic Corpus based on Formal Logic","summary":" We study a synthetic corpus-based approach for language models (LMs) to\nacquire logical deductive reasoning ability. The previous studies generated\ndeduction examples using specific sets of deduction rules. 
However, these rules\nwere limited or otherwise arbitrary. This can limit the generalizability of\nacquired deductive reasoning ability. We rethink this and adopt a well-grounded\nset of deduction rules based on formal logic theory, which can derive any other\ndeduction rules when combined in a multistep way. We empirically verify that\nLMs trained on the proposed corpora, which we name $\\textbf{FLD}$\n($\\textbf{F}$ormal $\\textbf{L}$ogic $\\textbf{D}$eduction), acquire more\ngeneralizable deductive reasoning ability. Furthermore, we identify the aspects\nof deductive reasoning ability on which deduction corpora can enhance LMs and\nthose on which they cannot. Finally, on the basis of these results, we discuss\nthe future directions for applying deduction corpora or other approaches for\neach aspect. We release the code, data, and models.\n","authors":["Terufumi Morishita","Gaku Morio","Atsuki Yamaguchi","Yasuhiro Sogawa"],"pdf_url":"https://arxiv.org/pdf/2308.07336v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2212.06817v2","updated":"2023-08-11T17:45:27Z","published":"2022-12-13T18:55:15Z","title":"RT-1: Robotics Transformer for Real-World Control at Scale","summary":" By transferring knowledge from large, diverse, task-agnostic datasets, modern\nmachine learning models can solve specific downstream tasks either zero-shot or\nwith small task-specific datasets to a high level of performance. While this\ncapability has been demonstrated in other fields such as computer vision,\nnatural language processing or speech recognition, it remains to be shown in\nrobotics, where the generalization capabilities of the models are particularly\ncritical due to the difficulty of collecting real-world robotic data. We argue\nthat one of the keys to the success of such general robotic models lies with\nopen-ended task-agnostic training, combined with high-capacity architectures\nthat can absorb all of the diverse, robotic data. In this paper, we present a\nmodel class, dubbed Robotics Transformer, that exhibits promising scalable\nmodel properties. We verify our conclusions in a study of different model\nclasses and their ability to generalize as a function of the data size, model\nsize, and data diversity based on a large-scale data collection on real robots\nperforming real-world tasks. 
The project's website and videos can be found at\nrobotics-transformer1.github.io\n","authors":["Anthony Brohan","Noah Brown","Justice Carbajal","Yevgen Chebotar","Joseph Dabis","Chelsea Finn","Keerthana Gopalakrishnan","Karol Hausman","Alex Herzog","Jasmine Hsu","Julian Ibarz","Brian Ichter","Alex Irpan","Tomas Jackson","Sally Jesmonth","Nikhil J Joshi","Ryan Julian","Dmitry Kalashnikov","Yuheng Kuang","Isabel Leal","Kuang-Huei Lee","Sergey Levine","Yao Lu","Utsav Malla","Deeksha Manjunath","Igor Mordatch","Ofir Nachum","Carolina Parada","Jodilyn Peralta","Emily Perez","Karl Pertsch","Jornell Quiambao","Kanishka Rao","Michael Ryoo","Grecia Salazar","Pannag Sanketi","Kevin Sayed","Jaspiar Singh","Sumedh Sontakke","Austin Stone","Clayton Tan","Huong Tran","Vincent Vanhoucke","Steve Vega","Quan Vuong","Fei Xia","Ted Xiao","Peng Xu","Sichun Xu","Tianhe Yu","Brianna Zitkovich"],"pdf_url":"https://arxiv.org/pdf/2212.06817v2.pdf","comment":"See website at robotics-transformer1.github.io"},{"id":"http://arxiv.org/abs/2308.06248v1","updated":"2023-08-11T17:29:02Z","published":"2023-08-11T17:29:02Z","title":"FunnyBirds: A Synthetic Vision Dataset for a Part-Based Analysis of\n Explainable AI Methods","summary":" The field of explainable artificial intelligence (XAI) aims to uncover the\ninner workings of complex deep neural models. While being crucial for\nsafety-critical domains, XAI inherently lacks ground-truth explanations, making\nits automatic evaluation an unsolved problem. We address this challenge by\nproposing a novel synthetic vision dataset, named FunnyBirds, and accompanying\nautomatic evaluation protocols. Our dataset allows performing semantically\nmeaningful image interventions, e.g., removing individual object parts, which\nhas three important implications. First, it enables analyzing explanations on a\npart level, which is closer to human comprehension than existing methods that\nevaluate on a pixel level. Second, by comparing the model output for inputs\nwith removed parts, we can estimate ground-truth part importances that should\nbe reflected in the explanations. Third, by mapping individual explanations\ninto a common space of part importances, we can analyze a variety of different\nexplanation types in a single common framework. Using our tools, we report\nresults for 24 different combinations of neural models and XAI methods,\ndemonstrating the strengths and weaknesses of the assessed methods in a fully\nautomatic and systematic manner.\n","authors":["Robin Hesse","Simone Schaub-Meyer","Stefan Roth"],"pdf_url":"https://arxiv.org/pdf/2308.06248v1.pdf","comment":"Accepted at ICCV 2023. Code: https://github.com/visinf/funnybirds"},{"id":"http://arxiv.org/abs/2305.20048v3","updated":"2023-08-11T17:26:42Z","published":"2023-05-31T17:21:58Z","title":"F?D: On understanding the role of deep feature spaces on face generation\n evaluation","summary":" Perceptual metrics, like the Fr\\'echet Inception Distance (FID), are widely\nused to assess the similarity between synthetically generated and ground truth\n(real) images. The key idea behind these metrics is to compute errors in a deep\nfeature space that captures perceptually and semantically rich image features.\nDespite their popularity, the effect that different deep features and their\ndesign choices have on a perceptual metric has not been well studied. 
In this\nwork, we perform a causal analysis linking differences in semantic attributes\nand distortions between face image distributions to Fr\\'echet distances (FD)\nusing several popular deep feature spaces. A key component of our analysis is\nthe creation of synthetic counterfactual faces using deep face generators. Our\nexperiments show that the FD is heavily influenced by its feature space's\ntraining dataset and objective function. For example, FD using features\nextracted from ImageNet-trained models heavily emphasize hats over regions like\nthe eyes and mouth. Moreover, FD using features from a face gender classifier\nemphasize hair length more than distances in an identity (recognition) feature\nspace. Finally, we evaluate several popular face generation models across\nfeature spaces and find that StyleGAN2 consistently ranks higher than other\nface generators, except with respect to identity (recognition) features. This\nsuggests the need for considering multiple feature spaces when evaluating\ngenerative models and using feature spaces that are tuned to nuances of the\ndomain of interest.\n","authors":["Krish Kabra","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2305.20048v3.pdf","comment":"Code and dataset to be released soon"},{"id":"http://arxiv.org/abs/2308.06217v1","updated":"2023-08-11T16:37:31Z","published":"2023-08-11T16:37:31Z","title":"Continual Face Forgery Detection via Historical Distribution Preserving","summary":" Face forgery techniques have advanced rapidly and pose serious security\nthreats. Existing face forgery detection methods try to learn generalizable\nfeatures, but they still fall short of practical application. Additionally,\nfinetuning these methods on historical training data is resource-intensive in\nterms of time and storage. In this paper, we focus on a novel and challenging\nproblem: Continual Face Forgery Detection (CFFD), which aims to efficiently\nlearn from new forgery attacks without forgetting previous ones. Specifically,\nwe propose a Historical Distribution Preserving (HDP) framework that reserves\nand preserves the distributions of historical faces. To achieve this, we use\nuniversal adversarial perturbation (UAP) to simulate historical forgery\ndistribution, and knowledge distillation to maintain the distribution variation\nof real faces across different models. We also construct a new benchmark for\nCFFD with three evaluation protocols. Our extensive experiments on the\nbenchmarks show that our method outperforms the state-of-the-art competitors.\n","authors":["Ke Sun","Shen Chen","Taiping Yao","Xiaoshuai Sun","Shouhong Ding","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.06217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06202v1","updated":"2023-08-11T15:57:45Z","published":"2023-08-11T15:57:45Z","title":"Exploring Predicate Visual Context in Detecting of Human-Object\n Interactions","summary":" Recently, the DETR framework has emerged as the dominant approach for\nhuman--object interaction (HOI) research. In particular, two-stage\ntransformer-based HOI detectors are amongst the most performant and\ntraining-efficient approaches. However, these often condition HOI\nclassification on object features that lack fine-grained contextual\ninformation, eschewing pose and orientation information in favour of visual\ncues about object identity and box extremities. This naturally hinders the\nrecognition of complex or ambiguous interactions. 
In this work, we study these\nissues through visualisations and carefully designed experiments. Accordingly,\nwe investigate how best to re-introduce image features via cross-attention.\nWith an improved query design, extensive exploration of keys and values, and\nbox pair positional embeddings as spatial guidance, our model with enhanced\npredicate visual context (PViC) outperforms state-of-the-art methods on the\nHICO-DET and V-COCO benchmarks, while maintaining low training cost.\n","authors":["Frederic Z. Zhang","Yuhui Yuan","Dylan Campbell","Zhuoyao Zhong","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2308.06202v1.pdf","comment":"To appear in ICCV2023"},{"id":"http://arxiv.org/abs/2303.06628v2","updated":"2023-08-11T15:56:32Z","published":"2023-03-12T10:28:07Z","title":"Preventing Zero-Shot Transfer Degradation in Continual Learning of\n Vision-Language Models","summary":" Continual learning (CL) can help pre-trained vision-language models\nefficiently adapt to new or under-trained data distributions without\nre-training. Nevertheless, during the continual training of the Contrastive\nLanguage-Image Pre-training (CLIP) model, we observe that the model's zero-shot\ntransfer ability significantly degrades due to catastrophic forgetting.\nExisting CL methods can mitigate forgetting by replaying previous data.\nHowever, since the CLIP dataset is private, replay methods cannot access the\npre-training dataset. In addition, replaying data of previously learned\ndownstream tasks can enhance their performance but comes at the cost of\nsacrificing zero-shot performance. To address this challenge, we propose a\nnovel method ZSCL to prevent zero-shot transfer degradation in the continual\nlearning of vision-language models in both feature and parameter space. In the\nfeature space, a reference dataset is introduced for distillation between the\ncurrent and initial models. The reference dataset should have semantic\ndiversity but no need to be labeled, seen in pre-training, or matched\nimage-text pairs. In parameter space, we prevent a large parameter shift by\naveraging weights during the training. We propose a more challenging\nMulti-domain Task Incremental Learning (MTIL) benchmark to evaluate different\nmethods, where tasks are from various domains instead of class-separated in a\nsingle dataset. Our method outperforms other methods in the traditional\nclass-incremental learning setting and the MTIL by 9.7% average score. Our code\nlocates at https://github.com/Thunderbeee/ZSCL.\n","authors":["Zangwei Zheng","Mingyuan Ma","Kai Wang","Ziheng Qin","Xiangyu Yue","Yang You"],"pdf_url":"https://arxiv.org/pdf/2303.06628v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.06198v1","updated":"2023-08-11T15:43:37Z","published":"2023-08-11T15:43:37Z","title":"DIG In: Evaluating Disparities in Image Generations with Indicators for\n Geographic Diversity","summary":" The unprecedented photorealistic results achieved by recent text-to-image\ngenerative systems and their increasing use as plug-and-play content creation\nsolutions make it crucial to understand their potential biases. In this work,\nwe introduce three indicators to evaluate the realism, diversity and\nprompt-generation consistency of text-to-image generative systems when prompted\nto generate objects from across the world. 
Our indicators complement\nqualitative analysis of the broader impact of such systems by enabling\nautomatic and efficient benchmarking of geographic disparities, an important\nstep towards building responsible visual content creation systems. We use our\nproposed indicators to analyze potential geographic biases in state-of-the-art\nvisual content creation systems and find that: (1) models have less realism and\ndiversity of generations when prompting for Africa and West Asia than Europe,\n(2) prompting with geographic information comes at a cost to prompt-consistency\nand diversity of generated images, and (3) models exhibit more region-level\ndisparities for some objects than others. Perhaps most interestingly, our\nindicators suggest that progress in image generation quality has come at the\ncost of real-world geographic representation. Our comprehensive evaluation\nconstitutes a crucial step towards ensuring a positive experience of visual\ncontent creation for everyone.\n","authors":["Melissa Hall","Candace Ross","Adina Williams","Nicolas Carion","Michal Drozdzal","Adriana Romero Soriano"],"pdf_url":"https://arxiv.org/pdf/2308.06198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06197v1","updated":"2023-08-11T15:42:48Z","published":"2023-08-11T15:42:48Z","title":"Complex Facial Expression Recognition Using Deep Knowledge Distillation\n of Basic Features","summary":" Complex emotion recognition is a cognitive task that has so far eluded the\nsame excellent performance of other tasks that are at or above the level of\nhuman cognition. Emotion recognition through facial expressions is particularly\ndifficult due to the complexity of emotions expressed by the human face. For a\nmachine to approach the same level of performance in this domain as a human, it\nmay need to synthesise knowledge and understand new concepts in real-time as\nhumans do. Humans are able to learn new concepts using only few examples, by\ndistilling the important information from memories and discarding the rest.\nSimilarly, continual learning methods learn new classes whilst retaining the\nknowledge of known classes, whilst few-shot learning methods are able to learn\nnew classes using very few training examples. We propose a novel continual\nlearning method inspired by human cognition and learning that can accurately\nrecognise new compound expression classes using few training samples, by\nbuilding on and retaining its knowledge of basic expression classes. Using\nGradCAM visualisations, we demonstrate the relationship between basic and\ncompound facial expressions, which our method leverages through knowledge\ndistillation and a novel Predictive Sorting Memory Replay. Our method achieves\nthe current state-of-the-art in continual learning for complex facial\nexpression recognition with 74.28% Overall Accuracy on new classes. We also\ndemonstrate that using continual learning for complex facial expression\nrecognition achieves far better performance than non-continual learning\nmethods, improving on state-of-the-art non-continual learning methods by\n13.95%. To the best of our knowledge, our work is also the first to apply\nfew-shot learning to complex facial expression recognition, achieving the\nstate-of-the-art with 100% accuracy using a single training sample for each\nexpression class.\n","authors":["Angus Maiden","Bahareh Nakisa"],"pdf_url":"https://arxiv.org/pdf/2308.06197v1.pdf","comment":"17 pages, 9 figures, 6 tables. 
Code available at\n https://github.com/AngusMaiden/complex-FER"},{"id":"http://arxiv.org/abs/2304.09466v2","updated":"2023-08-11T15:30:29Z","published":"2023-04-19T07:27:21Z","title":"MAMAF-Net: Motion-Aware and Multi-Attention Fusion Network for Stroke\n Diagnosis","summary":" Stroke is a major cause of mortality and disability worldwide from which one\nin four people are in danger of incurring in their lifetime. The pre-hospital\nstroke assessment plays a vital role in identifying stroke patients accurately\nto accelerate further examination and treatment in hospitals. Accordingly, the\nNational Institutes of Health Stroke Scale (NIHSS), Cincinnati Pre-hospital\nStroke Scale (CPSS) and Face Arm Speed Time (F.A.S.T.) are globally known tests\nfor stroke assessment. However, the validity of these tests is skeptical in the\nabsence of neurologists and access to healthcare may be limited. Therefore, in\nthis study, we propose a motion-aware and multi-attention fusion network\n(MAMAF-Net) that can detect stroke from multimodal examination videos. Contrary\nto other studies on stroke detection from video analysis, our study for the\nfirst time proposes an end-to-end solution from multiple video recordings of\neach subject with a dataset encapsulating stroke, transient ischemic attack\n(TIA), and healthy controls. The proposed MAMAF-Net consists of motion-aware\nmodules to sense the mobility of patients, attention modules to fuse the\nmulti-input video data, and 3D convolutional layers to perform diagnosis from\nthe attention-based extracted features. Experimental results over the collected\nStroke-data dataset show that the proposed MAMAF-Net achieves a successful\ndetection of stroke with 93.62% sensitivity and 95.33% AUC score.\n","authors":["Aysen Degerli","Pekka Jakala","Juha Pajula","Milla Immonen","Miguel Bordallo Lopez"],"pdf_url":"https://arxiv.org/pdf/2304.09466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06024v2","updated":"2023-08-11T15:23:21Z","published":"2023-05-10T10:19:31Z","title":"Larger is not Better: A Survey on the Robustness of Computer Vision\n Models against Common Corruptions","summary":" The performance of computer vision models are susceptible to unexpected\nchanges in input images, known as common corruptions (e.g. noise, blur,\nillumination changes, etc.), that can hinder their reliability when deployed in\nreal scenarios. These corruptions are not always considered to test model\ngeneralization and robustness. In this survey, we present a comprehensive\noverview of methods that improve the robustness of computer vision models\nagainst common corruptions. We categorize methods into four groups based on the\nmodel part and training method addressed: data augmentation, representation\nlearning, knowledge distillation, and network components. We also cover\nindirect methods for generalization and mitigation of shortcut learning,\npotentially useful for corruption robustness. We release a unified benchmark\nframework to compare robustness performance on several datasets, and address\nthe inconsistencies of evaluation in the literature. We provide an experimental\noverview of the base corruption robustness of popular vision backbones, and\nshow that corruption robustness does not necessarily scale with model size. The\nvery large models (above 100M parameters) gain negligible robustness,\nconsidering the increased computational requirements. 
To achieve generalizable\nand robust computer vision models, we foresee the need of developing new\nlearning strategies to efficiently exploit limited data and mitigate unwanted\nor unreliable learning behaviors.\n","authors":["Shunxin Wang","Raymond Veldhuis","Christoph Brune","Nicola Strisciuglio"],"pdf_url":"https://arxiv.org/pdf/2305.06024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06173v1","updated":"2023-08-11T15:02:19Z","published":"2023-08-11T15:02:19Z","title":"Physical Adversarial Attacks For Camera-based Smart Systems: Current\n Trends, Categorization, Applications, Research Challenges, and Future Outlook","summary":" In this paper, we present a comprehensive survey of the current trends\nfocusing specifically on physical adversarial attacks. We aim to provide a\nthorough understanding of the concept of physical adversarial attacks,\nanalyzing their key characteristics and distinguishing features. Furthermore,\nwe explore the specific requirements and challenges associated with executing\nattacks in the physical world. Our article delves into various physical\nadversarial attack methods, categorized according to their target tasks in\ndifferent applications, including classification, detection, face recognition,\nsemantic segmentation and depth estimation. We assess the performance of these\nattack methods in terms of their effectiveness, stealthiness, and robustness.\nWe examine how each technique strives to ensure the successful manipulation of\nDNNs while mitigating the risk of detection and withstanding real-world\ndistortions. Lastly, we discuss the current challenges and outline potential\nfuture research directions in the field of physical adversarial attacks. We\nhighlight the need for enhanced defense mechanisms, the exploration of novel\nattack strategies, the evaluation of attacks in different application domains,\nand the establishment of standardized benchmarks and evaluation criteria for\nphysical adversarial attacks. Through this comprehensive survey, we aim to\nprovide a valuable resource for researchers, practitioners, and policymakers to\ngain a holistic understanding of physical adversarial attacks in computer\nvision and facilitate the development of robust and secure DNN-based systems.\n","authors":["Amira Guesmi","Muhammad Abdullah Hanif","Bassem Ouni","Muhammed Shafique"],"pdf_url":"https://arxiv.org/pdf/2308.06173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06161v1","updated":"2023-08-11T14:38:51Z","published":"2023-08-11T14:38:51Z","title":"Rethinking the Localization in Weakly Supervised Object Localization","summary":" Weakly supervised object localization (WSOL) is one of the most popular and\nchallenging tasks in computer vision. This task is to localize the objects in\nthe images given only the image-level supervision. Recently, dividing WSOL into\ntwo parts (class-agnostic object localization and object classification) has\nbecome the state-of-the-art pipeline for this task. However, existing solutions\nunder this pipeline usually suffer from the following drawbacks: 1) they are\nnot flexible since they can only localize one object for each image due to the\nadopted single-class regression (SCR) for localization; 2) the generated pseudo\nbounding boxes may be noisy, but the negative impact of such noise is not well\naddressed. 
To remedy these drawbacks, we first propose to replace SCR with a\nbinary-class detector (BCD) for localizing multiple objects, where the detector\nis trained by discriminating the foreground and background. Then we design a\nweighted entropy (WE) loss using the unlabeled data to reduce the negative\nimpact of noisy bounding boxes. Extensive experiments on the popular\nCUB-200-2011 and ImageNet-1K datasets demonstrate the effectiveness of our\nmethod.\n","authors":["Rui Xu","Yong Luo","Han Hu","Bo Du","Jialie Shen","Yonggang Wen"],"pdf_url":"https://arxiv.org/pdf/2308.06161v1.pdf","comment":"Accepted by ACM International Conference on Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.06160v1","updated":"2023-08-11T14:38:11Z","published":"2023-08-11T14:38:11Z","title":"DatasetDM: Synthesizing Data with Perception Annotations Using Diffusion\n Models","summary":" Current deep networks are very data-hungry and benefit from training on\nlargescale datasets, which are often time-consuming to collect and annotate. By\ncontrast, synthetic data can be generated infinitely using generative models\nsuch as DALL-E and diffusion models, with minimal effort and cost. In this\npaper, we present DatasetDM, a generic dataset generation model that can\nproduce diverse synthetic images and the corresponding high-quality perception\nannotations (e.g., segmentation masks, and depth). Our method builds upon the\npre-trained diffusion model and extends text-guided image synthesis to\nperception data generation. We show that the rich latent code of the diffusion\nmodel can be effectively decoded as accurate perception annotations using a\ndecoder module. Training the decoder only needs less than 1% (around 100\nimages) manually labeled images, enabling the generation of an infinitely large\nannotated dataset. Then these synthetic data can be used for training various\nperception models for downstream tasks. To showcase the power of the proposed\napproach, we generate datasets with rich dense pixel-wise labels for a wide\nrange of downstream tasks, including semantic segmentation, instance\nsegmentation, and depth estimation. Notably, it achieves 1) state-of-the-art\nresults on semantic segmentation and instance segmentation; 2) significantly\nmore robust on domain generalization than using the real data alone; and\nstate-of-the-art results in zero-shot segmentation setting; and 3) flexibility\nfor efficient application and novel task composition (e.g., image editing). The\nproject website and code can be found at\nhttps://weijiawu.github.io/DatasetDM_page/ and\nhttps://github.com/showlab/DatasetDM, respectively\n","authors":["Weijia Wu","Yuzhong Zhao","Hao Chen","Yuchao Gu","Rui Zhao","Yefei He","Hong Zhou","Mike Zheng Shou","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2308.06160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06147v1","updated":"2023-08-11T14:24:03Z","published":"2023-08-11T14:24:03Z","title":"Efficient Large-scale AUV-based Visual Seafloor Mapping","summary":" Driven by the increasing number of marine data science applications, there is\na growing interest in surveying and exploring the vast, uncharted terrain of\nthe deep sea with robotic platforms. Despite impressive results achieved by\nmany on-land visual mapping algorithms in the past decades, transferring these\nmethods from land to the deep sea remains a challenge due to harsh\nenvironmental conditions. 
Typically, deep-sea exploration involves the use of\nautonomous underwater vehicles (AUVs) equipped with high-resolution cameras and\nartificial illumination systems. However, images obtained in this manner often\nsuffer from heterogeneous illumination and quality degradation due to\nattenuation and scattering, on top of refraction of light rays. All of this\ntogether often lets on-land SLAM approaches fail underwater or makes\nStructure-from-Motion approaches drift or omit difficult images, resulting in\ngaps, jumps or weakly registered areas. In this work, we present a system that\nincorporates recent developments in underwater imaging and visual mapping to\nfacilitate automated robotic 3D reconstruction of hectares of seafloor. Our\napproach is efficient in that it detects and reconsiders difficult, weakly\nregistered areas, to avoid omitting images and to make better use of limited\ndive time; on the other hand it is computationally efficient; leveraging a\nhybrid approach combining benefits from SLAM and Structure-from-Motion that\nruns much faster than incremental reconstructions while achieving at least\non-par performance. The proposed system has been extensively tested and\nevaluated during several research cruises, demonstrating its robustness and\npracticality in real-world conditions.\n","authors":["Mengkun She","Yifan Song","David Nakath","Kevin Köser"],"pdf_url":"https://arxiv.org/pdf/2308.06147v1.pdf","comment":"27 pages, 21 figures"},{"id":"http://arxiv.org/abs/2307.08106v3","updated":"2023-08-11T14:04:13Z","published":"2023-07-16T17:14:39Z","title":"Polarization Multi-Image Synthesis with Birefringent Metasurfaces","summary":" Optical metasurfaces composed of precisely engineered nanostructures have\ngained significant attention for their ability to manipulate light and\nimplement distinct functionalities based on the properties of the incident\nfield. Computational imaging systems have started harnessing this capability to\nproduce sets of coded measurements that benefit certain tasks when paired with\ndigital post-processing. Inspired by these works, we introduce a new system\nthat uses a birefringent metasurface with a polarizer-mosaicked photosensor to\ncapture four optically-coded measurements in a single exposure. We apply this\nsystem to the task of incoherent opto-electronic filtering, where digital\nspatial-filtering operations are replaced by simpler, per-pixel sums across the\nfour polarization channels, independent of the spatial filter size. In contrast\nto previous work on incoherent opto-electronic filtering that can realize only\none spatial filter, our approach can realize a continuous family of filters\nfrom a single capture, with filters being selected from the family by adjusting\nthe post-capture digital summation weights. To find a metasurface that can\nrealize a set of user-specified spatial filters, we introduce a form of\ngradient descent with a novel regularizer that encourages light efficiency and\na high signal-to-noise ratio. 
We demonstrate several examples in simulation and\nwith fabricated prototypes, including some with spatial filters that have\nprescribed variations with respect to depth and wavelength.\n Visit the Project Page at\nhttps://deanhazineh.github.io/publications/Multi_Image_Synthesis/MIS_Home.html\n","authors":["Dean Hazineh","Soon Wei Daniel Lim","Qi Guo","Federico Capasso","Todd Zickler"],"pdf_url":"https://arxiv.org/pdf/2307.08106v3.pdf","comment":"Published in the Proceedings of the 2023 IEEE International\n Conference of Computational Photography"},{"id":"http://arxiv.org/abs/2308.06142v1","updated":"2023-08-11T14:02:52Z","published":"2023-08-11T14:02:52Z","title":"CompTLL-UNet: Compressed Domain Text-Line Localization in Challenging\n Handwritten Documents using Deep Feature Learning from JPEG Coefficients","summary":" Automatic localization of text-lines in handwritten documents is still an\nopen and challenging research problem. Various writing issues such as uneven\nspacing between the lines, oscillating and touching text, and the presence of\nskew become much more challenging when the case of complex handwritten document\nimages are considered for segmentation directly in their respective compressed\nrepresentation. This is because, the conventional way of processing compressed\ndocuments is through decompression, but here in this paper, we propose an idea\nthat employs deep feature learning directly from the JPEG compressed\ncoefficients without full decompression to accomplish text-line localization in\nthe JPEG compressed domain. A modified U-Net architecture known as Compressed\nText-Line Localization Network (CompTLL-UNet) is designed to accomplish it. The\nmodel is trained and tested with JPEG compressed version of benchmark datasets\nincluding ICDAR2017 (cBAD) and ICDAR2019 (cBAD), reporting the state-of-the-art\nperformance with reduced storage and computational costs in the JPEG compressed\ndomain.\n","authors":["Bulla Rajesh","Sk Mahafuz Zaman","Mohammed Javed","P. Nagabhushan"],"pdf_url":"https://arxiv.org/pdf/2308.06142v1.pdf","comment":"Accepted in 7th Asian Conference on Pattern Recognition (ACPR 2023),\n 5-8 November 2023, Kitakyushu, Japan"},{"id":"http://arxiv.org/abs/2308.03409v2","updated":"2023-08-11T13:53:19Z","published":"2023-08-07T08:55:48Z","title":"DiT: Efficient Vision Transformers with Dynamic Token Routing","summary":" Recently, the tokens of images share the same static data flow in many dense\nnetworks. However, challenges arise from the variance among the objects in\nimages, such as large variations in the spatial scale and difficulties of\nrecognition for visual entities. In this paper, we propose a data-dependent\ntoken routing strategy to elaborate the routing paths of image tokens for\nDynamic Vision Transformer, dubbed DiT. The proposed framework generates a\ndata-dependent path per token, adapting to the object scales and visual\ndiscrimination of tokens. In feed-forward, the differentiable routing gates are\ndesigned to select the scaling paths and feature transformation paths for image\ntokens, leading to multi-path feature propagation. In this way, the impact of\nobject scales and visual discrimination of image representation can be\ncarefully tuned. Moreover, the computational cost can be further reduced by\ngiving budget constraints to the routing gate and early-stopping of feature\nextraction. 
In experiments, our DiT achieves superior performance and favorable\ncomplexity/accuracy trade-offs than many SoTA methods on ImageNet\nclassification, object detection, instance segmentation, and semantic\nsegmentation. Particularly, the DiT-B5 obtains 84.8\\% top-1 Acc on ImageNet\nwith 10.3 GFLOPs, which is 1.0\\% higher than that of the SoTA method with\nsimilar computational complexity. These extensive results demonstrate that DiT\ncan serve as versatile backbones for various vision tasks.\n","authors":["Yuchen Ma","Zhengcong Fei","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2308.03409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08115v2","updated":"2023-08-11T13:36:35Z","published":"2022-11-15T13:09:54Z","title":"Heatmap-based Out-of-Distribution Detection","summary":" Our work investigates out-of-distribution (OOD) detection as a neural network\noutput explanation problem. We learn a heatmap representation for detecting OOD\nimages while visualizing in- and out-of-distribution image regions at the same\ntime. Given a trained and fixed classifier, we train a decoder neural network\nto produce heatmaps with zero response for in-distribution samples and high\nresponse heatmaps for OOD samples, based on the classifier features and the\nclass prediction. Our main innovation lies in the heatmap definition for an OOD\nsample, as the normalized difference from the closest in-distribution sample.\nThe heatmap serves as a margin to distinguish between in- and\nout-of-distribution samples. Our approach generates the heatmaps not only for\nOOD detection, but also to indicate in- and out-of-distribution regions of the\ninput image. In our evaluations, our approach mostly outperforms the prior work\non fixed classifiers, trained on CIFAR-10, CIFAR-100 and Tiny ImageNet. The\ncode is publicly available at: https://github.com/jhornauer/heatmap_ood.\n","authors":["Julia Hornauer","Vasileios Belagiannis"],"pdf_url":"https://arxiv.org/pdf/2211.08115v2.pdf","comment":"Accepted to WACV 2023"},{"id":"http://arxiv.org/abs/2308.06129v1","updated":"2023-08-11T13:35:52Z","published":"2023-08-11T13:35:52Z","title":"Uncertainty Quantification for Image-based Traffic Prediction across\n Cities","summary":" Despite the strong predictive performance of deep learning models for traffic\nprediction, their widespread deployment in real-world intelligent\ntransportation systems has been restrained by a lack of interpretability.\nUncertainty quantification (UQ) methods provide an approach to induce\nprobabilistic reasoning, improve decision-making and enhance model deployment\npotential. To gain a comprehensive picture of the usefulness of existing UQ\nmethods for traffic prediction and the relation between obtained uncertainties\nand city-wide traffic dynamics, we investigate their application to a\nlarge-scale image-based traffic dataset spanning multiple cities and time\nperiods. We compare two epistemic and two aleatoric UQ methods on both temporal\nand spatio-temporal transfer tasks, and find that meaningful uncertainty\nestimates can be recovered. We further demonstrate how uncertainty estimates\ncan be employed for unsupervised outlier detection on changes in city traffic\ndynamics. We find that our approach can capture both temporal and spatial\neffects on traffic behaviour in a representative case study for the city of\nMoscow. 
Our work presents a further step towards boosting uncertainty awareness\nin traffic prediction tasks, and aims to highlight the value contribution of UQ\nmethods to a better understanding of city traffic dynamics.\n","authors":["Alexander Timans","Nina Wiedemann","Nishant Kumar","Ye Hong","Martin Raubal"],"pdf_url":"https://arxiv.org/pdf/2308.06129v1.pdf","comment":"39 pages, 22 figures. Code publicly available at:\n https://github.com/alextimans/traffic4cast-uncertainty"},{"id":"http://arxiv.org/abs/2012.01654v2","updated":"2023-08-11T12:57:04Z","published":"2020-12-03T02:26:01Z","title":"Towards Defending Multiple $\\ell_p$-norm Bounded Adversarial\n Perturbations via Gated Batch Normalization","summary":" There has been extensive evidence demonstrating that deep neural networks are\nvulnerable to adversarial examples, which motivates the development of defenses\nagainst adversarial attacks. Existing adversarial defenses typically improve\nmodel robustness against individual specific perturbation types (\\eg,\n$\\ell_{\\infty}$-norm bounded adversarial examples). However, adversaries are\nlikely to generate multiple types of perturbations in practice (\\eg, $\\ell_1$,\n$\\ell_2$, and $\\ell_{\\infty}$ perturbations). Some recent methods improve model\nrobustness against adversarial attacks in multiple $\\ell_p$ balls, but their\nperformance against each perturbation type is still far from satisfactory. In\nthis paper, we observe that different $\\ell_p$ bounded adversarial\nperturbations induce different statistical properties that can be separated and\ncharacterized by the statistics of Batch Normalization (BN). We thus propose\nGated Batch Normalization (GBN) to adversarially train a perturbation-invariant\npredictor for defending multiple $\\ell_p$ bounded adversarial perturbations.\nGBN consists of a multi-branch BN layer and a gated sub-network. Each BN branch\nin GBN is in charge of one perturbation type to ensure that the normalized\noutput is aligned towards learning perturbation-invariant representation.\nMeanwhile, the gated sub-network is designed to separate inputs added with\ndifferent perturbation types. We perform an extensive evaluation of our\napproach on commonly-used dataset including MNIST, CIFAR-10, and Tiny-ImageNet,\nand demonstrate that GBN outperforms previous defense proposals against\nmultiple perturbation types (\\ie, $\\ell_1$, $\\ell_2$, and $\\ell_{\\infty}$\nperturbations) by large margins.\n","authors":["Aishan Liu","Shiyu Tang","Xinyun Chen","Lei Huang","Haotong Qin","Xianglong Liu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2012.01654v2.pdf","comment":"Accepted on IJCV"},{"id":"http://arxiv.org/abs/2308.02525v2","updated":"2023-08-11T12:31:02Z","published":"2023-07-31T13:07:56Z","title":"Can Self-Supervised Representation Learning Methods Withstand\n Distribution Shifts and Corruptions?","summary":" Self-supervised learning in computer vision aims to leverage the inherent\nstructure and relationships within data to learn meaningful representations\nwithout explicit human annotation, enabling a holistic understanding of visual\nscenes. Robustness in vision machine learning ensures reliable and consistent\nperformance, enhancing generalization, adaptability, and resistance to noise,\nvariations, and adversarial attacks. Self-supervised paradigms, namely\ncontrastive learning, knowledge distillation, mutual information maximization,\nand clustering, have been considered to have shown advances in invariant\nlearning representations. 
This work investigates the robustness of learned\nrepresentations of self-supervised learning approaches focusing on distribution\nshifts and image corruptions in computer vision. Detailed experiments have been\nconducted to study the robustness of self-supervised learning methods on\ndistribution shifts and image corruptions. The empirical analysis demonstrates\na clear relationship between the performance of learned representations within\nself-supervised paradigms and the severity of distribution shifts and\ncorruptions. Notably, higher levels of shifts and corruptions are found to\nsignificantly diminish the robustness of the learned representations. These\nfindings highlight the critical impact of distribution shifts and image\ncorruptions on the performance and resilience of self-supervised learning\nmethods, emphasizing the need for effective strategies to mitigate their\nadverse effects. The study strongly advocates for future research in the field\nof self-supervised representation learning to prioritize the key aspects of\nsafety and robustness in order to ensure practical applicability. The source\ncode and results are available on GitHub.\n","authors":["Prakash Chandra Chhipa","Johan Rodahl Holmgren","Kanjar De","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2308.02525v2.pdf","comment":"Accepted at 2023 IEEE/CVF International Conference on Computer Vision\n Workshops (ICCVW). Corresponding author - prakash.chandra.chhipa@ltu.se"},{"id":"http://arxiv.org/abs/2303.15109v2","updated":"2023-08-11T12:27:42Z","published":"2023-03-27T11:26:34Z","title":"Improving the Transferability of Adversarial Examples via Direction\n Tuning","summary":" In transfer-based adversarial attacks, adversarial examples are only\ngenerated by the surrogate models and achieve effective perturbation in the\nvictim models. Although considerable efforts have been devoted to improving\nthe transferability of adversarial examples generated by transfer-based\nadversarial attacks, our investigation found that the large deviation between\nthe actual and steepest update directions of current transfer-based\nadversarial attacks is caused by the large update step length, so that the\ngenerated adversarial examples cannot converge well. However, directly\nreducing the update step length will lead to serious update oscillation so that\nthe generated adversarial examples also cannot achieve great transferability\nto the victim models. To address these issues, a novel transfer-based attack,\nnamely the direction tuning attack, is proposed to not only decrease the update\ndeviation in the large step length, but also mitigate the update oscillation in\nthe small sampling step length, thereby making the generated adversarial\nexamples converge well to achieve great transferability on victim models. In\naddition, a network pruning method is proposed to smooth the decision boundary,\nthereby further decreasing the update oscillation and enhancing the\ntransferability of the generated adversarial examples. 
The experiment results\non ImageNet demonstrate that the average attack success rate (ASR) of the\nadversarial examples generated by our method can be improved from 87.9\\% to\n94.5\\% on five victim models without defenses, and from 69.1\\% to 76.2\\% on\neight advanced defense methods, in comparison with that of latest\ngradient-based attacks.\n","authors":["Xiangyuan Yang","Jie Lin","Hanlin Zhang","Xinyu Yang","Peng Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.15109v2.pdf","comment":"Accepted by INS 2023"},{"id":"http://arxiv.org/abs/2308.06101v1","updated":"2023-08-11T12:23:09Z","published":"2023-08-11T12:23:09Z","title":"Taming the Power of Diffusion Models for High-Quality Virtual Try-On\n with Appearance Flow","summary":" Virtual try-on is a critical image synthesis task that aims to transfer\nclothes from one image to another while preserving the details of both humans\nand clothes. While many existing methods rely on Generative Adversarial\nNetworks (GANs) to achieve this, flaws can still occur, particularly at high\nresolutions. Recently, the diffusion model has emerged as a promising\nalternative for generating high-quality images in various applications.\nHowever, simply using clothes as a condition for guiding the diffusion model to\ninpaint is insufficient to maintain the details of the clothes. To overcome\nthis challenge, we propose an exemplar-based inpainting approach that leverages\na warping module to guide the diffusion model's generation effectively. The\nwarping module performs initial processing on the clothes, which helps to\npreserve the local details of the clothes. We then combine the warped clothes\nwith clothes-agnostic person image and add noise as the input of diffusion\nmodel. Additionally, the warped clothes is used as local conditions for each\ndenoising process to ensure that the resulting output retains as much detail as\npossible. Our approach, namely Diffusion-based Conditional Inpainting for\nVirtual Try-ON (DCI-VTON), effectively utilizes the power of the diffusion\nmodel, and the incorporation of the warping module helps to produce\nhigh-quality and realistic virtual try-on results. Experimental results on\nVITON-HD demonstrate the effectiveness and superiority of our method.\n","authors":["Junhong Gou","Siyu Sun","Jianfu Zhang","Jianlou Si","Chen Qian","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.06101v1.pdf","comment":"Accepted by ACMMM 2023"},{"id":"http://arxiv.org/abs/2308.06100v1","updated":"2023-08-11T12:22:37Z","published":"2023-08-11T12:22:37Z","title":"Diffusion-based Visual Counterfactual Explanations -- Towards Systematic\n Quantitative Evaluation","summary":" Latest methods for visual counterfactual explanations (VCE) harness the power\nof deep generative models to synthesize new examples of high-dimensional images\nof impressive quality. However, it is currently difficult to compare the\nperformance of these VCE methods as the evaluation procedures largely vary and\noften boil down to visual inspection of individual examples and small scale\nuser studies. In this work, we propose a framework for systematic, quantitative\nevaluation of the VCE methods and a minimal set of metrics to be used. We use\nthis framework to explore the effects of certain crucial design choices in the\nlatest diffusion-based generative models for VCEs of natural image\nclassification (ImageNet). We conduct a battery of ablation-like experiments,\ngenerating thousands of VCEs for a suite of classifiers of various complexity,\naccuracy and robustness. 
Our findings suggest multiple directions for future\nadvancements and improvements of VCE methods. By sharing our methodology and\nour approach to tackle the computational challenges of such a study on a\nlimited hardware setup (including the complete code base), we offer a valuable\nguidance for researchers in the field fostering consistency and transparency in\nthe assessment of counterfactual explanations.\n","authors":["Philipp Vaeth","Alexander M. Fruehwald","Benjamin Paassen","Magda Gregorova"],"pdf_url":"https://arxiv.org/pdf/2308.06100v1.pdf","comment":"Accepted at the 5th International Workshop on eXplainable Knowledge\n Discovery in Data Mining @ ECML 2023"},{"id":"http://arxiv.org/abs/2306.04542v2","updated":"2023-08-11T12:20:50Z","published":"2023-06-07T15:46:47Z","title":"On the Design Fundamentals of Diffusion Models: A Survey","summary":" Diffusion models are generative models, which gradually add and remove noise\nto learn the underlying distribution of training data for data generation. The\ncomponents of diffusion models have gained significant attention with many\ndesign choices proposed. Existing reviews have primarily focused on\nhigher-level solutions, thereby covering less on the design fundamentals of\ncomponents. This study seeks to address this gap by providing a comprehensive\nand coherent review on component-wise design choices in diffusion models.\nSpecifically, we organize this review according to their three key components,\nnamely the forward process, the reverse process, and the sampling procedure.\nThis allows us to provide a fine-grained perspective of diffusion models,\nbenefiting future studies in the analysis of individual components, the\napplicability of design choices, and the implementation of diffusion models.\n","authors":["Ziyi Chang","George Alex Koulieris","Hubert P. H. Shum"],"pdf_url":"https://arxiv.org/pdf/2306.04542v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06098v1","updated":"2023-08-11T12:18:53Z","published":"2023-08-11T12:18:53Z","title":"Automated Construction of Time-Space Diagrams for Traffic Analysis Using\n Street-View Video Sequence","summary":" Time-space diagrams are essential tools for analyzing traffic patterns and\noptimizing transportation infrastructure and traffic management strategies.\nTraditional data collection methods for these diagrams have limitations in\nterms of temporal and spatial coverage. Recent advancements in camera\ntechnology have overcome these limitations and provided extensive urban data.\nIn this study, we propose an innovative approach to constructing time-space\ndiagrams by utilizing street-view video sequences captured by cameras mounted\non moving vehicles. Using the state-of-the-art YOLOv5, StrongSORT, and\nphotogrammetry techniques for distance calculation, we can infer vehicle\ntrajectories from the video data and generate time-space diagrams. To evaluate\nthe effectiveness of our proposed method, we utilized datasets from the KITTI\ncomputer vision benchmark suite. The evaluation results demonstrate that our\napproach can generate trajectories from video data, although there are some\nerrors that can be mitigated by improving the performance of the detector,\ntracker, and distance calculation components. In conclusion, the utilization of\nstreet-view video sequences captured by cameras mounted on moving vehicles,\ncombined with state-of-the-art computer vision techniques, has immense\npotential for constructing comprehensive time-space diagrams. 
These diagrams\noffer valuable insights into traffic patterns and contribute to the design of\ntransportation infrastructure and traffic management strategies.\n","authors":["Tanay Rastogi","Mårten Björkman"],"pdf_url":"https://arxiv.org/pdf/2308.06098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06097v1","updated":"2023-08-11T12:17:24Z","published":"2023-08-11T12:17:24Z","title":"RIGID: Recurrent GAN Inversion and Editing of Real Face Videos","summary":" GAN inversion is indispensable for applying the powerful editability of GAN\nto real images. However, existing methods invert video frames individually\noften leading to undesired inconsistent results over time. In this paper, we\npropose a unified recurrent framework, named \\textbf{R}ecurrent v\\textbf{I}deo\n\\textbf{G}AN \\textbf{I}nversion and e\\textbf{D}iting (RIGID), to explicitly and\nsimultaneously enforce temporally coherent GAN inversion and facial editing of\nreal videos. Our approach models the temporal relations between current and\nprevious frames from three aspects. To enable a faithful real video\nreconstruction, we first maximize the inversion fidelity and consistency by\nlearning a temporal compensated latent code. Second, we observe incoherent\nnoises lie in the high-frequency domain that can be disentangled from the\nlatent space. Third, to remove the inconsistency after attribute manipulation,\nwe propose an \\textit{in-between frame composition constraint} such that the\narbitrary frame must be a direct composite of its neighboring frames. Our\nunified framework learns the inherent coherence between input frames in an\nend-to-end manner, and therefore it is agnostic to a specific attribute and can\nbe applied to arbitrary editing of the same video without re-training.\nExtensive experiments demonstrate that RIGID outperforms state-of-the-art\nmethods qualitatively and quantitatively in both inversion and editing tasks.\nThe deliverables can be found in \\url{https://cnnlstm.github.io/RIGID}\n","authors":["Yangyang Xu","Shengfeng He","Kwan-Yee K. Wong","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2308.06097v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/1903.10360v2","updated":"2023-08-11T12:08:58Z","published":"2019-03-25T14:21:08Z","title":"Structured 2D Representation of 3D Data for Shape Processing","summary":" We represent 3D shape by structured 2D representations of fixed length making\nit feasible to apply well investigated 2D convolutional neural networks (CNN)\nfor both discriminative and geometric tasks on 3D shapes. We first provide a\ngeneral introduction to such structured descriptors, analyze their different\nforms and show how a simple 2D CNN can be used to achieve good classification\nresult. With a specialized classification network for images and our structured\nrepresentation, we achieve the classification accuracy of 99.7\\% in the\nModelNet40 test set - improving the previous state-of-the-art by a large\nmargin. 
We finally provide a novel framework for performing the geometric task\nof 3D segmentation using 2D CNNs and the structured representation - concluding\nthe utility of such descriptors for both discriminative and geometric tasks.\n","authors":["Kripasindhu Sarkar","Elizabeth Mathews","Didier Stricker"],"pdf_url":"https://arxiv.org/pdf/1903.10360v2.pdf","comment":"Results of some of the experiments were incorrect"},{"id":"http://arxiv.org/abs/2308.06093v1","updated":"2023-08-11T12:05:12Z","published":"2023-08-11T12:05:12Z","title":"Experts Weights Averaging: A New General Training Scheme for Vision\n Transformers","summary":" Structural re-parameterization is a general training scheme for Convolutional\nNeural Networks (CNNs), which achieves performance improvement without\nincreasing inference cost. As Vision Transformers (ViTs) are gradually\nsurpassing CNNs in various visual tasks, one may question: if a training scheme\nspecifically for ViTs exists that can also achieve performance improvement\nwithout increasing inference cost? Recently, Mixture-of-Experts (MoE) has\nattracted increasing attention, as it can efficiently scale up the capacity of\nTransformers at a fixed cost through sparsely activated experts. Considering\nthat MoE can also be viewed as a multi-branch structure, can we utilize MoE to\nimplement a ViT training scheme similar to structural re-parameterization? In\nthis paper, we affirmatively answer these questions, with a new general\ntraining strategy for ViTs. Specifically, we decouple the training and\ninference phases of ViTs. During training, we replace some Feed-Forward\nNetworks (FFNs) of the ViT with specially designed, more efficient MoEs that\nassign tokens to experts by random uniform partition, and perform Experts\nWeights Averaging (EWA) on these MoEs at the end of each iteration. After\ntraining, we convert each MoE into an FFN by averaging the experts,\ntransforming the model back into original ViT for inference. We further provide\na theoretical analysis to show why and how it works. Comprehensive experiments\nacross various 2D and 3D visual tasks, ViT architectures, and datasets validate\nthe effectiveness and generalizability of the proposed training scheme.\nBesides, our training scheme can also be applied to improve performance when\nfine-tuning ViTs. Lastly, but equally important, the proposed EWA technique can\nsignificantly improve the effectiveness of naive MoE in various 2D visual small\ndatasets and 3D visual tasks.\n","authors":["Yongqi Huang","Peng Ye","Xiaoshui Huang","Sheng Li","Tao Chen","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2308.06093v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2301.01732v4","updated":"2023-08-11T11:55:50Z","published":"2023-01-04T18:02:59Z","title":"UNAEN: Unsupervised Abnormality Extraction Network for MRI Motion\n Artifact Reduction","summary":" Motion artifacts compromise the quality of magnetic resonance imaging (MRI)\nand pose challenges to achieving diagnostic outcomes and image-guided\ntherapies. In recent years, supervised deep learning approaches have emerged as\nsuccessful solutions for motion artifact reduction (MAR). One disadvantage of\nthese methods is their dependency on acquiring paired sets of motion\nartifact-corrupted (MA-corrupted) and motion artifact-free (MA-free) MR images\nfor training purposes. Obtaining such image pairs is difficult and therefore\nlimits the application of supervised training. 
In this paper, we propose a\nnovel UNsupervised Abnormality Extraction Network (UNAEN) to alleviate this\nproblem. Our network is capable of working with unpaired MA-corrupted and\nMA-free images. It converts the MA-corrupted images to MA-reduced images by\nextracting abnormalities from the MA-corrupted images using a proposed artifact\nextractor, which intercepts the residual artifact maps from the MA-corrupted MR\nimages explicitly, and a reconstructor to restore the original input from the\nMA-reduced images. The performance of UNAEN was assessed by experimenting on\nvarious publicly available MRI datasets and comparing them with\nstate-of-the-art methods. The quantitative evaluation demonstrates the\nsuperiority of UNAEN over alternative MAR methods and visually exhibits fewer\nresidual artifacts. Our results substantiate the potential of UNAEN as a\npromising solution applicable in real-world clinical environments, with the\ncapability to enhance diagnostic accuracy and facilitate image-guided\ntherapies.\n","authors":["Yusheng Zhou","Hao Li","Jianan Liu","Zhengmin Kong","Tao Huang","Euijoon Ahn","Zhihan Lv","Jinman Kim","David Dagan Feng"],"pdf_url":"https://arxiv.org/pdf/2301.01732v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06076v1","updated":"2023-08-11T11:29:01Z","published":"2023-08-11T11:29:01Z","title":"Versatile Face Animator: Driving Arbitrary 3D Facial Avatar in RGBD\n Space","summary":" Creating realistic 3D facial animation is crucial for various applications in\nthe movie production and gaming industry, especially with the burgeoning demand\nin the metaverse. However, prevalent methods such as blendshape-based\napproaches and facial rigging techniques are time-consuming, labor-intensive,\nand lack standardized configurations, making facial animation production\nchallenging and costly. In this paper, we propose a novel self-supervised\nframework, Versatile Face Animator, which combines facial motion capture with\nmotion retargeting in an end-to-end manner, eliminating the need for\nblendshapes or rigs. Our method has the following two main characteristics: 1)\nwe propose an RGBD animation module to learn facial motion from raw RGBD videos\nby hierarchical motion dictionaries and animate RGBD images rendered from 3D\nfacial mesh coarse-to-fine, enabling facial animation on arbitrary 3D\ncharacters regardless of their topology, textures, blendshapes, and rigs; and\n2) we introduce a mesh retarget module to utilize RGBD animation to create 3D\nfacial animation by manipulating facial mesh with controller transformations,\nwhich are estimated from dense optical flow fields and blended together with\ngeodesic-distance-based weights. Comprehensive experiments demonstrate the\neffectiveness of our proposed framework in generating impressive 3D facial\nanimation results, highlighting its potential as a promising solution for the\ncost-effective and efficient production of facial animation in the metaverse.\n","authors":["Haoyu Wang","Haozhe Wu","Junliang Xing","Jia Jia"],"pdf_url":"https://arxiv.org/pdf/2308.06076v1.pdf","comment":"Accepted by ACM MM2023"},{"id":"http://arxiv.org/abs/2308.06072v1","updated":"2023-08-11T11:25:23Z","published":"2023-08-11T11:25:23Z","title":"Out-of-Distribution Detection for Monocular Depth Estimation","summary":" In monocular depth estimation, uncertainty estimation approaches mainly\ntarget the data uncertainty introduced by image noise. 
In contrast to prior\nwork, we address the uncertainty due to lack of knowledge, which is relevant\nfor the detection of data not represented by the training distribution, the\nso-called out-of-distribution (OOD) data. Motivated by anomaly detection, we\npropose to detect OOD images from an encoder-decoder depth estimation model\nbased on the reconstruction error. Given the features extracted with the fixed\ndepth encoder, we train an image decoder for image reconstruction using only\nin-distribution data. Consequently, OOD images result in a high reconstruction\nerror, which we use to distinguish between in- and out-of-distribution samples.\nWe built our experiments on the standard NYU Depth V2 and KITTI benchmarks as\nin-distribution data. Our post hoc method performs astonishingly well on\ndifferent models and outperforms existing uncertainty estimation approaches\nwithout modifying the trained encoder-decoder depth estimation model.\n","authors":["Julia Hornauer","Adrian Holzbock","Vasileios Belagiannis"],"pdf_url":"https://arxiv.org/pdf/2308.06072v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2306.02898v3","updated":"2023-08-11T11:13:08Z","published":"2023-06-05T14:06:24Z","title":"Towards Unified Text-based Person Retrieval: A Large-scale\n Multi-Attribute and Language Search Benchmark","summary":" In this paper, we introduce a large Multi-Attribute and Language Search\ndataset for text-based person retrieval, called MALS, and explore the\nfeasibility of performing pre-training on both attribute recognition and\nimage-text matching tasks in one stone. In particular, MALS contains 1,510,330\nimage-text pairs, which is about 37.5 times larger than prevailing CUHK-PEDES,\nand all images are annotated with 27 attributes. Considering the privacy\nconcerns and annotation costs, we leverage the off-the-shelf diffusion models\nto generate the dataset. To verify the feasibility of learning from the\ngenerated data, we develop a new joint Attribute Prompt Learning and Text\nMatching Learning (APTM) framework, considering the shared knowledge between\nattribute and text. As the name implies, APTM contains an attribute prompt\nlearning stream and a text matching learning stream. (1) The attribute prompt\nlearning leverages the attribute prompts for image-attribute alignment, which\nenhances the text matching learning. (2) The text matching learning facilitates\nthe representation learning on fine-grained details, and in turn, boosts the\nattribute prompt learning. Extensive experiments validate the effectiveness of\nthe pre-training on MALS, achieving state-of-the-art retrieval performance via\nAPTM on three challenging real-world benchmarks. In particular, APTM achieves a\nconsistent improvement of +6.96%, +7.68%, and +16.95% Recall@1 accuracy on\nCUHK-PEDES, ICFG-PEDES, and RSTPReid datasets by a clear margin, respectively.\n","authors":["Shuyu Yang","Yinan Zhou","Yaxiong Wang","Yujiao Wu","Li Zhu","Zhedong Zheng"],"pdf_url":"https://arxiv.org/pdf/2306.02898v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.15157v3","updated":"2023-08-11T11:06:09Z","published":"2022-06-30T09:40:05Z","title":"HRFuser: A Multi-resolution Sensor Fusion Architecture for 2D Object\n Detection","summary":" Besides standard cameras, autonomous vehicles typically include multiple\nadditional sensors, such as lidars and radars, which help acquire richer\ninformation for perceiving the content of the driving scene. 
While several\nrecent works focus on fusing certain pairs of sensors - such as camera with\nlidar or radar - by using architectural components specific to the examined\nsetting, a generic and modular sensor fusion architecture is missing from the\nliterature. In this work, we propose HRFuser, a modular architecture for\nmulti-modal 2D object detection. It fuses multiple sensors in a\nmulti-resolution fashion and scales to an arbitrary number of input modalities.\nThe design of HRFuser is based on state-of-the-art high-resolution networks for\nimage-only dense prediction and incorporates a novel multi-window\ncross-attention block as the means to perform fusion of multiple modalities at\nmultiple resolutions. We demonstrate via extensive experiments on nuScenes and\nthe adverse conditions DENSE datasets that our model effectively leverages\ncomplementary features from additional modalities, substantially improving upon\ncamera-only performance and consistently outperforming state-of-the-art 3D and\n2D fusion methods evaluated on 2D object detection metrics. The source code is\npublicly available.\n","authors":["Tim Broedermann","Christos Sakaridis","Dengxin Dai","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2206.15157v3.pdf","comment":"IEEE International Conference on Intelligent Transportation Systems\n (ITSC) 2023"},{"id":"http://arxiv.org/abs/2306.14628v2","updated":"2023-08-11T10:17:04Z","published":"2023-06-26T12:06:20Z","title":"An Integral Projection-based Semantic Autoencoder for Zero-Shot Learning","summary":" Zero-shot Learning (ZSL) classification categorizes or predicts classes\n(labels) that are not included in the training set (unseen classes). Recent\nworks proposed different semantic autoencoder (SAE) models where the encoder\nembeds a visual feature vector space into the semantic space and the decoder\nreconstructs the original visual feature space. The objective is to learn the\nembedding by leveraging a source data distribution, which can be applied\neffectively to a different but related target data distribution. Such\nembedding-based methods are prone to domain shift problems and are vulnerable\nto biases. We propose an integral projection-based semantic autoencoder\n(IP-SAE) where an encoder projects a visual feature space concatenated with the\nsemantic space into a latent representation space. We force the decoder to\nreconstruct the visual-semantic data space. Due to this constraint, the\nvisual-semantic projection function preserves the discriminatory data included\ninside the original visual feature space. The enriched projection forces a more\nprecise reconstitution of the visual feature space invariant to the domain\nmanifold. Consequently, the learned projection function is less domain-specific\nand alleviates the domain shift problem. Our proposed IP-SAE model consolidates\na symmetric transformation function for embedding and projection, and thus, it\nprovides transparency for interpreting generative applications in ZSL.\nTherefore, in addition to outperforming state-of-the-art methods considering\nfour benchmark datasets, our analytical approach allows us to investigate\ndistinct characteristics of generative-based methods in the unique context of\nzero-shot inference.\n","authors":["William Heyden","Habib Ullah","M. 
Salman Siddiqui","Fadi Al Machot"],"pdf_url":"https://arxiv.org/pdf/2306.14628v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06057v1","updated":"2023-08-11T10:14:22Z","published":"2023-08-11T10:14:22Z","title":"Head Rotation in Denoising Diffusion Models","summary":" Denoising Diffusion Models (DDM) are emerging as the cutting-edge technology\nin the realm of deep generative modeling, challenging the dominance of\nGenerative Adversarial Networks. However, effectively exploring the latent\nspace's semantics and identifying compelling trajectories for manipulating and\nediting important attributes of the generated samples remains challenging,\nprimarily due to the high-dimensional nature of the latent space. In this\nstudy, we specifically concentrate on face rotation, which is known to be one\nof the most intricate editing operations. By leveraging a recent embedding\ntechnique for Denoising Diffusion Implicit Models (DDIM), we achieve, in many\ncases, noteworthy manipulations encompassing a wide rotation angle of $\\pm\n30^o$, preserving the distinct characteristics of the individual. Our\nmethodology exploits the computation of trajectories approximating clouds of\nlatent representations of dataset samples with different yaw rotations through\nlinear regression. Specific trajectories are obtained by restricting the\nanalysis to subsets of data sharing significant attributes with the source\nimage. One of these attributes is the light provenance: a byproduct of our\nresearch is a labeling of CelebA, categorizing images into three major groups\nbased on the illumination direction: left, center, and right.\n","authors":["Andrea Asperti","Gabriele Colasuonno","Antonio Guerra"],"pdf_url":"https://arxiv.org/pdf/2308.06057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06055v1","updated":"2023-08-11T10:09:08Z","published":"2023-08-11T10:09:08Z","title":"Computer-Aided Cytology Diagnosis in Animals: CNN-Based Image Quality\n Assessment for Accurate Disease Classification","summary":" This paper presents a computer-aided cytology diagnosis system designed for\nanimals, focusing on image quality assessment (IQA) using Convolutional Neural\nNetworks (CNNs). The system's building blocks are tailored to seamlessly\nintegrate IQA, ensuring reliable performance in disease classification. We\nextensively investigate the CNN's ability to handle various image variations\nand scenarios, analyzing the impact on detecting low-quality input data.\nAdditionally, the network's capacity to differentiate valid cellular samples\nfrom those with artifacts is evaluated. Our study employs a ResNet18 network\narchitecture and explores the effects of input sizes and cropping strategies on\nmodel performance. 
The research sheds light on the significance of CNN-based\nIQA in computer-aided cytology diagnosis for animals, enhancing the accuracy of\ndisease classification.\n","authors":["Jan Krupiński","Maciej Wielgosz","Szymon Mazurek","Krystian Strzałka","Paweł Russek","Jakub Caputa","Daria Łukasik","Jakub Grzeszczyk","Michał Karwatowski","Rafał Fraczek","Ernest Jamro","Marcin Pietroń","Sebastian Koryciak","Agnieszka Dąbrowska-Boruch","Kazimierz Wiatr"],"pdf_url":"https://arxiv.org/pdf/2308.06055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00464v2","updated":"2023-08-11T10:08:46Z","published":"2023-07-02T03:24:58Z","title":"Human-to-Human Interaction Detection","summary":" A comprehensive understanding of interested human-to-human interactions in\nvideo streams, such as queuing, handshaking, fighting and chasing, is of\nimmense importance to the surveillance of public security in regions like\ncampuses, squares and parks. Different from conventional human interaction\nrecognition, which uses choreographed videos as inputs, neglects concurrent\ninteractive groups, and performs detection and recognition in separate stages,\nwe introduce a new task named human-to-human interaction detection (HID). HID\ndevotes to detecting subjects, recognizing person-wise actions, and grouping\npeople according to their interactive relations, in one model. First, based on\nthe popular AVA dataset created for action detection, we establish a new HID\nbenchmark, termed AVA-Interaction (AVA-I), by adding annotations on interactive\nrelations in a frame-by-frame manner. AVA-I consists of 85,254 frames and\n86,338 interactive groups, and each image includes up to 4 concurrent\ninteractive groups. Second, we present a novel baseline approach SaMFormer for\nHID, containing a visual feature extractor, a split stage which leverages a\nTransformer-based model to decode action instances and interactive groups, and\na merging stage which reconstructs the relationship between instances and\ngroups. All SaMFormer components are jointly trained in an end-to-end manner.\nExtensive experiments on AVA-I validate the superiority of SaMFormer over\nrepresentative methods. The dataset and code will be made public to encourage\nmore follow-up studies.\n","authors":["Zhenhua Wang","Kaining Ying","Jiajun Meng","Jifeng Ning"],"pdf_url":"https://arxiv.org/pdf/2307.00464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06054v1","updated":"2023-08-11T10:07:33Z","published":"2023-08-11T10:07:33Z","title":"Hardware Accelerators in Autonomous Driving","summary":" Computing platforms in autonomous vehicles record large amounts of data from\nmany sensors, process the data through machine learning models, and make\ndecisions to ensure the vehicle's safe operation. Fast, accurate, and reliable\ndecision-making is critical. Traditional computer processors lack the power and\nflexibility needed for the perception and machine vision demands of advanced\nautonomous driving tasks. Hardware accelerators are special-purpose\ncoprocessors that help autonomous vehicles meet performance requirements for\nhigher levels of autonomy. This paper provides an overview of ML accelerators\nwith examples of their use for machine vision in autonomous vehicles. 
We offer\nrecommendations for researchers and practitioners and highlight a trajectory\nfor ongoing and future research in this emerging field.\n","authors":["Ken Power","Shailendra Deva","Ting Wang","Julius Li","Ciarán Eising"],"pdf_url":"https://arxiv.org/pdf/2308.06054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06051v1","updated":"2023-08-11T09:58:47Z","published":"2023-08-11T09:58:47Z","title":"Towards Instance-adaptive Inference for Federated Learning","summary":" Federated learning (FL) is a distributed learning paradigm that enables\nmultiple clients to learn a powerful global model by aggregating local\ntraining. However, the performance of the global model is often hampered by\nnon-i.i.d. distribution among the clients, requiring extensive efforts to\nmitigate inter-client data heterogeneity. Going beyond inter-client data\nheterogeneity, we note that intra-client heterogeneity can also be observed on\ncomplex real-world data and seriously deteriorate FL performance. In this\npaper, we present a novel FL algorithm, i.e., FedIns, to handle intra-client\ndata heterogeneity by enabling instance-adaptive inference in the FL framework.\nInstead of huge instance-adaptive models, we resort to a parameter-efficient\nfine-tuning method, i.e., scale and shift deep features (SSF), upon a\npre-trained model. Specifically, we first train an SSF pool for each client,\nand aggregate these SSF pools on the server side, thus still maintaining a low\ncommunication cost. To enable instance-adaptive inference, for a given\ninstance, we dynamically find the best-matched SSF subsets from the pool and\naggregate them to generate an adaptive SSF specified for the instance, thereby\nreducing the intra-client as well as the inter-client heterogeneity. Extensive\nexperiments show that our FedIns outperforms state-of-the-art FL algorithms,\ne.g., a 6.64\\% improvement against the top-performing method with less than\n15\\% communication cost on Tiny-ImageNet. Our code and models will be publicly\nreleased.\n","authors":["Chun-Mei Feng","Kai Yu","Nian Liu","Xinxing Xu","Salman Khan","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.06051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13031v2","updated":"2023-08-11T09:46:37Z","published":"2023-04-25T17:59:54Z","title":"DQS3D: Densely-matched Quantization-aware Semi-supervised 3D Detection","summary":" In this paper, we study the problem of semi-supervised 3D object detection,\nwhich is of great importance considering the high annotation cost for cluttered\n3D indoor scenes. We resort to the robust and principled framework of\nself-teaching, which has triggered notable progress for semi-supervised learning\nrecently. While this paradigm is natural for image-level or pixel-level\nprediction, adapting it to the detection problem is challenged by the issue of\nproposal matching. Prior methods are based upon two-stage pipelines, matching\nheuristically selected proposals generated in the first stage and resulting in\nspatially sparse training signals. In contrast, we propose the first\nsemi-supervised 3D detection algorithm that works in the single-stage manner and\nallows spatially dense training signals. A fundamental issue of this new design\nis the quantization error caused by point-to-voxel discretization, which\ninevitably leads to misalignment between two transformed views in the voxel\ndomain. To this end, we derive and implement closed-form rules that compensate\nthis misalignment on-the-fly. 
Our results are significant, e.g., promoting\nScanNet mAP@0.5 from 35.2% to 48.5% using 20% annotation. Codes and data will\nbe publicly available.\n","authors":["Huan-ang Gao","Beiwen Tian","Pengfei Li","Hao Zhao","Guyue Zhou"],"pdf_url":"https://arxiv.org/pdf/2304.13031v2.pdf","comment":"Accepted to ICCV 2023. Code: https://github.com/AIR-DISCOVER/DQS3D"},{"id":"http://arxiv.org/abs/2303.11681v2","updated":"2023-08-11T09:44:04Z","published":"2023-03-21T08:43:15Z","title":"DiffuMask: Synthesizing Images with Pixel-level Annotations for Semantic\n Segmentation Using Diffusion Models","summary":" Collecting and annotating images with pixel-wise labels is time-consuming and\nlaborious. In contrast, synthetic data can be freely available using a\ngenerative model (e.g., DALL-E, Stable Diffusion). In this paper, we show that\nit is possible to automatically obtain accurate semantic masks of synthetic\nimages generated by the Off-the-shelf Stable Diffusion model, which uses only\ntext-image pairs during training. Our approach, called DiffuMask, exploits the\npotential of the cross-attention map between text and image, which makes it natural\nand seamless to extend text-driven image synthesis to semantic mask\ngeneration. DiffuMask uses text-guided cross-attention information to localize\nclass/word-specific regions, which are combined with practical techniques to\ncreate a novel high-resolution and class-discriminative pixel-wise mask. These\nmethods noticeably help to reduce data collection and annotation costs.\nExperiments demonstrate that the existing segmentation methods trained on\nsynthetic data of DiffuMask can achieve competitive performance compared with the\ncounterpart trained on real data (VOC 2012, Cityscapes). For some classes (e.g., bird),\nDiffuMask presents promising performance, close to the state-of-the-art result\nof real data (within 3% mIoU gap). Moreover, in the open-vocabulary\nsegmentation (zero-shot) setting, DiffuMask achieves a new SOTA result on\nthe Unseen class of VOC 2012. The project website can be found at\nhttps://weijiawu.github.io/DiffusionMask/.\n","authors":["Weijia Wu","Yuzhong Zhao","Mike Zheng Shou","Hong Zhou","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2303.11681v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06038v1","updated":"2023-08-11T09:36:31Z","published":"2023-08-11T09:36:31Z","title":"Diverse Data Augmentation with Diffusions for Effective Test-time Prompt\n Tuning","summary":" Benefiting from prompt tuning, recent years have witnessed the promising\nperformance of pre-trained vision-language models, e.g., CLIP, on versatile\ndownstream tasks. In this paper, we focus on a particular setting of learning\nadaptive prompts on the fly for each test sample from an unseen new domain,\nwhich is known as test-time prompt tuning (TPT). Existing TPT methods typically\nrely on data augmentation and confidence selection. However, conventional data\naugmentation techniques, e.g., random resized crops, suffer from a lack of\ndata diversity, while entropy-based confidence selection alone is not\nsufficient to guarantee prediction fidelity. To address these issues, we\npropose a novel TPT method, named DiffTPT, which leverages pre-trained\ndiffusion models to generate diverse and informative new data. Specifically, we\nincorporate augmented data from both the conventional method and pre-trained stable\ndiffusion to exploit their respective merits, improving the model's ability to\nadapt to unknown new test data. 
Moreover, to ensure the prediction fidelity of\ngenerated data, we introduce a cosine similarity-based filtration technique to\nselect the generated data with higher similarity to the single test sample. Our\nexperiments on test datasets with distribution shifts and unseen categories\ndemonstrate that DiffTPT improves the zero-shot accuracy by an average of\n5.13\\% compared to the state-of-the-art TPT method. Our code and models will be\npublicly released.\n","authors":["Chun-Mei Feng","Kai Yu","Yong Liu","Salman Khan","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.06038v1.pdf","comment":"8 pages,9 figures"},{"id":"http://arxiv.org/abs/2204.08247v3","updated":"2023-08-11T09:30:41Z","published":"2022-04-18T10:50:03Z","title":"Joint Multi-view Unsupervised Feature Selection and Graph Learning","summary":" Despite significant progress, previous multi-view unsupervised feature\nselection methods mostly suffer from two limitations. First, they generally\nutilize either cluster structure or similarity structure to guide the feature\nselection, which neglect the possibility of a joint formulation with mutual\nbenefits. Second, they often learn the similarity structure by either global\nstructure learning or local structure learning, which lack the capability of\ngraph learning with both global and local structural awareness. In light of\nthis, this paper presents a joint multi-view unsupervised feature selection and\ngraph learning (JMVFG) approach. Particularly, we formulate the multi-view\nfeature selection with orthogonal decomposition, where each target matrix is\ndecomposed into a view-specific basis matrix and a view-consistent cluster\nindicator. The cross-space locality preservation is incorporated to bridge the\ncluster structure learning in the projected space and the similarity learning\n(i.e., graph learning) in the original space. Further, a unified objective\nfunction is presented to enable the simultaneous learning of the cluster\nstructure, the global and local similarity structures, and the multi-view\nconsistency and inconsistency, upon which an alternating optimization algorithm\nis developed with theoretically proved convergence. Extensive experiments on a\nvariety of real-world multi-view datasets demonstrate the superiority of our\napproach for both the multi-view feature selection and graph learning tasks.\nThe code is available at https://github.com/huangdonghere/JMVFG.\n","authors":["Si-Guo Fang","Dong Huang","Chang-Dong Wang","Yong Tang"],"pdf_url":"https://arxiv.org/pdf/2204.08247v3.pdf","comment":"To appear in IEEE Transactions on Emerging Topics in Computational\n Intelligence"},{"id":"http://arxiv.org/abs/2204.01558v2","updated":"2023-08-11T09:20:55Z","published":"2022-04-04T15:05:45Z","title":"Con$^{2}$DA: Simplifying Semi-supervised Domain Adaptation by Learning\n Consistent and Contrastive Feature Representations","summary":" In this work, we present Con$^{2}$DA, a simple framework that extends recent\nadvances in semi-supervised learning to the semi-supervised domain adaptation\n(SSDA) problem. Our framework generates pairs of associated samples by\nperforming stochastic data transformations to a given input. Associated data\npairs are mapped to a feature representation space using a feature extractor.\nWe use different loss functions to enforce consistency between the feature\nrepresentations of associated data pairs of samples. 
We show that these learned\nrepresentations are useful to deal with differences in data distributions in\nthe domain adaptation problem. We performed experiments to study the main\ncomponents of our model and we show that (i) learning of the consistent and\ncontrastive feature representations is crucial to extract good discriminative\nfeatures across different domains, and ii) our model benefits from the use of\nstrong augmentation policies. With these findings, our method achieves\nstate-of-the-art performances in three benchmark datasets for SSDA.\n","authors":["Manuel Pérez-Carrasco","Pavlos Protopapas","Guillermo Cabrera-Vives"],"pdf_url":"https://arxiv.org/pdf/2204.01558v2.pdf","comment":"Accepted to NeurIPS 2021 Workshop on Distribution Shifts: Connecting\n Methods and Applications"},{"id":"http://arxiv.org/abs/2307.03903v3","updated":"2023-08-11T09:15:27Z","published":"2023-07-08T05:03:10Z","title":"Adversarial Self-Attack Defense and Spatial-Temporal Relation Mining for\n Visible-Infrared Video Person Re-Identification","summary":" In visible-infrared video person re-identification (re-ID), extracting\nfeatures not affected by complex scenes (such as modality, camera views,\npedestrian pose, background, etc.) changes, and mining and utilizing motion\ninformation are the keys to solving cross-modal pedestrian identity matching.\nTo this end, the paper proposes a new visible-infrared video person re-ID\nmethod from a novel perspective, i.e., adversarial self-attack defense and\nspatial-temporal relation mining. In this work, the changes of views, posture,\nbackground and modal discrepancy are considered as the main factors that cause\nthe perturbations of person identity features. Such interference information\ncontained in the training samples is used as an adversarial perturbation. It\nperforms adversarial attacks on the re-ID model during the training to make the\nmodel more robust to these unfavorable factors. The attack from the adversarial\nperturbation is introduced by activating the interference information contained\nin the input samples without generating adversarial samples, and it can be thus\ncalled adversarial self-attack. This design allows adversarial attack and\ndefense to be integrated into one framework. This paper further proposes a\nspatial-temporal information-guided feature representation network to use the\ninformation in video sequences. The network cannot only extract the information\ncontained in the video-frame sequences but also use the relation of the local\ninformation in space to guide the network to extract more robust features. The\nproposed method exhibits compelling performance on large-scale cross-modality\nvideo datasets. The source code of the proposed method will be released at\nhttps://github.com/lhf12278/xxx.\n","authors":["Huafeng Li","Le Xu","Yafei Zhang","Dapeng Tao","Zhengtao Yu"],"pdf_url":"https://arxiv.org/pdf/2307.03903v3.pdf","comment":"11 pages,8 figures"},{"id":"http://arxiv.org/abs/2308.06027v1","updated":"2023-08-11T09:15:22Z","published":"2023-08-11T09:15:22Z","title":"Masked-Attention Diffusion Guidance for Spatially Controlling\n Text-to-Image Generation","summary":" Text-to-image synthesis has achieved high-quality results with recent\nadvances in diffusion models. However, text input alone has high spatial\nambiguity and limited user controllability. Most existing methods allow spatial\ncontrol through additional visual guidance (e.g, sketches and semantic masks)\nbut require additional training with annotated images. 
In this paper, we\npropose a method for spatially controlling text-to-image generation without\nfurther training of diffusion models. Our method is based on the insight that\nthe cross-attention maps reflect the positional relationship between words and\npixels. Our aim is to control the attention maps according to given semantic\nmasks and text prompts. To this end, we first explore a simple approach of\ndirectly swapping the cross-attention maps with constant maps computed from the\nsemantic regions. Moreover, we propose masked-attention guidance, which can\ngenerate images more faithful to semantic masks than the first approach.\nMasked-attention guidance indirectly controls attention to each word and pixel\naccording to the semantic regions by manipulating noise images fed to diffusion\nmodels. Experiments show that our method enables more accurate spatial control\nthan baselines qualitatively and quantitatively.\n","authors":["Yuki Endo"],"pdf_url":"https://arxiv.org/pdf/2308.06027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.04531v3","updated":"2023-08-11T09:11:59Z","published":"2022-06-09T14:25:23Z","title":"ECLAD: Extracting Concepts with Local Aggregated Descriptors","summary":" Convolutional neural networks (CNNs) are increasingly being used in critical\nsystems, where robustness and alignment are crucial. In this context, the field\nof explainable artificial intelligence has proposed the generation of\nhigh-level explanations of the prediction process of CNNs through concept\nextraction. While these methods can detect whether or not a concept is present\nin an image, they are unable to determine its location. What is more, a fair\ncomparison of such approaches is difficult due to a lack of proper validation\nprocedures. To address these issues, we propose a novel method for automatic\nconcept extraction and localization based on representations obtained through\npixel-wise aggregations of CNN activation maps. Further, we introduce a process\nfor the validation of concept-extraction techniques based on synthetic datasets\nwith pixel-wise annotations of their main components, reducing the need for\nhuman intervention. Extensive experimentation on both synthetic and real-world\ndatasets demonstrates that our method outperforms state-of-the-art\nalternatives.\n","authors":["Andres Felipe Posada-Moreno","Nikita Surya","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2206.04531v3.pdf","comment":"34 pages, under review"},{"id":"http://arxiv.org/abs/2308.06024v1","updated":"2023-08-11T09:02:03Z","published":"2023-08-11T09:02:03Z","title":"Spatial-information Guided Adaptive Context-aware Network for Efficient\n RGB-D Semantic Segmentation","summary":" Efficient RGB-D semantic segmentation has received considerable attention in\nmobile robots, which plays a vital role in analyzing and recognizing\nenvironmental information. According to previous studies, depth information can\nprovide corresponding geometric relationships for objects and scenes, but\nactual depth data usually exist as noise. To avoid unfavorable effects on\nsegmentation accuracy and computation, it is necessary to design an efficient\nframework to leverage cross-modal correlations and complementary cues. In this\npaper, we propose an efficient lightweight encoder-decoder network that reduces\nthe computational parameters and guarantees the robustness of the algorithm.\nWorking with channel and spatial fusion attention modules, our network\neffectively captures multi-level RGB-D features. 
A globally guided local\naffinity context module is proposed to obtain sufficient high-level context\ninformation. The decoder utilizes a lightweight residual unit that combines\nshort- and long-distance information with a few redundant computations.\nExperimental results on NYUv2, SUN RGB-D, and Cityscapes datasets show that our\nmethod achieves a better trade-off among segmentation accuracy, inference time,\nand parameters than the state-of-the-art methods. The source code will be at\nhttps://github.com/MVME-HBUT/SGACNet\n","authors":["Yang Zhang","Chenyun Xiong","Junjie Liu","Xuhui Ye","Guodong Sun"],"pdf_url":"https://arxiv.org/pdf/2308.06024v1.pdf","comment":"Accepted by IEEE Sensors Journal"},{"id":"http://arxiv.org/abs/2308.06022v1","updated":"2023-08-11T08:54:45Z","published":"2023-08-11T08:54:45Z","title":"Scale-Preserving Automatic Concept Extraction (SPACE)","summary":" Convolutional Neural Networks (CNN) have become a common choice for\nindustrial quality control, as well as other critical applications in the\nIndustry 4.0. When these CNNs behave in ways unexpected to human users or\ndevelopers, severe consequences can arise, such as economic losses or an\nincreased risk to human life. Concept extraction techniques can be applied to\nincrease the reliability and transparency of CNNs through generating global\nexplanations for trained neural network models. The decisive features of image\ndatasets in quality control often depend on the feature's scale; for example,\nthe size of a hole or an edge. However, existing concept extraction methods do\nnot correctly represent scale, which leads to problems interpreting these\nmodels as we show herein. To address this issue, we introduce the\nScale-Preserving Automatic Concept Extraction (SPACE) algorithm, as a\nstate-of-the-art alternative concept extraction technique for CNNs, focused on\nindustrial applications. SPACE is specifically designed to overcome the\naforementioned problems by avoiding scale changes throughout the concept\nextraction process. SPACE proposes an approach based on square slices of input\nimages, which are selected and then tiled before being clustered into concepts.\nOur method provides explanations of the models' decision-making process in the\nform of human-understandable concepts. We evaluate SPACE on three image\nclassification datasets in the context of industrial quality control. Through\nexperimental results, we illustrate how SPACE outperforms other methods and\nprovides actionable insights on the decision mechanisms of CNNs. Finally, code\nfor the implementation of SPACE is provided.\n","authors":["Andrés Felipe Posada-Moreno","Lukas Kreisköther","Tassilo Glander","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2308.06022v1.pdf","comment":"22 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.06015v1","updated":"2023-08-11T08:44:58Z","published":"2023-08-11T08:44:58Z","title":"Enhancing Generalization of Universal Adversarial Perturbation through\n Gradient Aggregation","summary":" Deep neural networks are vulnerable to universal adversarial perturbation\n(UAP), an instance-agnostic perturbation capable of fooling the target model\nfor most samples. 
Compared to instance-specific adversarial examples, UAP is\nmore challenging as it needs to generalize across various samples and models.\nIn this paper, we examine the serious dilemma of UAP generation methods from a\ngeneralization perspective -- the gradient vanishing problem using small-batch\nstochastic gradient optimization and the local optima problem using large-batch\noptimization. To address these problems, we propose a simple and effective\nmethod called Stochastic Gradient Aggregation (SGA), which alleviates the\ngradient vanishing and escapes from poor local optima at the same time.\nSpecifically, SGA employs the small-batch training to perform multiple\niterations of inner pre-search. Then, all the inner gradients are aggregated as\na one-step gradient estimation to enhance the gradient stability and reduce\nquantization errors. Extensive experiments on the standard ImageNet dataset\ndemonstrate that our method significantly enhances the generalization ability\nof UAP and outperforms other state-of-the-art methods. The code is available at\nhttps://github.com/liuxuannan/Stochastic-Gradient-Aggregation.\n","authors":["Xuannan Liu","Yaoyao Zhong","Yuhang Zhang","Lixiong Qin","Weihong Deng"],"pdf_url":"https://arxiv.org/pdf/2308.06015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17271v2","updated":"2023-08-11T08:35:06Z","published":"2023-05-26T21:36:08Z","title":"Robust Lane Detection through Self Pre-training with Masked Sequential\n Autoencoders and Fine-tuning with Customized PolyLoss","summary":" Lane detection is crucial for vehicle localization which makes it the\nfoundation for automated driving and many intelligent and advanced driving\nassistant systems. Available vision-based lane detection methods do not make\nfull use of the valuable features and aggregate contextual information,\nespecially the interrelationships between lane lines and other regions of the\nimages in continuous frames. To fill this research gap and upgrade lane\ndetection performance, this paper proposes a pipeline consisting of self\npre-training with masked sequential autoencoders and fine-tuning with\ncustomized PolyLoss for the end-to-end neural network models using\nmulti-continuous image frames. The masked sequential autoencoders are adopted\nto pre-train the neural network models with reconstructing the missing pixels\nfrom a random masked image as the objective. Then, in the fine-tuning\nsegmentation phase where lane detection segmentation is performed, the\ncontinuous image frames are served as the inputs, and the pre-trained model\nweights are transferred and further updated using the backpropagation mechanism\nwith customized PolyLoss calculating the weighted errors between the output\nlane detection results and the labeled ground truth. 
Extensive experiment\nresults demonstrate that, with the proposed pipeline, the lane detection model\nperformance on both normal and challenging scenes can be advanced beyond the\nstate-of-the-art, delivering the best testing accuracy (98.38%), precision\n(0.937), and F1-measure (0.924) on the normal scene testing set, together with\nthe best overall accuracy (98.36%) and precision (0.844) in the challenging\nscene test set, while the training time can be substantially shortened.\n","authors":["Ruohan Li","Yongqi Dong"],"pdf_url":"https://arxiv.org/pdf/2305.17271v2.pdf","comment":"12 pages, 8 figures, accepted by journal of IEEE Transactions on\n Intelligent Transportation Systems"},{"id":"http://arxiv.org/abs/2308.06009v1","updated":"2023-08-11T08:30:08Z","published":"2023-08-11T08:30:08Z","title":"ViGT: Proposal-free Video Grounding with Learnable Token in Transformer","summary":" The video grounding (VG) task aims to locate the queried action or event in\nan untrimmed video based on rich linguistic descriptions. Existing\nproposal-free methods are trapped in complex interaction between video and\nquery, overemphasizing cross-modal feature fusion and feature correlation for\nVG. In this paper, we propose a novel boundary regression paradigm that\nperforms regression token learning in a transformer. Particularly, we present a\nsimple but effective proposal-free framework, namely Video Grounding\nTransformer (ViGT), which predicts the temporal boundary using a learnable\nregression token rather than multi-modal or cross-modal features. In ViGT, the\nbenefits of a learnable token are manifested as follows. (1) The token is\nunrelated to the video or the query and avoids data bias toward the original\nvideo and query. (2) The token simultaneously performs global context\naggregation from video and query features. First, we employed a sharing feature\nencoder to project both video and query into a joint feature space before\nperforming cross-modal co-attention (i.e., video-to-query attention and\nquery-to-video attention) to highlight discriminative features in each\nmodality. Furthermore, we concatenated a learnable regression token [REG] with\nthe video and query features as the input of a vision-language transformer.\nFinally, we utilized the token [REG] to predict the target moment and visual\nfeatures to constrain the foreground and background probabilities at each\ntimestamp. The proposed ViGT performed well on three public datasets: ANet\nCaptions, TACoS and YouCookII. Extensive ablation studies and qualitative\nanalysis further validated the interpretability of ViGT.\n","authors":["Kun Li","Dan Guo","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.06009v1.pdf","comment":"This paper has been accepted by SCIENCE CHINA Information Sciences"},{"id":"http://arxiv.org/abs/2308.05993v1","updated":"2023-08-11T08:00:30Z","published":"2023-08-11T08:00:30Z","title":"Image-based Geolocalization by Ground-to-2.5D Map Matching","summary":" We study the image-based geolocalization problem that aims to locate\nground-view query images on cartographic maps. Previous methods often utilize\ncross-view localization techniques to match ground-view query images with 2D\nmaps. However, the performance of these methods is frequently unsatisfactory\ndue to the significant cross-view appearance differences. 
In this paper, we\nextend cross-view matching to 2.5D spaces, where the heights of the structures\n- such as trees, buildings, and other objects - can provide additional\ninformation to guide the cross-view matching. We present a new approach to\nlearning representative embeddings from multi-modal data. Specifically, we\nfirst align 2D maps to ground-view panoramic images with polar transform to\nreduce the gap between panoramic images and maps. Then we leverage global\nfusion to fuse the multi-modal features from 2D and 2.5D maps to increase the\ndistinctiveness of location embeddings. We construct the first large-scale\nground-to-2.5D map geolocalization dataset to validate our method and\nfacilitate the research. We test our learned embeddings on two popular\nlocalization approaches, i.e., single-image based localization and route based\nlocalization. Extensive experiments demonstrate that our proposed method\nachieves significantly higher localization accuracy and faster convergence than\nprevious 2D map-based approaches.\n","authors":["Mengjie Zhou","Liu Liu","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.05993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05991v1","updated":"2023-08-11T07:57:17Z","published":"2023-08-11T07:57:17Z","title":"Cyclic-Bootstrap Labeling for Weakly Supervised Object Detection","summary":" Recent progress in weakly supervised object detection is featured by a\ncombination of multiple instance detection networks (MIDN) and ordinal online\nrefinement. However, with only image-level annotation, MIDN inevitably assigns\nhigh scores to some unexpected region proposals when generating pseudo labels.\nThese inaccurate high-scoring region proposals will mislead the training of\nsubsequent refinement modules and thus hamper the detection performance. In\nthis work, we explore how to ameliorate the quality of pseudo-labeling in MIDN.\nFormally, we devise Cyclic-Bootstrap Labeling (CBL), a novel weakly supervised\nobject detection pipeline, which optimizes MIDN with rank information from a\nreliable teacher network. Specifically, we obtain this teacher network by\nintroducing a weighted exponential moving average strategy to take advantage of\nvarious refinement modules. A novel class-specific ranking distillation\nalgorithm is proposed to leverage the output of the weighted ensembled teacher\nnetwork for distilling MIDN with rank information. As a result, MIDN is guided\nto assign higher scores to accurate proposals among their neighboring ones,\nthus benefiting the subsequent pseudo labeling. Extensive experiments on the\nprevalent PASCAL VOC 2007 \\& 2012 and COCO datasets demonstrate the superior\nperformance of our CBL framework. Code will be available at\nhttps://github.com/Yinyf0804/WSOD-CBL/.\n","authors":["Yufei Yin","Jiajun Deng","Wengang Zhou","Li Li","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2308.05991v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.05988v1","updated":"2023-08-11T07:56:10Z","published":"2023-08-11T07:56:10Z","title":"MS3D++: Ensemble of Experts for Multi-Source Unsupervised Domain\n Adaption in 3D Object Detection","summary":" Deploying 3D detectors in unfamiliar domains has been demonstrated to result\nin a drastic drop of up to 70-90% in detection rate due to variations in lidar,\ngeographical region, or weather conditions from their original training\ndataset. 
This domain gap leads to missing detections for densely observed\nobjects, misaligned confidence scores, and increased high-confidence false\npositives, rendering the detector highly unreliable. To address this, we\nintroduce MS3D++, a self-training framework for multi-source unsupervised\ndomain adaptation in 3D object detection. MS3D++ provides a straightforward\napproach to domain adaptation by generating high-quality pseudo-labels,\nenabling the adaptation of 3D detectors to a diverse range of lidar types,\nregardless of their density. Our approach effectively fuses predictions of an\nensemble of multi-frame pre-trained detectors from different source domains to\nimprove domain generalization. We subsequently refine the predictions\ntemporally to ensure temporal consistency in box localization and object\nclassification. Furthermore, we present an in-depth study into the performance\nand idiosyncrasies of various 3D detector components in a cross-domain context,\nproviding valuable insights for improved cross-domain detector ensembling.\nExperimental results on Waymo, nuScenes and Lyft demonstrate that detectors\ntrained with MS3D++ pseudo-labels achieve state-of-the-art performance,\ncomparable to training with human-annotated labels in Bird's Eye View (BEV)\nevaluation for both low and high density lidar.\n","authors":["Darren Tsai","Julie Stephany Berrio","Mao Shan","Eduardo Nebot","Stewart Worrall"],"pdf_url":"https://arxiv.org/pdf/2308.05988v1.pdf","comment":"Code is available at https://github.com/darrenjkt/MS3D"},{"id":"http://arxiv.org/abs/2308.05983v1","updated":"2023-08-11T07:38:46Z","published":"2023-08-11T07:38:46Z","title":"Face Encryption via Frequency-Restricted Identity-Agnostic Attacks","summary":" Billions of people are sharing their daily live images on social media\nevery day. However, malicious collectors use deep face recognition systems to\neasily steal their biometric information (e.g., faces) from these images. Some\nstudies are being conducted to generate encrypted face photos using adversarial\nattacks by introducing imperceptible perturbations to reduce face information\nleakage. However, existing studies need stronger black-box scenario feasibility\nand more natural visual appearances, which challenge the feasibility of privacy\nprotection. To address these problems, we propose a frequency-restricted\nidentity-agnostic (FRIA) framework to encrypt face images from unauthorized\nface recognition without access to personal information. As for the weak\nblack-box scenario feasibility, we observe that representations of the average\nfeature in multiple face recognition models are similar, thus we propose to\nutilize the average feature via the crawled dataset from the Internet as the\ntarget to guide the generation, which is also agnostic to identities of unknown\nface recognition systems; in nature, the low-frequency perturbations are more\nvisually perceptible by the human vision system. Inspired by this, we restrict\nthe perturbation in the low-frequency facial regions by discrete cosine\ntransform to achieve the visual naturalness guarantee. Extensive experiments on\nseveral face recognition models demonstrate that our FRIA outperforms other\nstate-of-the-art methods in generating more natural encrypted faces while\nattaining high black-box attack success rates of 96%. In addition, we validate\nthe efficacy of FRIA using a real-world black-box commercial API, which reveals\nthe potential of FRIA in practice. 
Our codes can be found in\nhttps://github.com/XinDong10/FRIA.\n","authors":["Xin Dong","Rui Wang","Siyuan Liang","Aishan Liu","Lihua Jing"],"pdf_url":"https://arxiv.org/pdf/2308.05983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05976v1","updated":"2023-08-11T07:20:24Z","published":"2023-08-11T07:20:24Z","title":"Zero-shot Text-driven Physically Interpretable Face Editing","summary":" This paper proposes a novel and physically interpretable method for face\nediting based on arbitrary text prompts. Different from previous\nGAN-inversion-based face editing methods that manipulate the latent space of\nGANs, or diffusion-based methods that model image manipulation as a reverse\ndiffusion process, we regard the face editing process as imposing vector flow\nfields on face images, representing the offset of spatial coordinates and color\nfor each image pixel. Under the above-proposed paradigm, we represent the\nvector flow field in two ways: 1) explicitly represent the flow vectors with\nrasterized tensors, and 2) implicitly parameterize the flow vectors as\ncontinuous, smooth, and resolution-agnostic neural fields, by leveraging the\nrecent advances of implicit neural representations. The flow vectors are\niteratively optimized under the guidance of the pre-trained Contrastive\nLanguage-Image Pretraining~(CLIP) model by maximizing the correlation between\nthe edited image and the text prompt. We also propose a learning-based one-shot\nface editing framework, which is fast and adaptable to any text prompt input.\nOur method can also be flexibly extended to real-time video face editing.\nCompared with state-of-the-art text-driven face editing methods, our method can\ngenerate physically interpretable face editing results with high identity\nconsistency and image quality. Our code will be made publicly available.\n","authors":["Yapeng Meng","Songru Yang","Xu Hu","Rui Zhao","Lincheng Li","Zhenwei Shi","Zhengxia Zou"],"pdf_url":"https://arxiv.org/pdf/2308.05976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05970v1","updated":"2023-08-11T07:07:40Z","published":"2023-08-11T07:07:40Z","title":"Focused Specific Objects NeRF","summary":" Most NeRF-based models are designed for learning the entire scene, and\ncomplex scenes can lead to longer learning times and poorer rendering effects.\nThis paper utilizes scene semantic priors to make improvements in fast\ntraining, allowing the network to focus on the specific targets and not be\naffected by complex backgrounds. The training speed can be increased by 7.78\ntimes with better rendering effect, and small to medium sized targets can be\nrendered faster. In addition, this improvement applies to all NeRF-based\nmodels. Considering the inherent multi-view consistency and smoothness of NeRF,\nthis paper also studies weak supervision by sparsely sampling negative ray\nsamples. With this method, training can be further accelerated and rendering\nquality can be maintained. 
Finally, this paper extends pixel semantic and color\nrendering formulas and proposes a new scene editing technique that can achieve\nunique displays of the specific semantic targets or masking them in rendering.\nTo address the problem of incorrect inferences in unsupervised regions of the\nscene, we also designed a self-supervised loop that combines morphological\noperations and clustering.\n","authors":["Yuesong Li","Feng Pan","Helong Yan","Xiuli Xin","Xiaoxue Feng"],"pdf_url":"https://arxiv.org/pdf/2308.05970v1.pdf","comment":"17 pages,32 figures"},{"id":"http://arxiv.org/abs/2308.05967v1","updated":"2023-08-11T06:54:55Z","published":"2023-08-11T06:54:55Z","title":"YOLOrtho -- A Unified Framework for Teeth Enumeration and Dental Disease\n Detection","summary":" Detecting dental diseases through panoramic X-ray images is a standard\nprocedure for dentists. Normally, a dentist needs to identify diseases and find\nthe infected teeth. While numerous machine learning models adopting this\ntwo-step procedure have been developed, there has not been an end-to-end model\nthat can identify teeth and their associated diseases at the same time. To fill\nthe gap, we develop YOLOrtho, a unified framework for teeth enumeration and\ndental disease detection. We develop our model on Dentex Challenge 2023 data,\nwhich consists of three distinct types of annotated data. The first part is\nlabeled with quadrant, and the second part is labeled with quadrant and\nenumeration and the third part is labeled with quadrant, enumeration and\ndisease. To further improve detection, we make use of the Tufts Dental public\ndataset. To fully utilize the data and learn both teeth detection and disease\nidentification simultaneously, we formulate diseases as attributes attached to\ntheir corresponding teeth. Due to the nature of position relation in teeth\nenumeration, we replace the convolution layer with CoordConv in our model to\nprovide more position information for the model. We also adjust the model\narchitecture and insert one more upsampling layer in FPN in favor of large\nobject detection. Finally, we propose a post-process strategy for teeth layout\nthat corrects teeth enumeration based on linear sum assignment. Results from\nexperiments show that our model exceeds the large Diffusion-based model.\n","authors":["Shenxiao Mei","Chenglong Ma","Feihong Shen","Huikai Wu"],"pdf_url":"https://arxiv.org/pdf/2308.05967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05961v1","updated":"2023-08-11T06:41:20Z","published":"2023-08-11T06:41:20Z","title":"Compositional Learning in Transformer-Based Human-Object Interaction\n Detection","summary":" Human-object interaction (HOI) detection is an important part of\nunderstanding human activities and visual scenes. The long-tailed distribution\nof labeled instances is a primary challenge in HOI detection, promoting\nresearch in few-shot and zero-shot learning. Inspired by the combinatorial\nnature of HOI triplets, some existing approaches adopt the idea of\ncompositional learning, in which object and action features are learned\nindividually and re-composed as new training samples. However, these methods\nfollow the CNN-based two-stage paradigm with limited feature extraction\nability, and often rely on auxiliary information for better performance.\nWithout introducing any additional information, we creatively propose a\ntransformer-based framework for compositional HOI learning. 
Human-object pair\nrepresentations and interaction representations are re-composed across\ndifferent HOI instances, which involves richer contextual information and\npromotes the generalization of knowledge. Experiments show our simple but\neffective method achieves state-of-the-art performance, especially on rare HOI\nclasses.\n","authors":["Zikun Zhuang","Ruihao Qian","Chi Xie","Shuang Liang"],"pdf_url":"https://arxiv.org/pdf/2308.05961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05959v1","updated":"2023-08-11T06:28:19Z","published":"2023-08-11T06:28:19Z","title":"Learned Point Cloud Compression for Classification","summary":" Deep learning is increasingly being used to perform machine vision tasks such\nas classification, object detection, and segmentation on 3D point cloud data.\nHowever, deep learning inference is computationally expensive. The limited\ncomputational capabilities of end devices thus necessitate a codec for\ntransmitting point cloud data over the network for server-side processing. Such\na codec must be lightweight and capable of achieving high compression ratios\nwithout sacrificing accuracy. Motivated by this, we present a novel point cloud\ncodec that is highly specialized for the machine task of classification. Our\ncodec, based on PointNet, achieves a significantly better rate-accuracy\ntrade-off in comparison to alternative methods. In particular, it achieves a\n94% reduction in BD-bitrate over non-specialized codecs on the ModelNet40\ndataset. For low-resource end devices, we also propose two lightweight\nconfigurations of our encoder that achieve similar BD-bitrate reductions of 93%\nand 92% with 3% and 5% drops in top-1 accuracy, while consuming only 0.470 and\n0.048 encoder-side kMACs/point, respectively. Our codec demonstrates the\npotential of specialized codecs for machine analysis of point clouds, and\nprovides a basis for extension to more complex tasks and datasets in the\nfuture.\n","authors":["Mateen Ulhaq","Ivan V. Bajić"],"pdf_url":"https://arxiv.org/pdf/2308.05959v1.pdf","comment":"6 pages, 4 figures, IEEE MMSP 2023"},{"id":"http://arxiv.org/abs/2308.04830v2","updated":"2023-08-11T05:56:35Z","published":"2023-08-09T09:38:14Z","title":"VAST: Vivify Your Talking Avatar via Zero-Shot Expressive Facial Style\n Transfer","summary":" Current talking face generation methods mainly focus on speech-lip\nsynchronization. However, insufficient investigation on the facial talking\nstyle leads to a lifeless and monotonous avatar. Most previous works fail to\nimitate expressive styles from arbitrary video prompts and ensure the\nauthenticity of the generated video. This paper proposes an unsupervised\nvariational style transfer model (VAST) to vivify the neutral photo-realistic\navatars. Our model consists of three key components: a style encoder that\nextracts facial style representations from the given video prompts; a hybrid\nfacial expression decoder to model accurate speech-related movements; a\nvariational style enhancer that enhances the style space to be highly\nexpressive and meaningful. With our essential designs on facial style learning,\nour model is able to flexibly capture the expressive facial style from\narbitrary video prompts and transfer it onto a personalized image renderer in a\nzero-shot manner. 
Experimental results demonstrate the proposed approach\ncontributes to a more vivid talking avatar with higher authenticity and richer\nexpressiveness.\n","authors":["Liyang Chen","Zhiyong Wu","Runnan Li","Weihong Bao","Jun Ling","Xu Tan","Sheng Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.04830v2.pdf","comment":"Accepted by ICCV2023 Workshop"},{"id":"http://arxiv.org/abs/2308.05948v1","updated":"2023-08-11T05:46:52Z","published":"2023-08-11T05:46:52Z","title":"Uncertainty-Aware Cross-Modal Transfer Network for Sketch-Based 3D Shape\n Retrieval","summary":" In recent years, sketch-based 3D shape retrieval has attracted growing\nattention. While many previous studies have focused on cross-modal matching\nbetween hand-drawn sketches and 3D shapes, the critical issue of how to handle\nlow-quality and noisy samples in sketch data has been largely neglected. This\npaper presents an uncertainty-aware cross-modal transfer network (UACTN) that\naddresses this issue. UACTN decouples the representation learning of sketches\nand 3D shapes into two separate tasks: classification-based sketch uncertainty\nlearning and 3D shape feature transfer. We first introduce an end-to-end\nclassification-based approach that simultaneously learns sketch features and\nuncertainty, allowing uncertainty to prevent overfitting noisy sketches by\nassigning different levels of importance to clean and noisy sketches. Then, 3D\nshape features are mapped into the pre-learned sketch embedding space for\nfeature alignment. Extensive experiments and ablation studies on two benchmarks\ndemonstrate the superiority of our proposed method compared to state-of-the-art\nmethods.\n","authors":["Yiyang Cai","Jiaming Lu","Jiewen Wang","Shuang Liang"],"pdf_url":"https://arxiv.org/pdf/2308.05948v1.pdf","comment":"6 pages, 7 figures; To be published in IEEE International Conference\n on Multimedia and Expo 2023"},{"id":"http://arxiv.org/abs/2308.05721v2","updated":"2023-08-11T05:39:02Z","published":"2023-08-10T17:37:49Z","title":"Deformable Mixer Transformer with Gating for Multi-Task Learning of\n Dense Prediction","summary":" CNNs and Transformers have their own advantages and both have been widely\nused for dense prediction in multi-task learning (MTL). Most of the current\nstudies on MTL solely rely on CNN or Transformer. In this work, we present a\nnovel MTL model by combining both merits of deformable CNN and query-based\nTransformer with shared gating for multi-task learning of dense prediction.\nThis combination may offer a simple and efficient solution owing to its\npowerful and flexible task-specific learning and advantages of lower cost, less\ncomplexity and smaller parameters than the traditional MTL methods. We\nintroduce deformable mixer Transformer with gating (DeMTG), a simple and\neffective encoder-decoder architecture up-to-date that incorporates the\nconvolution and attention mechanism in a unified network for MTL. It is\nexquisitely designed to use advantages of each block, and provide deformable\nand comprehensive features for all tasks from local and global perspective.\nFirst, the deformable mixer encoder contains two types of operators: the\nchannel-aware mixing operator leveraged to allow communication among different\nchannels, and the spatial-aware deformable operator with deformable convolution\napplied to efficiently sample more informative spatial locations. 
Second, the\ntask-aware gating transformer decoder is used to perform the task-specific\npredictions, in which task interaction block integrated with self-attention is\napplied to capture task interaction features, and the task query block\nintegrated with gating attention is leveraged to select corresponding\ntask-specific features. Further, the experiment results demonstrate that the\nproposed DeMTG uses fewer GFLOPs and significantly outperforms current\nTransformer-based and CNN-based competitive models on a variety of metrics on\nthree dense prediction datasets. Our code and models are available at\nhttps://github.com/yangyangxu0/DeMTG.\n","authors":["Yangyang Xu","Yibo Yang","Bernard Ghanemm","Lefei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.05721v2.pdf","comment":"submitted to IJCV; an extension to our previous AAAI 2023 paper\n arXiv:2301.03461"},{"id":"http://arxiv.org/abs/2303.06358v2","updated":"2023-08-11T05:36:12Z","published":"2023-03-11T09:40:05Z","title":"O2CTA: Introducing Annotations from OCT to CCTA in Coronary Plaque\n Analysis","summary":" Targeted diagnosis and treatment plans for patients with coronary artery\ndisease vary according to atherosclerotic plaque component. Coronary CT\nangiography (CCTA) is widely used for artery imaging and determining the\nstenosis degree. However, the limited spatial resolution and susceptibility to\nartifacts fail CCTA in obtaining lumen morphological characteristics and plaque\ncomposition. It can be settled by invasive optical coherence tomography (OCT)\nwithout much trouble for physicians, but bringing higher costs and potential\nrisks to patients. Therefore, it is clinically critical to introduce\nannotations of plaque tissue and lumen characteristics from OCT to paired CCTA\nscans, denoted as \\textbf{the O2CTA problem} in this paper. We propose a method\nto handle the O2CTA problem. CCTA scans are first reconstructed into\nmulti-planar reformatted (MPR) images, which agree with OCT images in term of\nsemantic contents. The artery segment in OCT, which is manually labelled, is\nthen spatially aligned with the entire artery in MPR images via the proposed\nalignment strategy. Finally, a classification model involving a 3D CNN and a\nTransformer, is learned to extract local features and capture dependence along\narteries. Experiments on 55 paired OCT and CCTA we curate demonstrate that it\nis feasible to classify the CCTA based on the OCT labels, with an accuracy of\n86.2%, while the manual readings of OCT and CCTA vary significantly, with a\nKappa coefficient of 0.113. We will make our source codes, models, data, and\nresults publicly available to benefit the research community.\n","authors":["Jun Li","Kexin Li","Yafeng Zhou","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2303.06358v2.pdf","comment":"Accepted for oral presentation in MICCAI-BTSD 2023 workshop"},{"id":"http://arxiv.org/abs/2308.01097v3","updated":"2023-08-11T05:30:10Z","published":"2023-08-02T12:04:28Z","title":"Spatio-Temporal Branching for Motion Prediction using Motion Increments","summary":" Human motion prediction (HMP) has emerged as a popular research topic due to\nits diverse applications, but it remains a challenging task due to the\nstochastic and aperiodic nature of future poses. Traditional methods rely on\nhand-crafted features and machine learning techniques, which often struggle to\nmodel the complex dynamics of human motion. 
Recent deep learning-based methods\nhave achieved success by learning spatio-temporal representations of motion,\nbut these models often overlook the reliability of motion data. Additionally,\nthe temporal and spatial dependencies of skeleton nodes are distinct. The\ntemporal relationship captures motion information over time, while the spatial\nrelationship describes body structure and the relationships between different\nnodes. In this paper, we propose a novel spatio-temporal branching network\nusing incremental information for HMP, which decouples the learning of\ntemporal-domain and spatial-domain features, extracts more motion information,\nand achieves complementary cross-domain knowledge learning through knowledge\ndistillation. Our approach effectively reduces noise interference and provides\nmore expressive information for characterizing motion by separately extracting\ntemporal and spatial features. We evaluate our approach on standard HMP\nbenchmarks and outperform state-of-the-art methods in terms of prediction\naccuracy.\n","authors":["Jiexin Wang","Yujie Zhou","Wenwen Qiang","Ying Ba","Bing Su","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.01097v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03364v2","updated":"2023-08-11T05:21:15Z","published":"2023-08-07T07:39:39Z","title":"Dual Aggregation Transformer for Image Super-Resolution","summary":" Transformer has recently gained considerable popularity in low-level vision\ntasks, including image super-resolution (SR). These networks utilize\nself-attention along different dimensions, spatial or channel, and achieve\nimpressive performance. This inspires us to combine the two dimensions in\nTransformer for a more powerful representation capability. Based on the above\nidea, we propose a novel Transformer model, Dual Aggregation Transformer (DAT),\nfor image SR. Our DAT aggregates features across spatial and channel\ndimensions, in the inter-block and intra-block dual manner. Specifically, we\nalternately apply spatial and channel self-attention in consecutive Transformer\nblocks. The alternate strategy enables DAT to capture the global context and\nrealize inter-block feature aggregation. Furthermore, we propose the adaptive\ninteraction module (AIM) and the spatial-gate feed-forward network (SGFN) to\nachieve intra-block feature aggregation. AIM complements two self-attention\nmechanisms from corresponding dimensions. Meanwhile, SGFN introduces additional\nnon-linear spatial information in the feed-forward network. Extensive\nexperiments show that our DAT surpasses current methods. Code and models are\nobtainable at https://github.com/zhengchen1999/DAT.\n","authors":["Zheng Chen","Yulun Zhang","Jinjin Gu","Linghe Kong","Xiaokang Yang","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03364v2.pdf","comment":"Accepted to ICCV 2023. Code is available at\n https://github.com/zhengchen1999/DAT"},{"id":"http://arxiv.org/abs/2306.08966v2","updated":"2023-08-11T04:55:40Z","published":"2023-06-15T09:01:33Z","title":"Training Multimedia Event Extraction With Generated Images and Captions","summary":" Contemporary news reporting increasingly features multimedia content,\nmotivating research on multimedia event extraction. However, the task lacks\nannotated multimodal training data and artificially generated training data\nsuffer from distribution shift from real-world data. 
In this paper, we propose\nCross-modality Augmented Multimedia Event Learning (CAMEL), which successfully\nutilizes artificially generated multimodal training data and achieves\nstate-of-the-art performance. We start with two labeled unimodal datasets in\ntext and image respectively, and generate the missing modality using\noff-the-shelf image generators like Stable Diffusion and image captioners like\nBLIP. After that, we train the network on the resultant multimodal datasets. In\norder to learn robust features that are effective across domains, we devise an\niterative and gradual training strategy. Substantial experiments show that\nCAMEL surpasses state-of-the-art (SOTA) baselines on the M2E2 benchmark. On\nmultimedia events in particular, we outperform the prior SOTA by 4.2% F1 on\nevent mention identification and by 9.8% F1 on argument identification, which\nindicates that CAMEL learns synergistic representations from the two\nmodalities. Our work demonstrates a recipe to unleash the power of synthetic\ntraining data in structured prediction.\n","authors":["Zilin Du","Yunxin Li","Xu Guo","Yidan Sun","Boyang Li"],"pdf_url":"https://arxiv.org/pdf/2306.08966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05938v1","updated":"2023-08-11T04:42:10Z","published":"2023-08-11T04:42:10Z","title":"FoodSAM: Any Food Segmentation","summary":" In this paper, we explore the zero-shot capability of the Segment Anything\nModel (SAM) for food image segmentation. To address the lack of class-specific\ninformation in SAM-generated masks, we propose a novel framework, called\nFoodSAM. This innovative approach integrates the coarse semantic mask with\nSAM-generated masks to enhance semantic segmentation quality. Besides, we\nrecognize that the ingredients in food can be supposed as independent\nindividuals, which motivated us to perform instance segmentation on food\nimages. Furthermore, FoodSAM extends its zero-shot capability to encompass\npanoptic segmentation by incorporating an object detector, which renders\nFoodSAM to effectively capture non-food object information. Drawing inspiration\nfrom the recent success of promptable segmentation, we also extend FoodSAM to\npromptable segmentation, supporting various prompt variants. Consequently,\nFoodSAM emerges as an all-encompassing solution capable of segmenting food\nitems at multiple levels of granularity. Remarkably, this pioneering framework\nstands as the first-ever work to achieve instance, panoptic, and promptable\nsegmentation on food images. Extensive experiments demonstrate the feasibility\nand impressing performance of FoodSAM, validating SAM's potential as a\nprominent and influential tool within the domain of food image segmentation. We\nrelease our code at https://github.com/jamesjg/FoodSAM.\n","authors":["Xing Lan","Jiayi Lyu","Hanyu Jiang","Kun Dong","Zehai Niu","Yi Zhang","Jian Xue"],"pdf_url":"https://arxiv.org/pdf/2308.05938v1.pdf","comment":"Code is available at https://github.com/jamesjg/FoodSAM"},{"id":"http://arxiv.org/abs/2208.09668v3","updated":"2023-08-11T04:40:10Z","published":"2022-08-20T12:23:32Z","title":"Generalised Co-Salient Object Detection","summary":" We propose a new setting that relaxes an assumption in the conventional\nCo-Salient Object Detection (CoSOD) setting by allowing the presence of \"noisy\nimages\" which do not show the shared co-salient object. We call this new\nsetting Generalised Co-Salient Object Detection (GCoSOD). 
We propose a novel\nrandom sampling based Generalised CoSOD Training (GCT) strategy to distill the\nawareness of inter-image absence of co-salient objects into CoSOD models. It\nemploys a Diverse Sampling Self-Supervised Learning (DS3L) that, in addition to\nthe provided supervised co-salient label, introduces additional self-supervised\nlabels for noisy images (being null, that no co-salient object is present).\nFurther, the random sampling process inherent in GCT enables the generation of\na high-quality uncertainty map highlighting potential false-positive\npredictions at instance level. To evaluate the performance of CoSOD models\nunder the GCoSOD setting, we propose two new testing datasets, namely\nCoCA-Common and CoCA-Zero, where a common salient object is partially present\nin the former and completely absent in the latter. Extensive experiments\ndemonstrate that our proposed method significantly improves the performance of\nCoSOD models in terms of the performance under the GCoSOD setting as well as\nthe model calibration degrees.\n","authors":["Jiawei Liu","Jing Zhang","Ruikai Cui","Kaihao Zhang","Weihao Li","Nick Barnes"],"pdf_url":"https://arxiv.org/pdf/2208.09668v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05483v2","updated":"2023-08-11T04:28:51Z","published":"2022-09-12T06:14:04Z","title":"Self-Supervised Coordinate Projection Network for Sparse-View Computed\n Tomography","summary":" In the present work, we propose a Self-supervised COordinate Projection\nnEtwork (SCOPE) to reconstruct the artifacts-free CT image from a single SV\nsinogram by solving the inverse tomography imaging problem. Compared with\nrecent related works that solve similar problems using implicit neural\nrepresentation network (INR), our essential contribution is an effective and\nsimple re-projection strategy that pushes the tomography image reconstruction\nquality over supervised deep learning CT reconstruction works. The proposed\nstrategy is inspired by the simple relationship between linear algebra and\ninverse problems. To solve the under-determined linear equation system, we\nfirst introduce INR to constrain the solution space via image continuity prior\nand achieve a rough solution. And secondly, we propose to generate a dense view\nsinogram that improves the rank of the linear equation system and produces a\nmore stable CT image solution space. Our experiment results demonstrate that\nthe re-projection strategy significantly improves the image reconstruction\nquality (+3 dB for PSNR at least). Besides, we integrate the recent hash\nencoding into our SCOPE model, which greatly accelerates the model training.\nFinally, we evaluate SCOPE in parallel and fan X-ray beam SVCT reconstruction\ntasks. Experimental results indicate that the proposed SCOPE model outperforms\ntwo latest INR-based methods and two well-popular supervised DL methods\nquantitatively and qualitatively.\n","authors":["Qing Wu","Ruimin Feng","Hongjiang Wei","Jingyi Yu","Yuyao Zhang"],"pdf_url":"https://arxiv.org/pdf/2209.05483v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2308.05932v1","updated":"2023-08-11T04:27:29Z","published":"2023-08-11T04:27:29Z","title":"Generalizing Event-Based Motion Deblurring in Real-World Scenarios","summary":" Event-based motion deblurring has shown promising results by exploiting\nlow-latency events. However, current approaches are limited in their practical\nusage, as they assume the same spatial resolution of inputs and specific\nblurriness distributions. 
This work addresses these limitations and aims to\ngeneralize the performance of event-based deblurring in real-world scenarios.\nWe propose a scale-aware network that allows flexible input spatial scales and\nenables learning from different temporal scales of motion blur. A two-stage\nself-supervised learning scheme is then developed to fit the real-world data\ndistribution. By utilizing the relativity of blurriness, our approach\nefficiently ensures the restored brightness and structure of latent images and\nfurther generalizes deblurring performance to handle varying spatial and\ntemporal scales of motion blur in a self-distillation manner. Our method is\nextensively evaluated, demonstrating remarkable performance, and we also\nintroduce a real-world dataset consisting of multi-scale blurry frames and\nevents to facilitate research in event-based deblurring.\n","authors":["Xiang Zhang","Lei Yu","Wen Yang","Jianzhuang Liu","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2308.05932v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2305.03678v3","updated":"2023-08-11T04:23:29Z","published":"2023-05-05T16:48:45Z","title":"Towards Segment Anything Model (SAM) for Medical Image Segmentation: A\n Survey","summary":" Due to the flexibility of prompting, foundation models have become the\ndominant force in the domains of natural language processing and image\ngeneration. With the recent introduction of the Segment Anything Model (SAM),\nthe prompt-driven paradigm has entered the realm of image segmentation,\nbringing with it a range of previously unexplored capabilities. However, it\nremains unclear whether it can be applied to medical image segmentation due\nto the significant differences between natural images and medical images. In\nthis work, we summarize recent efforts to extend the success of SAM to medical\nimage segmentation tasks, including both empirical benchmarking and\nmethodological adaptations, and discuss potential future directions for SAM in\nmedical image segmentation. Although directly applying SAM to medical image\nsegmentation cannot obtain satisfying performance on multi-modal and\nmulti-target medical datasets, many insights are drawn to guide future research\nto develop foundation models for medical image analysis. To facilitate future\nresearch, we maintain an active repository that contains an up-to-date paper list\nand open-source project summary at https://github.com/YichiZhang98/SAM4MIS.\n","authors":["Yichi Zhang","Rushi Jiao"],"pdf_url":"https://arxiv.org/pdf/2305.03678v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06564v3","updated":"2023-08-11T04:17:56Z","published":"2023-05-11T04:43:10Z","title":"Undercover Deepfakes: Detecting Fake Segments in Videos","summary":" The recent renaissance in generative models, driven primarily by the advent\nof diffusion models and iterative improvement in GAN methods, has enabled many\ncreative applications. However, each advancement is also accompanied by a rise\nin the potential for misuse. In the arena of deepfake generation, this is a\nkey societal issue. In particular, the ability to modify segments of videos\nusing such generative techniques creates a new paradigm of deepfakes which are\nmostly real videos altered slightly to distort the truth. This paradigm has been\nunder-explored by the current deepfake detection methods in the academic\nliterature. 
In this paper, we present a deepfake detection method that can\naddress this issue by performing deepfake prediction at the frame and video\nlevels. To facilitate testing our method, we prepared a new benchmark dataset\nwhere videos have both real and fake frame sequences with very subtle\ntransitions. We provide a benchmark on the proposed dataset with our detection\nmethod which utilizes the Vision Transformer based on Scaling and Shifting to\nlearn spatial features, and a Timeseries Transformer to learn temporal features\nof the videos to help facilitate the interpretation of possible deepfakes.\nExtensive experiments on a variety of deepfake generation methods show\nexcellent results by the proposed method on temporal segmentation and classical\nvideo-level predictions as well. In particular, the paradigm we address will\nform a powerful tool for the moderation of deepfakes, where human oversight can\nbe better targeted to the parts of videos suspected of being deepfakes. All\nexperiments can be reproduced at: https://t.ly/\\_bOh9.\n","authors":["Sanjay Saha","Rashindrie Perera","Sachith Seneviratne","Tamasha Malepathirana","Sanka Rasnayaka","Deshani Geethika","Terence Sim","Saman Halgamuge"],"pdf_url":"https://arxiv.org/pdf/2305.06564v3.pdf","comment":"ICCV 2023 Workshop and Challenge on DeepFake Analysis and Detection"},{"id":"http://arxiv.org/abs/2306.15932v2","updated":"2023-08-11T04:17:11Z","published":"2023-06-28T05:33:11Z","title":"NIPD: A Federated Learning Person Detection Benchmark Based on\n Real-World Non-IID Data","summary":" Federated learning (FL), a privacy-preserving distributed machine learning,\nhas been rapidly applied in wireless communication networks. FL enables\nInternet of Things (IoT) clients to obtain well-trained models while preventing\nprivacy leakage. Person detection can be deployed on edge devices with limited\ncomputing power if combined with FL to process the video data directly at the\nedge. However, due to the different hardware and deployment scenarios of\ndifferent cameras, the data collected by the camera present non-independent and\nidentically distributed (non-IID), and the global model derived from FL\naggregation is less effective. Meanwhile, existing research lacks public data\nset for real-world FL object detection, which is not conducive to studying the\nnon-IID problem on IoT cameras. Therefore, we open source a non-IID IoT person\ndetection (NIPD) data set, which is collected from five different cameras. To\nour knowledge, this is the first true device-based non-IID person detection\ndata set. Based on this data set, we explain how to establish a FL experimental\nplatform and provide a benchmark for non-IID person detection. NIPD is expected\nto promote the application of FL and the security of smart city.\n","authors":["Kangning Yin","Zhen Ding","Zhihua Dong","Dongsheng Chen","Jie Fu","Xinhui Ji","Guangqiang Yin","Zhiguo Wang"],"pdf_url":"https://arxiv.org/pdf/2306.15932v2.pdf","comment":"8 pages, 5 figures, 3 tables, FL-IJCAI 23 conference"},{"id":"http://arxiv.org/abs/2308.05925v1","updated":"2023-08-11T04:01:13Z","published":"2023-08-11T04:01:13Z","title":"CaPhy: Capturing Physical Properties for Animatable Human Avatars","summary":" We present CaPhy, a novel method for reconstructing animatable human avatars\nwith realistic dynamic properties for clothing. Specifically, we aim for\ncapturing the geometric and physical properties of the clothing from real\nobservations. 
This allows us to apply novel poses to the human avatar with\nphysically correct deformations and wrinkles of the clothing. To this end, we\ncombine unsupervised training with physics-based losses and 3D-supervised\ntraining using scanned data to reconstruct a dynamic model of clothing that is\nphysically realistic and conforms to the human scans. We also optimize the\nphysical parameters of the underlying physical model from the scans by\nintroducing gradient constraints of the physics-based losses. In contrast to\nprevious work on 3D avatar reconstruction, our method is able to generalize to\nnovel poses with realistic dynamic cloth deformations. Experiments on several\nsubjects demonstrate that our method can estimate the physical properties of\nthe garments, resulting in superior quantitative and qualitative results\ncompared with previous methods.\n","authors":["Zhaoqi Su","Liangxiao Hu","Siyou Lin","Hongwen Zhang","Shengping Zhang","Justus Thies","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2308.05925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05921v1","updated":"2023-08-11T03:22:33Z","published":"2023-08-11T03:22:33Z","title":"BATINet: Background-Aware Text to Image Synthesis and Manipulation\n Network","summary":" Background-Induced Text2Image (BIT2I) aims to generate foreground content\naccording to the text on the given background image. Most studies focus on\ngenerating high-quality foreground content, although they ignore the\nrelationship between the two contents. In this study, we analyzed a novel\nBackground-Aware Text2Image (BAT2I) task in which the generated content matches\nthe input background. We proposed a Background-Aware Text to Image synthesis\nand manipulation Network (BATINet), which contains two key components: Position\nDetect Network (PDN) and Harmonize Network (HN). The PDN detects the most\nplausible position of the text-relevant object in the background image. The HN\nharmonizes the generated content referring to background style information.\nFinally, we reconstructed the generation network, which consists of the\nmulti-GAN and attention module to match more user preferences. Moreover, we can\napply BATINet to text-guided image manipulation. It solves the most challenging\ntask of manipulating the shape of an object. We demonstrated through\nqualitative and quantitative evaluations on the CUB dataset that the proposed\nmodel outperforms other state-of-the-art methods.\n","authors":["Ryugo Morita","Zhiqiang Zhang","Jinjia Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.05921v1.pdf","comment":"Accepted to ICIP2023"},{"id":"http://arxiv.org/abs/2111.11011v4","updated":"2023-08-11T03:17:54Z","published":"2021-11-22T06:27:29Z","title":"CDistNet: Perceiving Multi-Domain Character Distance for Robust Text\n Recognition","summary":" The Transformer-based encoder-decoder framework is becoming popular in scene\ntext recognition, largely because it naturally integrates recognition clues\nfrom both visual and semantic domains. However, recent studies show that the\ntwo kinds of clues are not always well registered and therefore, feature and\ncharacter might be misaligned in difficult text (e.g., with a rare shape). As a\nresult, constraints such as character position are introduced to alleviate this\nproblem. Despite certain success, visual and semantic are still separately\nmodeled and they are merely loosely associated. 
In this paper, we propose a\nnovel module called Multi-Domain Character Distance Perception (MDCDP) to\nestablish a visually and semantically related position embedding. MDCDP uses\nthe position embedding to query both visual and semantic features following the\ncross-attention mechanism. The two kinds of clues are fused into the position\nbranch, generating a content-aware embedding that well perceives character\nspacing and orientation variants, character semantic affinities, and clues\ntying the two kinds of information. They are summarized as the multi-domain\ncharacter distance. We develop CDistNet that stacks multiple MDCDPs to guide a\ngradually precise distance modeling. Thus, the feature-character alignment is\nwell built even various recognition difficulties are presented. We verify\nCDistNet on ten challenging public datasets and two series of augmented\ndatasets created by ourselves. The experiments demonstrate that CDistNet\nperforms highly competitively. It not only ranks top-tier in standard\nbenchmarks, but also outperforms recent popular methods by obvious margins on\nreal and augmented datasets presenting severe text deformation, poor linguistic\nsupport, and rare character layouts. Code is available at\nhttps://github.com/simplify23/CDistNet.\n","authors":["Tianlun Zheng","Zhineng Chen","Shancheng Fang","Hongtao Xie","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2111.11011v4.pdf","comment":"Paper accepted for publication at IJCV 2023"},{"id":"http://arxiv.org/abs/2305.14677v2","updated":"2023-08-11T03:11:41Z","published":"2023-05-24T03:33:30Z","title":"Optimal Linear Subspace Search: Learning to Construct Fast and\n High-Quality Schedulers for Diffusion Models","summary":" In recent years, diffusion models have become the most popular and powerful\nmethods in the field of image synthesis, even rivaling human artists in\nartistic creativity. However, the key issue currently limiting the application\nof diffusion models is its extremely slow generation process. Although several\nmethods were proposed to speed up the generation process, there still exists a\ntrade-off between efficiency and quality. In this paper, we first provide a\ndetailed theoretical and empirical analysis of the generation process of the\ndiffusion models based on schedulers. We transform the designing problem of\nschedulers into the determination of several parameters, and further transform\nthe accelerated generation process into an expansion process of the linear\nsubspace. Based on these analyses, we consequently propose a novel method\ncalled Optimal Linear Subspace Search (OLSS), which accelerates the generation\nprocess by searching for the optimal approximation process of the complete\ngeneration process in the linear subspaces spanned by latent variables. OLSS is\nable to generate high-quality images with a very small number of steps. To\ndemonstrate the effectiveness of our method, we conduct extensive comparative\nexperiments on open-source diffusion models. Experimental results show that\nwith a given number of steps, OLSS can significantly improve the quality of\ngenerated images. 
Using an NVIDIA A100 GPU, we make it possible to generate a\nhigh-quality image by Stable Diffusion within only one second without other\noptimization techniques.\n","authors":["Zhongjie Duan","Chengyu Wang","Cen Chen","Jun Huang","Weining Qian"],"pdf_url":"https://arxiv.org/pdf/2305.14677v2.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.11219v2","updated":"2023-08-11T03:07:39Z","published":"2023-03-20T15:50:00Z","title":"NeTO:Neural Reconstruction of Transparent Objects with Self-Occlusion\n Aware Refraction-Tracing","summary":" We present a novel method, called NeTO, for capturing 3D geometry of solid\ntransparent objects from 2D images via volume rendering. Reconstructing\ntransparent objects is a very challenging task, which is ill-suited for\ngeneral-purpose reconstruction techniques due to the specular light transport\nphenomena. Although existing refraction-tracing based methods, designed\nspecially for this task, achieve impressive results, they still suffer from\nunstable optimization and loss of fine details, since the explicit surface\nrepresentation they adopted is difficult to be optimized, and the\nself-occlusion problem is ignored for refraction-tracing. In this paper, we\npropose to leverage implicit Signed Distance Function (SDF) as surface\nrepresentation, and optimize the SDF field via volume rendering with a\nself-occlusion aware refractive ray tracing. The implicit representation\nenables our method to be capable of reconstructing high-quality reconstruction\neven with a limited set of images, and the self-occlusion aware strategy makes\nit possible for our method to accurately reconstruct the self-occluded regions.\nExperiments show that our method achieves faithful reconstruction results and\noutperforms prior works by a large margin. Visit our project page at\n\\url{https://www.xxlong.site/NeTO/}\n","authors":["Zongcheng Li","Xiaoxiao Long","Yusen Wang","Tuo Cao","Wenping Wang","Fei Luo","Chunxia Xiao"],"pdf_url":"https://arxiv.org/pdf/2303.11219v2.pdf","comment":"Experiments involving sparse views have some flaws, mainly including\n Figure 1 in the introduction, Figure 7 and Table 1 in the experiments. In\n order to maintain correctness and fairness, we would like to retract the\n paper first"},{"id":"http://arxiv.org/abs/2308.05920v1","updated":"2023-08-11T03:07:31Z","published":"2023-08-11T03:07:31Z","title":"Semantics2Hands: Transferring Hand Motion Semantics between Avatars","summary":" Human hands, the primary means of non-verbal communication, convey intricate\nsemantics in various scenarios. Due to the high sensitivity of individuals to\nhand motions, even minor errors in hand motions can significantly impact the\nuser experience. Real applications often involve multiple avatars with varying\nhand shapes, highlighting the importance of maintaining the intricate semantics\nof hand motions across the avatars. Therefore, this paper aims to transfer the\nhand motion semantics between diverse avatars based on their respective hand\nmodels. To address this problem, we introduce a novel anatomy-based semantic\nmatrix (ASM) that encodes the semantics of hand motions. The ASM quantifies the\npositions of the palm and other joints relative to the local frame of the\ncorresponding joint, enabling precise retargeting of hand motions.\nSubsequently, we obtain a mapping function from the source ASM to the target\nhand joint rotations by employing an anatomy-based semantics reconstruction\nnetwork (ASRN). 
We train the ASRN using a semi-supervised learning strategy on\nthe Mixamo and InterHand2.6M datasets. We evaluate our method in intra-domain\nand cross-domain hand motion retargeting tasks. The qualitative and\nquantitative results demonstrate the significant superiority of our ASRN over\nthe state-of-the-arts.\n","authors":["Zijie Ye","Jia Jia","Junliang Xing"],"pdf_url":"https://arxiv.org/pdf/2308.05920v1.pdf","comment":"Accepted to MM 2023, 9 pages, 10 figures. Project page:\n https://abcyzj.github.io/S2H/"},{"id":"http://arxiv.org/abs/2209.10510v3","updated":"2023-08-11T03:07:28Z","published":"2022-09-21T17:15:58Z","title":"Learning to Relight Portrait Images via a Virtual Light Stage and\n Synthetic-to-Real Adaptation","summary":" Given a portrait image of a person and an environment map of the target\nlighting, portrait relighting aims to re-illuminate the person in the image as\nif the person appeared in an environment with the target lighting. To achieve\nhigh-quality results, recent methods rely on deep learning. An effective\napproach is to supervise the training of deep neural networks with a\nhigh-fidelity dataset of desired input-output pairs, captured with a light\nstage. However, acquiring such data requires an expensive special capture rig\nand time-consuming efforts, limiting access to only a few resourceful\nlaboratories. To address the limitation, we propose a new approach that can\nperform on par with the state-of-the-art (SOTA) relighting methods without\nrequiring a light stage. Our approach is based on the realization that a\nsuccessful relighting of a portrait image depends on two conditions. First, the\nmethod needs to mimic the behaviors of physically-based relighting. Second, the\noutput has to be photorealistic. To meet the first condition, we propose to\ntrain the relighting network with training data generated by a virtual light\nstage that performs physically-based rendering on various 3D synthetic humans\nunder different environment maps. To meet the second condition, we develop a\nnovel synthetic-to-real approach to bring photorealism to the relighting\nnetwork output. In addition to achieving SOTA results, our approach offers\nseveral advantages over the prior methods, including controllable glares on\nglasses and more temporally-consistent results for relighting videos.\n","authors":["Yu-Ying Yeh","Koki Nagano","Sameh Khamis","Jan Kautz","Ming-Yu Liu","Ting-Chun Wang"],"pdf_url":"https://arxiv.org/pdf/2209.10510v3.pdf","comment":"To appear in ACM Transactions on Graphics (SIGGRAPH Asia 2022). 21\n pages, 25 figures, 7 tables. Project page:\n https://research.nvidia.com/labs/dir/lumos/"},{"id":"http://arxiv.org/abs/2308.05911v1","updated":"2023-08-11T02:25:58Z","published":"2023-08-11T02:25:58Z","title":"Collaborative Tracking Learning for Frame-Rate-Insensitive Multi-Object\n Tracking","summary":" Multi-object tracking (MOT) at low frame rates can reduce computational,\nstorage and power overhead to better meet the constraints of edge devices. Many\nexisting MOT methods suffer from significant performance degradation in\nlow-frame-rate videos due to significant location and appearance changes\nbetween adjacent frames. To this end, we propose to explore collaborative\ntracking learning (ColTrack) for frame-rate-insensitive MOT in a query-based\nend-to-end manner. Multiple historical queries of the same target jointly track\nit with richer temporal descriptions. 
Meanwhile, we insert an information\nrefinement module between every two temporal blocking decoders to better fuse\ntemporal clues and refine features. Moreover, a tracking object consistency\nloss is proposed to guide the interaction between historical queries. Extensive\nexperimental results demonstrate that in high-frame-rate videos, ColTrack\nobtains higher performance than state-of-the-art methods on large-scale\ndatasets Dancetrack and BDD100K, and outperforms the existing end-to-end\nmethods on MOT17. More importantly, ColTrack has a significant advantage over\nstate-of-the-art methods in low-frame-rate videos, which allows it to obtain\nfaster processing speeds by reducing frame-rate requirements while maintaining\nhigher performance. Code will be released at\nhttps://github.com/yolomax/ColTrack\n","authors":["Yiheng Liu","Junta Wu","Yi Fu"],"pdf_url":"https://arxiv.org/pdf/2308.05911v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2301.11189v3","updated":"2023-08-11T02:21:27Z","published":"2023-01-26T15:55:43Z","title":"Improving Statistical Fidelity for Neural Image Compression with\n Implicit Local Likelihood Models","summary":" Lossy image compression aims to represent images in as few bits as possible\nwhile maintaining fidelity to the original. Theoretical results indicate that\noptimizing distortion metrics such as PSNR or MS-SSIM necessarily leads to a\ndiscrepancy in the statistics of original images from those of reconstructions,\nin particular at low bitrates, often manifested by the blurring of the\ncompressed images. Previous work has leveraged adversarial discriminators to\nimprove statistical fidelity. Yet these binary discriminators adopted from\ngenerative modeling tasks may not be ideal for image compression. In this\npaper, we introduce a non-binary discriminator that is conditioned on quantized\nlocal image representations obtained via VQ-VAE autoencoders. Our evaluations\non the CLIC2020, DIV2K and Kodak datasets show that our discriminator is more\neffective for jointly optimizing distortion (e.g., PSNR) and statistical\nfidelity (e.g., FID) than the PatchGAN of the state-of-the-art HiFiC model. On\nCLIC2020, we obtain the same FID as HiFiC with 30-40\\% fewer bits.\n","authors":["Matthew J. Muckley","Alaaeldin El-Nouby","Karen Ullrich","Hervé Jégou","Jakob Verbeek"],"pdf_url":"https://arxiv.org/pdf/2301.11189v3.pdf","comment":"Upload camera-ready to arXiv. Official version available at\n https://proceedings.mlr.press/v202/muckley23a.html"},{"id":"http://arxiv.org/abs/2308.02463v2","updated":"2023-08-11T02:19:33Z","published":"2023-08-04T17:00:38Z","title":"Towards Generalist Foundation Model for Radiology","summary":" In this study, we aim to initiate the development of Radiology Foundation\nModel, termed as RadFM.We consider the construction of foundational models from\nthe perspectives of data, model design, and evaluation thoroughly. Our\ncontribution can be concluded as follows: (i), we construct a large-scale\nMedical Multi-modal Dataset, MedMD, consisting of 16M 2D and 3D medical scans.\nTo the best of our knowledge, this is the first multi-modal dataset containing\n3D medical scans. (ii), We propose an architecture that enables visually\nconditioned generative pre-training, allowing for the integration of text input\ninterleaved with 2D or 3D medical scans to generate response for diverse\nradiologic tasks. 
The model was initially pre-trained on MedMD and subsequently\ndomain-specific fine-tuned on RadMD, a radiologic cleaned version of MedMD,\ncontaining 3M radiologic visual-language pairs. (iii), we propose a new\nevaluation benchmark that comprises five tasks, aiming to comprehensively\nassess the capability of foundation models in handling practical clinical\nproblems. Our experimental results confirm that RadFM significantly outperforms\nexisting multi-modal foundation models. The codes, data, and model checkpoint\nwill all be made publicly available to promote further research and development\nin the field.\n","authors":["Chaoyi Wu","Xiaoman Zhang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2308.02463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08058v3","updated":"2023-08-11T01:25:14Z","published":"2023-02-16T03:40:40Z","title":"Learning Non-Local Spatial-Angular Correlation for Light Field Image\n Super-Resolution","summary":" Exploiting spatial-angular correlation is crucial to light field (LF) image\nsuper-resolution (SR), but is highly challenging due to its non-local property\ncaused by the disparities among LF images. Although many deep neural networks\n(DNNs) have been developed for LF image SR and achieved continuously improved\nperformance, existing methods cannot well leverage the long-range\nspatial-angular correlation and thus suffer a significant performance drop when\nhandling scenes with large disparity variations. In this paper, we propose a\nsimple yet effective method to learn the non-local spatial-angular correlation\nfor LF image SR. In our method, we adopt the epipolar plane image (EPI)\nrepresentation to project the 4D spatial-angular correlation onto multiple 2D\nEPI planes, and then develop a Transformer network with repetitive\nself-attention operations to learn the spatial-angular correlation by modeling\nthe dependencies between each pair of EPI pixels. Our method can fully\nincorporate the information from all angular views while achieving a global\nreceptive field along the epipolar line. We conduct extensive experiments with\ninsightful visualizations to validate the effectiveness of our method.\nComparative results on five public datasets show that our method not only\nachieves state-of-the-art SR performance, but also performs robust to disparity\nvariations. Code is publicly available at\nhttps://github.com/ZhengyuLiang24/EPIT.\n","authors":["Zhengyu Liang","Yingqian Wang","Longguang Wang","Jungang Yang","Shilin Zhou","Yulan Guo"],"pdf_url":"https://arxiv.org/pdf/2302.08058v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.04402v3","updated":"2023-08-11T01:21:44Z","published":"2023-08-08T17:04:53Z","title":"Person Re-Identification without Identification via Event Anonymization","summary":" Wide-scale use of visual surveillance in public spaces puts individual\nprivacy at stake while increasing resource consumption (energy, bandwidth, and\ncomputation). Neuromorphic vision sensors (event-cameras) have been recently\nconsidered a valid solution to the privacy issue because they do not capture\ndetailed RGB visual information of the subjects in the scene. However, recent\ndeep learning architectures have been able to reconstruct images from event\ncameras with high fidelity, reintroducing a potential threat to privacy for\nevent-based vision applications. In this paper, we aim to anonymize\nevent-streams to protect the identity of human subjects against such image\nreconstruction attacks. 
To achieve this, we propose an end-to-end network\narchitecture jointly optimized for the twofold objective of preserving privacy\nand performing a downstream task such as person ReId. Our network learns to\nscramble events, enforcing the degradation of images recovered from the privacy\nattacker. In this work, we also bring to the community the first ever\nevent-based person ReId dataset gathered to evaluate the performance of our\napproach. We validate our approach with extensive experiments and report\nresults on the synthetic event data simulated from the publicly available\nSoftBio dataset and our proposed Event-ReId dataset.\n","authors":["Shafiq Ahmad","Pietro Morerio","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2308.04402v3.pdf","comment":"Accepted at International Conference on Computer Vision (ICCV), 2023"},{"id":"http://arxiv.org/abs/2308.05896v1","updated":"2023-08-11T01:11:46Z","published":"2023-08-11T01:11:46Z","title":"Semantic-embedded Similarity Prototype for Scene Recognition","summary":" Due to the high inter-class similarity caused by the complex composition\nwithin scenes and the co-existing objects across scenes, various studies have\nexplored object semantic knowledge within scenes to improve scene recognition.\nHowever, a resulting issue arises as semantic segmentation or object detection\ntechniques demand heavy computational power, thereby burdening the network\nconsiderably. This limitation often renders object-assisted approaches\nincompatible with edge devices. In contrast, this paper proposes a\nsemantic-based similarity prototype that assists the scene recognition network\nto achieve higher accuracy without increasing network parameters. It is simple\nand can be plug-and-played into existing pipelines. More specifically, a\nstatistical strategy is introduced to depict semantic knowledge in scenes as\nclass-level semantic representations. These representations are utilized to\nexplore inter-class correlations, ultimately constructing a similarity\nprototype. Furthermore, we propose two ways to use the similarity prototype to\nsupport network training from the perspective of gradient label softening and\nbatch-level contrastive loss, respectively. Comprehensive evaluations on\nmultiple benchmarks show that our similarity prototype enhances the performance\nof existing networks without adding any computational burden. Code and the\nstatistical similarity prototype will be available soon.\n","authors":["Chuanxin Song","Hanbo Wu","Xin Ma"],"pdf_url":"https://arxiv.org/pdf/2308.05896v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2210.17020v2","updated":"2023-08-11T00:47:30Z","published":"2022-10-31T02:25:38Z","title":"A Law of Data Separation in Deep Learning","summary":" While deep learning has enabled significant advances in many areas of\nscience, its black-box nature hinders architecture design for future artificial\nintelligence applications and interpretation for high-stakes decision makings.\nWe addressed this issue by studying the fundamental question of how deep neural\nnetworks process data in the intermediate layers. Our finding is a simple and\nquantitative law that governs how deep neural networks separate data according\nto class membership throughout all layers for classification. This law shows\nthat each layer improves data separation at a constant geometric rate, and its\nemergence is observed in a collection of network architectures and datasets\nduring training. 
This law offers practical guidelines for designing\narchitectures, improving model robustness and out-of-sample performance, as\nwell as interpreting the predictions.\n","authors":["Hangfeng He","Weijie J. Su"],"pdf_url":"https://arxiv.org/pdf/2210.17020v2.pdf","comment":"Accepted at PNAS"},{"id":"http://arxiv.org/abs/2201.07646v4","updated":"2023-08-11T00:13:54Z","published":"2022-01-19T15:23:46Z","title":"A Survey on Training Challenges in Generative Adversarial Networks for\n Biomedical Image Analysis","summary":" In biomedical image analysis, the applicability of deep learning methods is\ndirectly impacted by the quantity of image data available. This is due to deep\nlearning models requiring large image datasets to provide high-level\nperformance. Generative Adversarial Networks (GANs) have been widely utilized\nto address data limitations through the generation of synthetic biomedical\nimages. GANs consist of two models. The generator, a model that learns how to\nproduce synthetic images based on the feedback it receives. The discriminator,\na model that classifies an image as synthetic or real and provides feedback to\nthe generator. Throughout the training process, a GAN can experience several\ntechnical challenges that impede the generation of suitable synthetic imagery.\nFirst, the mode collapse problem whereby the generator either produces an\nidentical image or produces a uniform image from distinct input features.\nSecond, the non-convergence problem whereby the gradient descent optimizer\nfails to reach a Nash equilibrium. Thirdly, the vanishing gradient problem\nwhereby unstable training behavior occurs due to the discriminator achieving\noptimal classification performance resulting in no meaningful feedback being\nprovided to the generator. These problems result in the production of synthetic\nimagery that is blurry, unrealistic, and less diverse. To date, there has been\nno survey article outlining the impact of these technical challenges in the\ncontext of the biomedical imagery domain. This work presents a review and\ntaxonomy based on solutions to the training problems of GANs in the biomedical\nimaging domain. This survey highlights important challenges and outlines future\nresearch directions about the training of GANs in the domain of biomedical\nimagery.\n","authors":["Muhammad Muneeb Saad","Ruairi O'Reilly","Mubashir Husain Rehmani"],"pdf_url":"https://arxiv.org/pdf/2201.07646v4.pdf","comment":"Submitted to the AI Review Journal"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.06212v1","updated":"2023-08-11T16:30:44Z","published":"2023-08-11T16:30:44Z","title":"A Large Language Model Enhanced Conversational Recommender System","summary":" Conversational recommender systems (CRSs) aim to recommend high-quality items\nto users through a dialogue interface. It usually contains multiple sub-tasks,\nsuch as user preference elicitation, recommendation, explanation, and item\ninformation search. To develop effective CRSs, there are some challenges: 1)\nhow to properly manage sub-tasks; 2) how to effectively solve different\nsub-tasks; and 3) how to correctly generate responses that interact with users.\nRecently, Large Language Models (LLMs) have exhibited an unprecedented ability\nto reason and generate, presenting a new opportunity to develop more powerful\nCRSs. In this work, we propose a new LLM-based CRS, referred to as LLMCRS, to\naddress the above challenges. 
For sub-task management, we leverage the\nreasoning ability of LLM to effectively manage sub-tasks. For sub-task solving,\nwe combine the LLM with expert models of different sub-tasks to achieve\nenhanced performance. For response generation, we utilize the generation\nability of LLM as a language interface to better interact with users.\nSpecifically, LLMCRS divides the workflow into four stages: sub-task detection,\nmodel matching, sub-task execution, and response generation. LLMCRS also\ndesigns schema-based instruction, demonstration-based instruction, dynamic\nsub-task and model matching, and summary-based generation to instruct LLM to\ngenerate desired results in the workflow. Finally, to adapt LLM to\nconversational recommendations, we also propose to fine-tune LLM with\nreinforcement learning from CRS performance feedback, referred to as RLPF.\nExperimental results on benchmark datasets show that LLMCRS with RLPF\noutperforms the existing methods.\n","authors":["Yue Feng","Shuchang Liu","Zhenghai Xue","Qingpeng Cai","Lantao Hu","Peng Jiang","Kun Gai","Fei Sun"],"pdf_url":"https://arxiv.org/pdf/2308.06212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.12356v2","updated":"2023-08-11T16:15:52Z","published":"2022-08-25T22:10:18Z","title":"Lib-SibGMU -- A University Library Circulation Dataset for Recommender\n Systems Development","summary":" We open-source, under the CC BY 4.0 license, Lib-SibGMU - a university library\ncirculation dataset - for a wide research community, and benchmark major\nalgorithms for recommender systems on this dataset. For a recommender\narchitecture that consists of a vectorizer that turns the history of the books\nborrowed into a vector, and a neighborhood-based recommender, trained\nseparately, we show that using the fastText model as a vectorizer delivers\ncompetitive results.\n","authors":["Eduard Zubchuk","Mikhail Arhipkin","Dmitry Menshikov","Aleksandr Karaush","Nikolay Mikhaylovskiy"],"pdf_url":"https://arxiv.org/pdf/2208.12356v2.pdf","comment":"Dataset copyright discussion"},{"id":"http://arxiv.org/abs/2308.06144v1","updated":"2023-08-11T14:06:41Z","published":"2023-08-11T14:06:41Z","title":"Identification of the Relevance of Comments in Codes Using Bag of Words\n and Transformer Based Models","summary":" The Forum for Information Retrieval (FIRE) started a shared task this year\nfor classification of comments of different code segments. This is a binary text\nclassification task where the objective is to identify whether comments given\nfor certain code segments are relevant or not. The BioNLP-IISERB group at the\nIndian Institute of Science Education and Research Bhopal (IISERB) participated\nin this task and submitted five runs for five different models. The paper\npresents an overview of the models and other significant findings on the\ntraining corpus. The methods involve different feature engineering schemes and\ntext classification techniques. The performance of the classical bag of words\nmodel and transformer-based models was explored to identify significant\nfeatures from the given training corpus. We have explored different classifiers,\nviz., random forest, support vector machine and logistic regression using the\nbag of words model. Furthermore, pre-trained transformer-based models like\nBERT, RoBERTa and ALBERT were also used by fine-tuning them on the given\ntraining corpus. The performance of these models on the training\ncorpus was reported and the best five models were implemented on the given\ntest corpus. 
The empirical results show that the bag of words model outperforms\nthe transformer-based models; however, the performance of our runs is not\nparticularly strong on either the training or the test corpus. This paper also addresses the\nlimitations of the models and the scope for further improvement.\n","authors":["Sruthi S","Tanmay Basu"],"pdf_url":"https://arxiv.org/pdf/2308.06144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06091v1","updated":"2023-08-11T12:04:36Z","published":"2023-08-11T12:04:36Z","title":"Toward a Better Understanding of Loss Functions for Collaborative\n Filtering","summary":" Collaborative filtering (CF) is a pivotal technique in modern recommender\nsystems. The learning process of CF models typically consists of three\ncomponents: interaction encoder, loss function, and negative sampling. Although\nmany existing studies have proposed various CF models to design sophisticated\ninteraction encoders, recent work shows that simply reformulating the loss\nfunctions can achieve significant performance gains. This paper delves into\nanalyzing the relationship among existing loss functions. Our mathematical\nanalysis reveals that the previous loss functions can be interpreted as\nalignment and uniformity functions: (i) the alignment matches user and item\nrepresentations, and (ii) the uniformity disperses user and item distributions.\nInspired by this analysis, we propose a novel loss function, called Margin-aware\nAlignment and Weighted Uniformity (MAWU), that improves the design of alignment\nand uniformity considering the unique patterns of datasets. The key novelty\nof MAWU is two-fold: (i) margin-aware alignment (MA) mitigates\nuser/item-specific popularity biases, and (ii) weighted uniformity (WU) adjusts\nthe significance between user and item uniformities to reflect the inherent\ncharacteristics of datasets. Extensive experimental results show that MF and\nLightGCN equipped with MAWU are comparable or superior to state-of-the-art CF\nmodels with various loss functions on three public datasets.\n","authors":["Seongmin Park","Mincheol Yoon","Jae-woong Lee","Hogun Park","Jongwuk Lee"],"pdf_url":"https://arxiv.org/pdf/2308.06091v1.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2307.15464v3","updated":"2023-08-11T10:08:00Z","published":"2023-07-28T10:34:47Z","title":"Framework to Automatically Determine the Quality of Open Data Catalogs","summary":" Data catalogs play a crucial role in modern data-driven organizations by\nfacilitating the discovery, understanding, and utilization of diverse data\nassets. However, ensuring their quality and reliability is complex, especially\nin open and large-scale data environments. This paper proposes a framework to\nautomatically determine the quality of open data catalogs, addressing the need\nfor efficient and reliable quality assessment mechanisms. Our framework can\nanalyze various core quality dimensions, such as accuracy, completeness,\nconsistency, scalability, and timeliness; it also offers several alternatives for the\nassessment of compatibility and similarity across such catalogs, as well as an\nimplementation of a set of non-core quality dimensions such as provenance,\nreadability, and licensing. The goal is to empower data-driven organizations to\nmake informed decisions based on trustworthy and well-curated data assets. 
The\nsource code that illustrates our approach can be downloaded from\nhttps://www.github.com/jorge-martinez-gil/dataq/.\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.15464v3.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2308.06037v1","updated":"2023-08-11T09:32:58Z","published":"2023-08-11T09:32:58Z","title":"Deep Context Interest Network for Click-Through Rate Prediction","summary":" Click-Through Rate (CTR) prediction, estimating the probability of a user\nclicking on an item, is essential in industrial applications, such as online\nadvertising. Many works focus on user behavior modeling to improve CTR\nprediction performance. However, most of those methods only model users'\npositive interests from users' click items while ignoring the context\ninformation, which is the display items around the clicks, resulting in\ninferior performance. In this paper, we highlight the importance of context\ninformation on user behavior modeling and propose a novel model named Deep\nContext Interest Network (DCIN), which integrally models the click and its\ndisplay context to learn users' context-aware interests. DCIN consists of three\nkey modules: 1) Position-aware Context Aggregation Module (PCAM), which\nperforms aggregation of display items with an attention mechanism; 2)\nFeedback-Context Fusion Module (FCFM), which fuses the representation of clicks\nand display contexts through non-linear feature interaction; 3) Interest\nMatching Module (IMM), which activates interests related with the target item.\nMoreover, we provide our hands-on solution to implement our DCIN model on\nlarge-scale industrial systems. The significant improvements in both offline\nand online evaluations demonstrate the superiority of our proposed DCIN method.\nNotably, DCIN has been deployed on our online advertising system serving the\nmain traffic, which brings 1.5% CTR and 1.5% RPM lift.\n","authors":["Xuyang Hou","Zhe Wang","Qi Liu","Tan Qu","Jia Cheng","Jun Lei"],"pdf_url":"https://arxiv.org/pdf/2308.06037v1.pdf","comment":"accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2205.08776v3","updated":"2023-08-11T09:31:09Z","published":"2022-05-18T07:55:33Z","title":"AdaMCT: Adaptive Mixture of CNN-Transformer for Sequential\n Recommendation","summary":" Sequential recommendation (SR) aims to model users dynamic preferences from a\nseries of interactions. A pivotal challenge in user modeling for SR lies in the\ninherent variability of user preferences. An effective SR model is expected to\ncapture both the long-term and short-term preferences exhibited by users,\nwherein the former can offer a comprehensive understanding of stable interests\nthat impact the latter. To more effectively capture such information, we\nincorporate locality inductive bias into the Transformer by amalgamating its\nglobal attention mechanism with a local convolutional filter, and adaptively\nascertain the mixing importance on a personalized basis through layer-aware\nadaptive mixture units, termed as AdaMCT. Moreover, as users may repeatedly\nbrowse potential purchases, it is expected to consider multiple relevant items\nconcurrently in long-/short-term preferences modeling. Given that softmax-based\nattention may promote unimodal activation, we propose the Squeeze-Excitation\nAttention (with sigmoid activation) into SR models to capture multiple\npertinent items (keys) simultaneously. 
Extensive experiments on three widely\nemployed benchmarks substantiate the effectiveness and efficiency of our\nproposed approach. Source code is available at\nhttps://github.com/juyongjiang/AdaMCT.\n","authors":["Juyong Jiang","Peiyan Zhang","Yingtao Luo","Chaozhuo Li","Jae Boum Kim","Kai Zhang","Senzhang Wang","Xing Xie","Sunghun Kim"],"pdf_url":"https://arxiv.org/pdf/2205.08776v3.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2308.06018v1","updated":"2023-08-11T08:48:56Z","published":"2023-08-11T08:48:56Z","title":"Designing a User Contextual Profile Ontology: A Focus on the Vehicle\n Sales Domain","summary":" In the digital age, it is crucial to understand and tailor experiences for\nusers interacting with systems and applications. This requires the creation of\nuser contextual profiles that combine user profiles with contextual\ninformation. However, there is a lack of research on the integration of\ncontextual information with different user profiles. This study aims to address\nthis gap by designing a user contextual profile ontology that considers both\nuser profiles and contextual information on each profile. Specifically, we\npresent a design and development of the user contextual profile ontology with a\nfocus on the vehicle sales domain. Our designed ontology serves as a structural\nfoundation for standardizing the representation of user profiles and contextual\ninformation, enhancing the system's ability to capture user preferences and\ncontextual information of the user accurately. Moreover, we illustrate a case\nstudy using the User Contextual Profile Ontology in generating personalized\nrecommendations for vehicle sales domain.\n","authors":["Ngoc Luyen Le","Marie-Hélène Abel","Philippe Gouspillou"],"pdf_url":"https://arxiv.org/pdf/2308.06018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01118v2","updated":"2023-08-11T07:43:27Z","published":"2023-08-02T12:58:11Z","title":"A Survey on Popularity Bias in Recommender Systems","summary":" Recommender systems help people find relevant content in a personalized way.\nOne main promise of such systems is that they are able to increase the\nvisibility of items in the long tail, i.e., the lesser-known items in a\ncatalogue. Existing research, however, suggests that in many situations today's\nrecommendation algorithms instead exhibit a popularity bias, meaning that they\noften focus on rather popular items in their recommendations. Such a bias may\nnot only lead to limited value of the recommendations for consumers and\nproviders in the short run, but it may also cause undesired reinforcement\neffects over time. In this paper, we discuss the potential reasons for\npopularity bias and we review existing approaches to detect, quantify and\nmitigate popularity bias in recommender systems. Our survey therefore includes\nboth an overview of the computational metrics used in the literature as well as\na review of the main technical approaches to reduce the bias. 
We furthermore\ncritically discuss today's literature, where we observe that the research is\nalmost entirely based on computational experiments and on certain assumptions\nregarding the practical effects of including long-tail items in the\nrecommendations.\n","authors":["Anastasiia Klimashevskaia","Dietmar Jannach","Mehdi Elahi","Christoph Trattner"],"pdf_url":"https://arxiv.org/pdf/2308.01118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05972v1","updated":"2023-08-11T07:09:55Z","published":"2023-08-11T07:09:55Z","title":"Augmented Negative Sampling for Collaborative Filtering","summary":" Negative sampling is essential for implicit-feedback-based collaborative\nfiltering, which is used to constitute negative signals from massive unlabeled\ndata to guide supervised learning. The state-of-the-art idea is to utilize hard\nnegative samples that carry more useful information to form a better decision\nboundary. To balance efficiency and effectiveness, the vast majority of\nexisting methods follow the two-pass approach, in which the first pass samples\na fixed number of unobserved items by a simple static distribution and then the\nsecond pass selects the final negative items using a more sophisticated\nnegative sampling strategy. However, selecting negative samples from the\noriginal items is inherently restricted, and thus may not be able to contrast\npositive samples well. In this paper, we confirm this observation via\nexperiments and introduce two limitations of existing solutions: ambiguous trap\nand information discrimination. Our response to such limitations is to\nintroduce augmented negative samples. This direction renders a substantial\ntechnical challenge because constructing unconstrained negative samples may\nintroduce excessive noise that distorts the decision boundary. To this end, we\nintroduce a novel generic augmented negative sampling paradigm and provide a\nconcrete instantiation. First, we disentangle hard and easy factors of negative\nitems. Next, we generate new candidate negative samples by augmenting only the\neasy factors in a regulated manner: the direction and magnitude of the\naugmentation are carefully calibrated. Finally, we design an advanced negative\nsampling strategy to identify the final augmented negative samples, which\nconsiders not only the score function used in existing methods but also a new\nmetric called augmentation gain. Extensive experiments on real-world datasets\ndemonstrate that our method significantly outperforms state-of-the-art\nbaselines.\n","authors":["Yuhan Zhao","Rui Chen","Riwei Lai","Qilong Han","Hongtao Song","Li Chen"],"pdf_url":"https://arxiv.org/pdf/2308.05972v1.pdf","comment":"11 pages, 16 figures,"},{"id":"http://arxiv.org/abs/2308.05935v1","updated":"2023-08-11T04:36:26Z","published":"2023-08-11T04:36:26Z","title":"LittleMu: Deploying an Online Virtual Teaching Assistant via\n Heterogeneous Sources Integration and Chain of Teach Prompts","summary":" Teaching assistants have played essential roles in the long history of\neducation. However, few MOOC platforms are providing human or virtual teaching\nassistants to support learning for massive online students due to the\ncomplexity of real-world online education scenarios and the lack of training\ndata. In this paper, we present a virtual MOOC teaching assistant, LittleMu\nwith minimum labeled training data, to provide question answering and chit-chat\nservices. 
Consisting of two interactive modules of heterogeneous retrieval and\nlanguage model prompting, LittleMu first integrates structural, semi- and\nunstructured knowledge sources to support accurate answers for a wide range of\nquestions. Then, we design delicate demonstrations named \"Chain of Teach\"\nprompts to exploit the large-scale pre-trained model to handle complex\nuncollected questions. Except for question answering, we develop other\neducational services such as knowledge-grounded chit-chat. We test the system's\nperformance via both offline evaluation and online deployment. Since May 2020,\nour LittleMu system has served over 80,000 users with over 300,000 queries from\nover 500 courses on XuetangX MOOC platform, which continuously contributes to a\nmore convenient and fair education. Our code, services, and dataset will be\navailable at https://github.com/THU-KEG/VTA.\n","authors":["Shangqing Tu","Zheyuan Zhang","Jifan Yu","Chunyang Li","Siyu Zhang","Zijun Yao","Lei Hou","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2308.05935v1.pdf","comment":"7 pages, 3 figures, Accepted by CIKM 23"},{"id":"http://arxiv.org/abs/2211.00732v3","updated":"2023-08-11T04:00:59Z","published":"2022-10-28T12:54:30Z","title":"Kuaipedia: a Large-scale Multi-modal Short-video Encyclopedia","summary":" Online encyclopedias, such as Wikipedia, have been well-developed and\nresearched in the last two decades. One can find any attributes or other\ninformation of a wiki item on a wiki page edited by a community of volunteers.\nHowever, the traditional text, images and tables can hardly express some\naspects of an wiki item. For example, when we talk about ``Shiba Inu'', one may\ncare more about ``How to feed it'' or ``How to train it not to protect its\nfood''. Currently, short-video platforms have become a hallmark in the online\nworld. Whether you're on TikTok, Instagram, Kuaishou, or YouTube Shorts,\nshort-video apps have changed how we consume and create content today. Except\nfor producing short videos for entertainment, we can find more and more authors\nsharing insightful knowledge widely across all walks of life. These short\nvideos, which we call knowledge videos, can easily express any aspects (e.g.\nhair or how-to-feed) consumers want to know about an item (e.g. Shiba Inu), and\nthey can be systematically analyzed and organized like an online encyclopedia.\nIn this paper, we propose Kuaipedia, a large-scale multi-modal encyclopedia\nconsisting of items, aspects, and short videos lined to them, which was\nextracted from billions of videos of Kuaishou (Kwai), a well-known short-video\nplatform in China. We first collected items from multiple sources and mined\nuser-centered aspects from millions of users' queries to build an item-aspect\ntree. Then we propose a new task called ``multi-modal item-aspect linking'' as\nan expansion of ``entity linking'' to link short videos into item-aspect pairs\nand build the whole short-video encyclopedia. Intrinsic evaluations show that\nour encyclopedia is of large scale and highly accurate. 
We also conduct\nsufficient extrinsic experiments to show how Kuaipedia can help fundamental\napplications such as entity typing and entity linking.\n","authors":["Haojie Pan","Zepeng Zhai","Yuzhou Zhang","Ruiji Fu","Ming Liu","Yangqiu Song","Zhongyuan Wang","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2211.00732v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05902v1","updated":"2023-08-11T01:52:23Z","published":"2023-08-11T01:52:23Z","title":"LTP-MMF: Towards Long-term Provider Max-min Fairness Under\n Recommendation Feedback Loops","summary":" Multi-stakeholder recommender systems involve various roles, such as users,\nproviders. Previous work pointed out that max-min fairness (MMF) is a better\nmetric to support weak providers. However, when considering MMF, the features\nor parameters of these roles vary over time, how to ensure long-term provider\nMMF has become a significant challenge. We observed that recommendation\nfeedback loops (named RFL) will influence the provider MMF greatly in the long\nterm. RFL means that recommender system can only receive feedback on exposed\nitems from users and update recommender models incrementally based on this\nfeedback. When utilizing the feedback, the recommender model will regard\nunexposed item as negative. In this way, tail provider will not get the\nopportunity to be exposed, and its items will always be considered as negative\nsamples. Such phenomenons will become more and more serious in RFL. To\nalleviate the problem, this paper proposes an online ranking model named\nLong-Term Provider Max-min Fairness (named LTP-MMF). Theoretical analysis shows\nthat the long-term regret of LTP-MMF enjoys a sub-linear bound. Experimental\nresults on three public recommendation benchmarks demonstrated that LTP-MMF can\noutperform the baselines in the long term.\n","authors":["Chen Xu","Xiaopeng Ye","Jun Xu","Xiao Zhang","Weiran Shen","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.05902v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.06660"},{"id":"http://arxiv.org/abs/2308.06368v1","updated":"2023-08-11T20:05:01Z","published":"2023-08-11T20:05:01Z","title":"Topic-Level Bayesian Surprise and Serendipity for Recommender Systems","summary":" A recommender system that optimizes its recommendations solely to fit a\nuser's history of ratings for consumed items can create a filter bubble,\nwherein the user does not get to experience items from novel, unseen\ncategories. One approach to mitigate this undesired behavior is to recommend\nitems with high potential for serendipity, namely surprising items that are\nlikely to be highly rated. In this paper, we propose a content-based\nformulation of serendipity that is rooted in Bayesian surprise and use it to\nmeasure the serendipity of items after they are consumed and rated by the user.\nWhen coupled with a collaborative-filtering component that identifies similar\nusers, this enables recommending items with high potential for serendipity. To\nfacilitate the evaluation of topic-level models for surprise and serendipity,\nwe introduce a dataset of book reading histories extracted from Goodreads,\ncontaining over 26 thousand users and close to 1.3 million books, where we\nmanually annotate 449 books read by 4 users in terms of their time-dependent,\ntopic-level surprise. 
Experimental evaluations show that models that use\nBayesian surprise correlate much better with the manual annotations of\ntopic-level surprise than distance-based heuristics, and also obtain better\nserendipitous item recommendation performance.\n","authors":["Tonmoy Hasan","Razvan Bunescu"],"pdf_url":"https://arxiv.org/pdf/2308.06368v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.06262v1","updated":"2023-08-11T17:54:44Z","published":"2023-08-11T17:54:44Z","title":"Foundation Model is Efficient Multimodal Multitask Model Selector","summary":" This paper investigates an under-explored but important problem: given a\ncollection of pre-trained neural networks, predicting their performance on each\nmulti-modal task without fine-tuning them, such as image recognition,\nreferring, captioning, visual question answering, and text question answering.\nA brute-force approach is to finetune all models on all target datasets,\nbringing high computational costs. Although recent-advanced approaches employed\nlightweight metrics to measure models' transferability,they often depend\nheavily on the prior knowledge of a single task, making them inapplicable in a\nmulti-modal multi-task scenario. To tackle this issue, we propose an efficient\nmulti-task model selector (EMMS), which employs large-scale foundation models\nto transform diverse label formats such as categories, texts, and bounding\nboxes of different downstream tasks into a unified noisy label embedding. EMMS\ncan estimate a model's transferability through a simple weighted linear\nregression, which can be efficiently solved by an alternating minimization\nalgorithm with a convergence guarantee. Extensive experiments on 5 downstream\ntasks with 24 datasets show that EMMS is fast, effective, and generic enough to\nassess the transferability of pre-trained models, making it the first model\nselection method in the multi-task scenario. For instance, compared with the\nstate-of-the-art method LogME enhanced by our label embeddings, EMMS achieves\n9.0\\%, 26.3\\%, 20.1\\%, 54.8\\%, 12.2\\% performance gain on image recognition,\nreferring, captioning, visual question answering, and text question answering,\nwhile bringing 5.13x, 6.29x, 3.59x, 6.19x, and 5.66x speedup in wall-clock\ntime, respectively. The code is available at\nhttps://github.com/OpenGVLab/Multitask-Model-Selector.\n","authors":["Fanqing Meng","Wenqi Shao","Zhanglin Peng","Chonghe Jiang","Kaipeng Zhang","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2308.06262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.13148v3","updated":"2023-08-11T17:51:06Z","published":"2021-07-28T03:22:58Z","title":"Combining Machine Learning Classifiers for Stock Trading with Effective\n Feature Extraction","summary":" The unpredictability and volatility of the stock market render it challenging\nto make a substantial profit using any generalised scheme. Many previous\nstudies tried different techniques to build a machine learning model, which can\nmake a significant profit in the US stock market by performing live trading.\nHowever, very few studies have focused on the importance of finding the best\nfeatures for a particular trading period. Our top approach used the performance\nto narrow down the features from a total of 148 to about 30. Furthermore, the\ntop 25 features were dynamically selected before each time training our machine\nlearning model. 
It uses ensemble learning with four classifiers: Gaussian Naive\nBayes, Decision Tree, Logistic Regression with L1 regularization, and\nStochastic Gradient Descent, to decide whether to go long or short on a\nparticular stock. Our best model performed daily trades between July 2011 and\nJanuary 2019, generating 54.35% profit. Finally, our work showcased that\nmixtures of weighted classifiers perform better than any individual predictor\nat making trading decisions in the stock market.\n","authors":["A. K. M. Amanat Ullah","Fahim Imtiaz","Miftah Uddin Md Ihsan","Md. Golam Rabiul Alam","Mahbub Majumdar"],"pdf_url":"https://arxiv.org/pdf/2107.13148v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07640v2","updated":"2023-08-11T17:50:41Z","published":"2023-02-14T14:07:09Z","title":"Detection and classification of vocal productions in large scale audio\n recordings","summary":" We propose an automatic data processing pipeline to extract vocal productions\nfrom large-scale natural audio recordings and classify these vocal productions.\nThe pipeline is based on a deep neural network and addresses both issues\nsimultaneously. Through a series of computational steps (windowing, creation of\na noise class, data augmentation, re-sampling, transfer learning, Bayesian\noptimisation), it automatically trains a neural network without requiring a\nlarge sample of labeled data or substantial computing resources. Our end-to-end\nmethodology can handle noisy recordings made under different recording\nconditions. We test it on two different natural audio data sets, one from a\ngroup of Guinea baboons recorded at a primate research center and one from\nhuman babies recorded at home. The pipeline trains a model on 72 and 77 minutes\nof labeled audio recordings, with an accuracy of 94.58% and 99.76%. It is then\nused to process 443 and 174 hours of natural continuous recordings and it\ncreates two new databases of 38.8 and 35.2 hours, respectively. We discuss the\nstrengths and limitations of this approach, which can be applied to any massive\naudio recording.\n","authors":["Guillem Bonafos","Pierre Pudlo","Jean-Marc Freyermuth","Thierry Legou","Joël Fagot","Samuel Tronçon","Arnaud Rey"],"pdf_url":"https://arxiv.org/pdf/2302.07640v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.06817v2","updated":"2023-08-11T17:45:27Z","published":"2022-12-13T18:55:15Z","title":"RT-1: Robotics Transformer for Real-World Control at Scale","summary":" By transferring knowledge from large, diverse, task-agnostic datasets, modern\nmachine learning models can solve specific downstream tasks either zero-shot or\nwith small task-specific datasets to a high level of performance. While this\ncapability has been demonstrated in other fields such as computer vision,\nnatural language processing or speech recognition, it remains to be shown in\nrobotics, where the generalization capabilities of the models are particularly\ncritical due to the difficulty of collecting real-world robotic data. We argue\nthat one of the keys to the success of such general robotic models lies with\nopen-ended task-agnostic training, combined with high-capacity architectures\nthat can absorb all of the diverse, robotic data. In this paper, we present a\nmodel class, dubbed Robotics Transformer, that exhibits promising scalable\nmodel properties. 
We verify our conclusions in a study of different model\nclasses and their ability to generalize as a function of the data size, model\nsize, and data diversity based on a large-scale data collection on real robots\nperforming real-world tasks. The project's website and videos can be found at\nrobotics-transformer1.github.io\n","authors":["Anthony Brohan","Noah Brown","Justice Carbajal","Yevgen Chebotar","Joseph Dabis","Chelsea Finn","Keerthana Gopalakrishnan","Karol Hausman","Alex Herzog","Jasmine Hsu","Julian Ibarz","Brian Ichter","Alex Irpan","Tomas Jackson","Sally Jesmonth","Nikhil J Joshi","Ryan Julian","Dmitry Kalashnikov","Yuheng Kuang","Isabel Leal","Kuang-Huei Lee","Sergey Levine","Yao Lu","Utsav Malla","Deeksha Manjunath","Igor Mordatch","Ofir Nachum","Carolina Parada","Jodilyn Peralta","Emily Perez","Karl Pertsch","Jornell Quiambao","Kanishka Rao","Michael Ryoo","Grecia Salazar","Pannag Sanketi","Kevin Sayed","Jaspiar Singh","Sumedh Sontakke","Austin Stone","Clayton Tan","Huong Tran","Vincent Vanhoucke","Steve Vega","Quan Vuong","Fei Xia","Ted Xiao","Peng Xu","Sichun Xu","Tianhe Yu","Brianna Zitkovich"],"pdf_url":"https://arxiv.org/pdf/2212.06817v2.pdf","comment":"See website at robotics-transformer1.github.io"},{"id":"http://arxiv.org/abs/2308.06248v1","updated":"2023-08-11T17:29:02Z","published":"2023-08-11T17:29:02Z","title":"FunnyBirds: A Synthetic Vision Dataset for a Part-Based Analysis of\n Explainable AI Methods","summary":" The field of explainable artificial intelligence (XAI) aims to uncover the\ninner workings of complex deep neural models. While being crucial for\nsafety-critical domains, XAI inherently lacks ground-truth explanations, making\nits automatic evaluation an unsolved problem. We address this challenge by\nproposing a novel synthetic vision dataset, named FunnyBirds, and accompanying\nautomatic evaluation protocols. Our dataset allows performing semantically\nmeaningful image interventions, e.g., removing individual object parts, which\nhas three important implications. First, it enables analyzing explanations on a\npart level, which is closer to human comprehension than existing methods that\nevaluate on a pixel level. Second, by comparing the model output for inputs\nwith removed parts, we can estimate ground-truth part importances that should\nbe reflected in the explanations. Third, by mapping individual explanations\ninto a common space of part importances, we can analyze a variety of different\nexplanation types in a single common framework. Using our tools, we report\nresults for 24 different combinations of neural models and XAI methods,\ndemonstrating the strengths and weaknesses of the assessed methods in a fully\nautomatic and systematic manner.\n","authors":["Robin Hesse","Simone Schaub-Meyer","Stefan Roth"],"pdf_url":"https://arxiv.org/pdf/2308.06248v1.pdf","comment":"Accepted at ICCV 2023. Code: https://github.com/visinf/funnybirds"},{"id":"http://arxiv.org/abs/2308.06239v1","updated":"2023-08-11T17:15:12Z","published":"2023-08-11T17:15:12Z","title":"Private Distribution Learning with Public Data: The View from Sample\n Compression","summary":" We study the problem of private distribution learning with access to public\ndata. 
In this setup, which we refer to as public-private learning, the learner\nis given public and private samples drawn from an unknown distribution $p$\nbelonging to a class $\\mathcal Q$, with the goal of outputting an estimate of\n$p$ while adhering to privacy constraints (here, pure differential privacy)\nonly with respect to the private samples.\n We show that the public-private learnability of a class $\\mathcal Q$ is\nconnected to the existence of a sample compression scheme for $\\mathcal Q$, as\nwell as to an intermediate notion we refer to as list learning. Leveraging this\nconnection: (1) approximately recovers previous results on Gaussians over\n$\\mathbb R^d$; and (2) leads to new ones, including sample complexity upper\nbounds for arbitrary $k$-mixtures of Gaussians over $\\mathbb R^d$, results for\nagnostic and distribution-shift resistant learners, as well as closure\nproperties for public-private learnability under taking mixtures and products\nof distributions. Finally, via the connection to list learning, we show that\nfor Gaussians in $\\mathbb R^d$, at least $d$ public samples are necessary for\nprivate learnability, which is close to the known upper bound of $d+1$ public\nsamples.\n","authors":["Shai Ben-David","Alex Bie","Clément L. Canonne","Gautam Kamath","Vikrant Singhal"],"pdf_url":"https://arxiv.org/pdf/2308.06239v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2211.05961v2","updated":"2023-08-11T17:08:02Z","published":"2022-11-11T02:14:29Z","title":"Inverse Kernel Decomposition","summary":" The state-of-the-art dimensionality reduction approaches largely rely on\ncomplicated optimization procedures. On the other hand, closed-form approaches\nrequiring merely eigen-decomposition do not have enough sophistication and\nnonlinearity. In this paper, we propose a novel nonlinear dimensionality\nreduction method -- Inverse Kernel Decomposition (IKD) -- based on an\neigen-decomposition of the sample covariance matrix of data. The method is\ninspired by Gaussian process latent variable models (GPLVMs) and has comparable\nperformance with GPLVMs. To deal with very noisy data with weak correlations,\nwe propose two solutions -- blockwise and geodesic -- to make use of locally\ncorrelated data points and provide better and numerically more stable latent\nestimations. We use synthetic datasets and four real-world datasets to show\nthat IKD is a better dimensionality reduction method than other\neigen-decomposition-based methods, and achieves comparable performance against\noptimization-based methods with faster running speeds. Open-source IKD\nimplementation in Python can be accessed at this\n\\url{https://github.com/JerrySoybean/ikd}.\n","authors":["Chengrui Li","Anqi Wu"],"pdf_url":"https://arxiv.org/pdf/2211.05961v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06228v1","updated":"2023-08-11T16:58:57Z","published":"2023-08-11T16:58:57Z","title":"MaxFloodCast: Ensemble Machine Learning Model for Predicting Peak\n Inundation Depth And Decoding Influencing Features","summary":" Timely, accurate, and reliable information is essential for decision-makers,\nemergency managers, and infrastructure operators during flood events. This\nstudy demonstrates a proposed machine learning model, MaxFloodCast, trained on\nphysics-based hydrodynamic simulations in Harris County, offers efficient and\ninterpretable flood inundation depth predictions. 
Achieving an average\nR-squared of 0.949 and a Root Mean Square Error of 0.61 ft on unseen data, it\nproves reliable in forecasting peak flood inundation depths. Validated against\nHurricane Harvey and Storm Imelda, MaxFloodCast shows potential for\nsupporting near-real-time floodplain management and emergency operations. The\nmodel's interpretability aids decision-makers by offering critical information\nto inform flood mitigation strategies, to prioritize areas with critical\nfacilities and to examine how rainfall in other watersheds influences flood\nexposure in one area. The MaxFloodCast model enables accurate and interpretable\ninundation depth predictions while significantly reducing computational time,\nthereby supporting emergency response efforts and flood risk management more\neffectively.\n","authors":["Cheng-Chun Lee","Lipai Huang","Federico Antolini","Matthew Garcia","Andrew Juanb","Samuel D. Brody","Ali Mostafavi"],"pdf_url":"https://arxiv.org/pdf/2308.06228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06221v1","updated":"2023-08-11T16:48:31Z","published":"2023-08-11T16:48:31Z","title":"Automated Sizing and Training of Efficient Deep Autoencoders using\n Second Order Algorithms","summary":" We propose a multi-step training method for designing generalized linear\nclassifiers. First, an initial multi-class linear classifier is found through\nregression. Then validation error is minimized by pruning of unnecessary\ninputs. Simultaneously, desired outputs are improved via a method similar to\nthe Ho-Kashyap rule. Next, the output discriminants are scaled to be net\nfunctions of sigmoidal output units in a generalized linear classifier. We then\ndevelop a family of batch training algorithms for the multi-layer perceptron\nthat optimizes its hidden layer size and number of training epochs. Next, we\ncombine pruning with a growing approach. Later, the input units are scaled to\nbe the net functions of the sigmoidal output units that are then fed as\ninput to the MLP. We then propose resulting improvements in each of the deep\nlearning blocks, thereby improving the overall performance of the deep\narchitecture. We discuss the principles and formulation of learning\nalgorithms for deep autoencoders. We investigate several problems in deep\nautoencoder networks, including training issues; the theoretical, mathematical,\nand experimental justification that the networks are linear; optimizing the\nnumber of hidden units in each layer; and determining the depth of the deep\nlearning model. A direct implication of the current work is the ability to\nconstruct fast deep learning models using desktop-level computational\nresources. This, in our opinion, promotes our design philosophy of building\nsmall but powerful algorithms. 
Performance gains are demonstrated at each step.\nUsing widely available datasets, the final network's ten fold testing error is\nshown to be less than that of several other linear, generalized linear\nclassifiers, multi layer perceptron and deep learners reported in the\nliterature.\n","authors":["Kanishka Tyagi","Chinmay Rane","Michael Manry"],"pdf_url":"https://arxiv.org/pdf/2308.06221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04365v4","updated":"2023-08-11T16:40:41Z","published":"2023-08-08T16:04:42Z","title":"SLEM: Machine Learning for Path Modeling and Causal Inference with Super\n Learner Equation Modeling","summary":" Causal inference is a crucial goal of science, enabling researchers to arrive\nat meaningful conclusions regarding the predictions of hypothetical\ninterventions using observational data. Path models, Structural Equation Models\n(SEMs), and, more generally, Directed Acyclic Graphs (DAGs), provide a means to\nunambiguously specify assumptions regarding the causal structure underlying a\nphenomenon. Unlike DAGs, which make very few assumptions about the functional\nand parametric form, SEM assumes linearity. This can result in functional\nmisspecification which prevents researchers from undertaking reliable effect\nsize estimation. In contrast, we propose Super Learner Equation Modeling, a\npath modeling technique integrating machine learning Super Learner ensembles.\nWe empirically demonstrate its ability to provide consistent and unbiased\nestimates of causal effects, its competitive performance for linear models when\ncompared with SEM, and highlight its superiority over SEM when dealing with\nnon-linear relationships. We provide open-source code, and a tutorial notebook\nwith example usage, accentuating the easy-to-use nature of the method.\n","authors":["Matthew J. Vowels"],"pdf_url":"https://arxiv.org/pdf/2308.04365v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06213v1","updated":"2023-08-11T16:32:00Z","published":"2023-08-11T16:32:00Z","title":"Change Point Detection With Conceptors","summary":" Offline change point detection seeks to identify points in a time series\nwhere the data generating process changes. This problem is well studied for\nunivariate i.i.d. data, but becomes challenging with increasing dimension and\ntemporal dependence. For the at most one change point problem, we propose the\nuse of a conceptor matrix to learn the characteristic dynamics of a specified\ntraining window in a time series. The associated random recurrent neural\nnetwork acts as a featurizer of the data, and change points are identified from\na univariate quantification of the distance between the featurization and the\nspace spanned by a representative conceptor matrix. This model agnostic method\ncan suggest potential locations of interest that warrant further study. We\nprove that, under mild assumptions, the method provides a consistent estimate\nof the true change point, and quantile estimates for statistics are produced\nvia a moving block bootstrap of the original data. The method is tested on\nsimulations from several classes of processes, and we evaluate performance with\nclustering metrics, graphical methods, and observed Type 1 error control. We\napply our method to publicly available neural data from rats experiencing bouts\nof non-REM sleep prior to exploration of a radial maze.\n","authors":["Noah D. 
Gade","Jordan Rodu"],"pdf_url":"https://arxiv.org/pdf/2308.06213v1.pdf","comment":"Main Text 30 pages, 9 figures; Supplementary Material 29 pages, 2\n figures"},{"id":"http://arxiv.org/abs/2010.03322v3","updated":"2023-08-11T16:28:40Z","published":"2020-10-07T10:48:18Z","title":"A method for escaping limit cycles in training GANs","summary":" This paper mainly conducts further research to alleviate the issue of limit\ncycling behavior in training generative adversarial networks (GANs) through the\nproposed predictive centripetal acceleration algorithm (PCAA). Specifically, we\nfirst derive the upper and lower bounds on the last-iterate convergence rates\nof PCAA for the general bilinear game, with the upper bound notably improving\nupon previous results. Then, we combine PCAA with the adaptive moment\nestimation algorithm (Adam) to propose PCAA-Adam, a practical approach for\ntraining GANs. Finally, we validate the effectiveness of the proposed algorithm\nthrough experiments conducted on bilinear games, multivariate Gaussian\ndistributions, and the CelebA dataset, respectively.\n","authors":["Li Keke","Yang Xinmin"],"pdf_url":"https://arxiv.org/pdf/2010.03322v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06034v3","updated":"2023-08-11T16:17:54Z","published":"2023-06-09T16:55:49Z","title":"RANS-PINN based Simulation Surrogates for Predicting Turbulent Flows","summary":" Physics-informed neural networks (PINNs) provide a framework to build\nsurrogate models for dynamical systems governed by differential equations.\nDuring the learning process, PINNs incorporate a physics-based regularization\nterm within the loss function to enhance generalization performance. Since\nsimulating dynamics controlled by partial differential equations (PDEs) can be\ncomputationally expensive, PINNs have gained popularity in learning parametric\nsurrogates for fluid flow problems governed by Navier-Stokes equations. In this\nwork, we introduce RANS-PINN, a modified PINN framework, to predict flow fields\n(i.e., velocity and pressure) in high Reynolds number turbulent flow regimes.\nTo account for the additional complexity introduced by turbulence, RANS-PINN\nemploys a 2-equation eddy viscosity model based on a Reynolds-averaged\nNavier-Stokes (RANS) formulation. Furthermore, we adopt a novel training\napproach that ensures effective initialization and balance among the various\ncomponents of the loss function. The effectiveness of the RANS-PINN framework\nis then demonstrated using a parametric PINN.\n","authors":["Shinjan Ghosh","Amit Chakraborty","Georgia Olympia Brikis","Biswadip Dey"],"pdf_url":"https://arxiv.org/pdf/2306.06034v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.03664v4","updated":"2023-08-11T16:15:45Z","published":"2021-11-05T14:14:05Z","title":"Oracle Teacher: Leveraging Target Information for Better Knowledge\n Distillation of CTC Models","summary":" Knowledge distillation (KD), best known as an effective method for model\ncompression, aims at transferring the knowledge of a bigger network (teacher)\nto a much smaller network (student). Conventional KD methods usually employ the\nteacher model trained in a supervised manner, where output labels are treated\nonly as targets. Extending this supervised scheme further, we introduce a new\ntype of teacher model for connectionist temporal classification (CTC)-based\nsequence models, namely Oracle Teacher, that leverages both the source inputs\nand the output labels as the teacher model's input. 
Since the Oracle Teacher\nlearns a more accurate CTC alignment by referring to the target information, it\ncan provide the student with better guidance. One potential risk of the\nproposed approach is a trivial solution in which the model's output directly copies\nthe target input. Based on a many-to-one mapping property of the CTC algorithm,\nwe present a training strategy that effectively prevents this trivial\nsolution and thus enables utilizing both source and target inputs for model\ntraining. Extensive experiments are conducted on two sequence learning tasks:\nspeech recognition and scene text recognition. From the experimental results,\nwe empirically show that the proposed model improves the student models across these\ntasks while achieving a considerable speed-up in the teacher model's training\ntime.\n","authors":["Ji Won Yoon","Hyung Yong Kim","Hyeonseung Lee","Sunghwan Ahn","Nam Soo Kim"],"pdf_url":"https://arxiv.org/pdf/2111.03664v4.pdf","comment":"Accepted by IEEE/ACM Transactions on Audio, Speech and Language\n Processing"},{"id":"http://arxiv.org/abs/2308.06204v1","updated":"2023-08-11T16:09:12Z","published":"2023-08-11T16:09:12Z","title":"Safety in Traffic Management Systems: A Comprehensive Survey","summary":" Traffic management systems play a vital role in ensuring safe and efficient\ntransportation on roads. However, the use of advanced technologies in traffic\nmanagement systems has introduced new safety challenges. Therefore, it is\nimportant to ensure the safety of these systems to prevent accidents and\nminimize their impact on road users. In this survey, we provide a comprehensive\nreview of the literature on safety in traffic management systems. Specifically,\nwe discuss the different safety issues that arise in traffic management\nsystems, the current state of research on safety in these systems, and the\ntechniques and methods proposed to ensure the safety of these systems. We also\nidentify the limitations of the existing research and suggest future research\ndirections.\n","authors":["Wenlu Du","Ankan Dash","Jing Li","Hua Wei","Guiling Wang"],"pdf_url":"https://arxiv.org/pdf/2308.06204v1.pdf","comment":"Accepted by MDPI Designs journal, the Special Issue Design and\n Application of Intelligent Transportation Systems. 30 pages, 6 figures,\n published on 10 August 2023"},{"id":"http://arxiv.org/abs/2308.06203v1","updated":"2023-08-11T15:58:15Z","published":"2023-08-11T15:58:15Z","title":"Towards a Causal Probabilistic Framework for Prediction,\n Action-Selection & Explanations for Robot Block-Stacking Tasks","summary":" Uncertainties in the real world mean that it is impossible for system designers\nto anticipate and explicitly design for all scenarios that a robot might\nencounter. Thus, robots designed like this are fragile and fail outside of\nhighly-controlled environments. Causal models provide a principled framework to\nencode formal knowledge of the causal relationships that govern the robot's\ninteraction with its environment, in addition to probabilistic representations\nof noise and uncertainty typically encountered by real-world robots. Combined\nwith causal inference, these models permit an autonomous agent to understand,\nreason about, and explain its environment. In this work, we focus on the\nproblem of a robot block-stacking task due to the fundamental perception and\nmanipulation capabilities it demonstrates, which are required by many applications\nincluding warehouse logistics and domestic human support robotics. 
We propose a\nnovel causal probabilistic framework to embed a physics simulation capability\ninto a structural causal model to permit robots to perceive and assess the\ncurrent state of a block-stacking task, reason about the next-best action from\nplacement candidates, and generate post-hoc counterfactual explanations. We\nprovide exemplar next-best action selection results and outline planned\nexperimentation in simulated and real-world robot block-stacking tasks.\n","authors":["Ricardo Cannizzaro","Jonathan Routley","Lars Kunze"],"pdf_url":"https://arxiv.org/pdf/2308.06203v1.pdf","comment":"3 pages, 3 figures, accepted to the \"Causality for Robotics:\n Answering the Question of Why\" workshop at the 2023 IEEE/RSJ International\n Conference on Intelligent Robots and Systems (IROS)"},{"id":"http://arxiv.org/abs/2308.06202v1","updated":"2023-08-11T15:57:45Z","published":"2023-08-11T15:57:45Z","title":"Exploring Predicate Visual Context in Detecting of Human-Object\n Interactions","summary":" Recently, the DETR framework has emerged as the dominant approach for\nhuman--object interaction (HOI) research. In particular, two-stage\ntransformer-based HOI detectors are amongst the most performant and\ntraining-efficient approaches. However, these often condition HOI\nclassification on object features that lack fine-grained contextual\ninformation, eschewing pose and orientation information in favour of visual\ncues about object identity and box extremities. This naturally hinders the\nrecognition of complex or ambiguous interactions. In this work, we study these\nissues through visualisations and carefully designed experiments. Accordingly,\nwe investigate how best to re-introduce image features via cross-attention.\nWith an improved query design, extensive exploration of keys and values, and\nbox pair positional embeddings as spatial guidance, our model with enhanced\npredicate visual context (PViC) outperforms state-of-the-art methods on the\nHICO-DET and V-COCO benchmarks, while maintaining low training cost.\n","authors":["Frederic Z. Zhang","Yuhui Yuan","Dylan Campbell","Zhuoyao Zhong","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2308.06202v1.pdf","comment":"To appear in ICCV2023"},{"id":"http://arxiv.org/abs/2303.06628v2","updated":"2023-08-11T15:56:32Z","published":"2023-03-12T10:28:07Z","title":"Preventing Zero-Shot Transfer Degradation in Continual Learning of\n Vision-Language Models","summary":" Continual learning (CL) can help pre-trained vision-language models\nefficiently adapt to new or under-trained data distributions without\nre-training. Nevertheless, during the continual training of the Contrastive\nLanguage-Image Pre-training (CLIP) model, we observe that the model's zero-shot\ntransfer ability significantly degrades due to catastrophic forgetting.\nExisting CL methods can mitigate forgetting by replaying previous data.\nHowever, since the CLIP dataset is private, replay methods cannot access the\npre-training dataset. In addition, replaying data of previously learned\ndownstream tasks can enhance their performance but comes at the cost of\nsacrificing zero-shot performance. To address this challenge, we propose a\nnovel method ZSCL to prevent zero-shot transfer degradation in the continual\nlearning of vision-language models in both feature and parameter space. In the\nfeature space, a reference dataset is introduced for distillation between the\ncurrent and initial models. 
The reference dataset should have semantic\ndiversity but no need to be labeled, seen in pre-training, or matched\nimage-text pairs. In parameter space, we prevent a large parameter shift by\naveraging weights during the training. We propose a more challenging\nMulti-domain Task Incremental Learning (MTIL) benchmark to evaluate different\nmethods, where tasks are from various domains instead of class-separated in a\nsingle dataset. Our method outperforms other methods in the traditional\nclass-incremental learning setting and the MTIL by 9.7% average score. Our code\nlocates at https://github.com/Thunderbeee/ZSCL.\n","authors":["Zangwei Zheng","Mingyuan Ma","Kai Wang","Ziheng Qin","Xiangyu Yue","Yang You"],"pdf_url":"https://arxiv.org/pdf/2303.06628v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.06197v1","updated":"2023-08-11T15:42:48Z","published":"2023-08-11T15:42:48Z","title":"Complex Facial Expression Recognition Using Deep Knowledge Distillation\n of Basic Features","summary":" Complex emotion recognition is a cognitive task that has so far eluded the\nsame excellent performance of other tasks that are at or above the level of\nhuman cognition. Emotion recognition through facial expressions is particularly\ndifficult due to the complexity of emotions expressed by the human face. For a\nmachine to approach the same level of performance in this domain as a human, it\nmay need to synthesise knowledge and understand new concepts in real-time as\nhumans do. Humans are able to learn new concepts using only few examples, by\ndistilling the important information from memories and discarding the rest.\nSimilarly, continual learning methods learn new classes whilst retaining the\nknowledge of known classes, whilst few-shot learning methods are able to learn\nnew classes using very few training examples. We propose a novel continual\nlearning method inspired by human cognition and learning that can accurately\nrecognise new compound expression classes using few training samples, by\nbuilding on and retaining its knowledge of basic expression classes. Using\nGradCAM visualisations, we demonstrate the relationship between basic and\ncompound facial expressions, which our method leverages through knowledge\ndistillation and a novel Predictive Sorting Memory Replay. Our method achieves\nthe current state-of-the-art in continual learning for complex facial\nexpression recognition with 74.28% Overall Accuracy on new classes. We also\ndemonstrate that using continual learning for complex facial expression\nrecognition achieves far better performance than non-continual learning\nmethods, improving on state-of-the-art non-continual learning methods by\n13.95%. To the best of our knowledge, our work is also the first to apply\nfew-shot learning to complex facial expression recognition, achieving the\nstate-of-the-art with 100% accuracy using a single training sample for each\nexpression class.\n","authors":["Angus Maiden","Bahareh Nakisa"],"pdf_url":"https://arxiv.org/pdf/2308.06197v1.pdf","comment":"17 pages, 9 figures, 6 tables. Code available at\n https://github.com/AngusMaiden/complex-FER"},{"id":"http://arxiv.org/abs/2308.04522v2","updated":"2023-08-11T15:39:03Z","published":"2023-08-08T18:37:24Z","title":"Deep Learning for Diverse Data Types Steganalysis: A Review","summary":" Steganography and steganalysis are two interrelated aspects of the field of\ninformation security. 
Steganography seeks to conceal communications, whereas\nsteganalysis aims to either find them or, if possible, recover the\ndata they contain. Steganography and steganalysis have attracted a great deal\nof interest, particularly from law enforcement. Steganography is often used by\ncybercriminals and even terrorists to avoid being caught in possession\nof incriminating evidence, even in encrypted form, since cryptography is prohibited or\nrestricted in many countries. Therefore, knowledge of cutting-edge techniques\nto uncover concealed information is crucial in exposing illegal acts. Over the\nlast few years, a number of strong and reliable steganography and steganalysis\ntechniques have been introduced in the literature. This review paper provides a\ncomprehensive overview of deep learning-based steganalysis techniques used to\ndetect hidden information within digital media. The paper covers all types of\ncover media in steganalysis, including image, audio, and video, and discusses the\nmost commonly used deep learning techniques. In addition, the paper explores\nthe use of more advanced deep learning techniques, such as deep transfer\nlearning (DTL) and deep reinforcement learning (DRL), to enhance the\nperformance of steganalysis systems. The paper provides a systematic review of\nrecent research in the field, including data sets and evaluation metrics used\nin recent studies. It also presents a detailed analysis of DTL-based\nsteganalysis approaches and their performance on different data sets. The\nreview concludes with a discussion on the current state of deep learning-based\nsteganalysis, challenges, and future research directions.\n","authors":["Hamza Kheddar","Mustapha Hemis","Yassine Himeur","David Megías","Abbes Amira"],"pdf_url":"https://arxiv.org/pdf/2308.04522v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09466v2","updated":"2023-08-11T15:30:29Z","published":"2023-04-19T07:27:21Z","title":"MAMAF-Net: Motion-Aware and Multi-Attention Fusion Network for Stroke\n Diagnosis","summary":" Stroke is a major cause of mortality and disability worldwide, and one\nin four people are at risk of experiencing it in their lifetime. The pre-hospital\nstroke assessment plays a vital role in identifying stroke patients accurately\nto accelerate further examination and treatment in hospitals. Accordingly, the\nNational Institutes of Health Stroke Scale (NIHSS), Cincinnati Pre-hospital\nStroke Scale (CPSS) and Face Arm Speed Time (F.A.S.T.) are globally known tests\nfor stroke assessment. However, the validity of these tests is questionable in the\nabsence of neurologists, and access to healthcare may be limited. Therefore, in\nthis study, we propose a motion-aware and multi-attention fusion network\n(MAMAF-Net) that can detect stroke from multimodal examination videos. Contrary\nto other studies on stroke detection from video analysis, our study for the\nfirst time proposes an end-to-end solution from multiple video recordings of\neach subject with a dataset encapsulating stroke, transient ischemic attack\n(TIA), and healthy controls. The proposed MAMAF-Net consists of motion-aware\nmodules to sense the mobility of patients, attention modules to fuse the\nmulti-input video data, and 3D convolutional layers to perform diagnosis from\nthe attention-based extracted features. 
Experimental results over the collected\nStroke-data dataset show that the proposed MAMAF-Net achieves a successful\ndetection of stroke with 93.62% sensitivity and 95.33% AUC score.\n","authors":["Aysen Degerli","Pekka Jakala","Juha Pajula","Milla Immonen","Miguel Bordallo Lopez"],"pdf_url":"https://arxiv.org/pdf/2304.09466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/1901.08571v3","updated":"2023-08-11T15:17:35Z","published":"2019-01-24T18:43:16Z","title":"Nonparametric Inference under B-bits Quantization","summary":" Statistical inference based on lossy or incomplete samples is often needed in\nresearch areas such as signal/image processing, medical image storage, remote\nsensing, signal transmission. In this paper, we propose a nonparametric testing\nprocedure based on samples quantized to $B$ bits through a computationally\nefficient algorithm. Under mild technical conditions, we establish the\nasymptotic properties of the proposed test statistic and investigate how the\ntesting power changes as $B$ increases. In particular, we show that if $B$\nexceeds a certain threshold, the proposed nonparametric testing procedure\nachieves the classical minimax rate of testing (Shang and Cheng, 2015) for\nspline models. We further extend our theoretical investigations to a\nnonparametric linearity test and an adaptive nonparametric test, expanding the\napplicability of the proposed methods. Extensive simulation studies {together\nwith a real-data analysis} are used to demonstrate the validity and\neffectiveness of the proposed tests.\n","authors":["Kexuan Li","Ruiqi Liu","Ganggang Xu","Zuofeng Shang"],"pdf_url":"https://arxiv.org/pdf/1901.08571v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06175v1","updated":"2023-08-11T15:04:34Z","published":"2023-08-11T15:04:34Z","title":"Assessing Guest Nationality Composition from Hotel Reviews","summary":" Many hotels target guest acquisition efforts to specific markets in order to\nbest anticipate individual preferences and needs of their guests. Likewise,\nsuch strategic positioning is a prerequisite for efficient marketing budget\nallocation. Official statistics report on the number of visitors from different\ncountries, but no fine-grained information on the guest composition of\nindividual businesses exists. There is, however, growing interest in such data\nfrom competitors, suppliers, researchers and the general public. We demonstrate\nhow machine learning can be leveraged to extract references to guest\nnationalities from unstructured text reviews in order to dynamically assess and\nmonitor the dynamics of guest composition of individual businesses. In\nparticular, we show that a rather simple architecture of pre-trained embeddings\nand stacked LSTM layers provides a better performance-runtime tradeoff than\nmore complex state-of-the-art language models.\n","authors":["Fabian Gröger","Marc Pouly","Flavia Tinner","Leif Brandes"],"pdf_url":"https://arxiv.org/pdf/2308.06175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.02302v3","updated":"2023-08-11T15:02:43Z","published":"2021-11-03T15:38:58Z","title":"Selecting the number of clusters, clustering models, and algorithms. A\n unifying approach based on the quadratic discriminant score","summary":" Cluster analysis requires many decisions: the clustering method and the\nimplied reference model, the number of clusters and, often, several\nhyper-parameters and algorithms' tunings. 
In practice, one produces several\npartitions, and a final one is chosen based on validation or selection\ncriteria. There exist an abundance of validation methods that, implicitly or\nexplicitly, assume a certain clustering notion. Moreover, they are often\nrestricted to operate on partitions obtained from a specific method. In this\npaper, we focus on groups that can be well separated by quadratic or linear\nboundaries. The reference cluster concept is defined through the quadratic\ndiscriminant score function and parameters describing clusters' size, center\nand scatter. We develop two cluster-quality criteria called quadratic scores.\nWe show that these criteria are consistent with groups generated from a general\nclass of elliptically-symmetric distributions. The quest for this type of\ngroups is common in applications. The connection with likelihood theory for\nmixture models and model-based clustering is investigated. Based on bootstrap\nresampling of the quadratic scores, we propose a selection rule that allows\nchoosing among many clustering solutions. The proposed method has the\ndistinctive advantage that it can compare partitions that cannot be compared\nwith other state-of-the-art methods. Extensive numerical experiments and the\nanalysis of real data show that, even if some competing methods turn out to be\nsuperior in some setups, the proposed methodology achieves a better overall\nperformance.\n","authors":["Luca Coraggio","Pietro Coretto"],"pdf_url":"https://arxiv.org/pdf/2111.02302v3.pdf","comment":"Supplemental materials are included at the end of the paper"},{"id":"http://arxiv.org/abs/2308.06173v1","updated":"2023-08-11T15:02:19Z","published":"2023-08-11T15:02:19Z","title":"Physical Adversarial Attacks For Camera-based Smart Systems: Current\n Trends, Categorization, Applications, Research Challenges, and Future Outlook","summary":" In this paper, we present a comprehensive survey of the current trends\nfocusing specifically on physical adversarial attacks. We aim to provide a\nthorough understanding of the concept of physical adversarial attacks,\nanalyzing their key characteristics and distinguishing features. Furthermore,\nwe explore the specific requirements and challenges associated with executing\nattacks in the physical world. Our article delves into various physical\nadversarial attack methods, categorized according to their target tasks in\ndifferent applications, including classification, detection, face recognition,\nsemantic segmentation and depth estimation. We assess the performance of these\nattack methods in terms of their effectiveness, stealthiness, and robustness.\nWe examine how each technique strives to ensure the successful manipulation of\nDNNs while mitigating the risk of detection and withstanding real-world\ndistortions. Lastly, we discuss the current challenges and outline potential\nfuture research directions in the field of physical adversarial attacks. We\nhighlight the need for enhanced defense mechanisms, the exploration of novel\nattack strategies, the evaluation of attacks in different application domains,\nand the establishment of standardized benchmarks and evaluation criteria for\nphysical adversarial attacks. 
Through this comprehensive survey, we aim to\nprovide a valuable resource for researchers, practitioners, and policymakers to\ngain a holistic understanding of physical adversarial attacks in computer\nvision and facilitate the development of robust and secure DNN-based systems.\n","authors":["Amira Guesmi","Muhammad Abdullah Hanif","Bassem Ouni","Muhammed Shafique"],"pdf_url":"https://arxiv.org/pdf/2308.06173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12364v3","updated":"2023-08-11T14:59:36Z","published":"2023-03-22T08:03:27Z","title":"ExBEHRT: Extended Transformer for Electronic Health Records to Predict\n Disease Subtypes & Progressions","summary":" In this study, we introduce ExBEHRT, an extended version of BEHRT (BERT\napplied to electronic health records), and apply different algorithms to\ninterpret its results. While BEHRT considers only diagnoses and patient age, we\nextend the feature space to several multimodal records, namely demographics,\nclinical characteristics, vital signs, smoking status, diagnoses, procedures,\nmedications, and laboratory tests, by applying a novel method to unify the\nfrequencies and temporal dimensions of the different features. We show that\nadditional features significantly improve model performance for various\ndownstream tasks in different diseases. To ensure robustness, we interpret\nmodel predictions using an adaptation of expected gradients, which has not been\npreviously applied to transformers with EHR data and provides more granular\ninterpretations than previous approaches such as feature and token importances.\nFurthermore, by clustering the model representations of oncology patients, we\nshow that the model has an implicit understanding of the disease and is able to\nclassify patients with the same cancer type into different risk groups. Given\nthe additional features and interpretability, ExBEHRT can help make informed\ndecisions about disease trajectories, diagnoses, and risk factors of various\ndiseases.\n","authors":["Maurice Rupp","Oriane Peter","Thirupathi Pattipaka"],"pdf_url":"https://arxiv.org/pdf/2303.12364v3.pdf","comment":"ICLR 2023 Workshop on Trustworthy Machine Learning for Healthcare\n (Website: https://sites.google.com/view/tml4h2023/accepted-papers )"},{"id":"http://arxiv.org/abs/2308.06155v1","updated":"2023-08-11T14:33:20Z","published":"2023-08-11T14:33:20Z","title":"Phased Deep Spatio-temporal Learning for Highway Traffic Volume\n Prediction","summary":" Inter-city highway transportation is significant for citizens' modern urban\nlife and generates heterogeneous sensory data with spatio-temporal\ncharacteristics. As a routine analysis in transportation domain, daily traffic\nvolume estimation faces challenges for highway toll stations including lacking\nof exploration of correlative spatio-temporal features from a long-term\nperspective and effective means to deal with data imbalance which always\ndeteriorates the predictive performance. In this paper, a deep spatio-temporal\nlearning method is proposed to predict daily traffic volume in three phases. In\nfeature pre-processing phase, data is normalized elaborately according to\nlatent long-tail distribution. In spatio-temporal learning phase, a hybrid\nmodel is employed combining fully convolution network (FCN) and long short-term\nmemory (LSTM), which considers time, space, meteorology, and calendar from\nheterogeneous data. 
In the decision phase, traffic volumes for the coming day at\nnetwork-wide toll stations are predicted effectively, with special\ncalibration for the vital few highway stations. Using real-world data from one\nChinese provincial highway, extensive experiments show our method yields a distinct\nimprovement in predictive accuracy over various traditional models, reaching\n5.269 and 0.997 on the MPAE and R-square metrics, respectively.\n","authors":["Weilong Ding","Tianpu Zhang","Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2308.06155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06149v1","updated":"2023-08-11T14:26:29Z","published":"2023-08-11T14:26:29Z","title":"Gaussian Process Regression for Maximum Entropy Distribution","summary":" Maximum-Entropy Distributions offer an attractive family of probability\ndensities suitable for moment closure problems. Yet finding the Lagrange\nmultipliers which parametrize these distributions turns out to be a\ncomputational bottleneck for practical closure settings. Motivated by the recent\nsuccess of Gaussian processes, we investigate the suitability of Gaussian\npriors to approximate the Lagrange multipliers as a map of a given set of\nmoments. Examining various kernel functions, the hyperparameters are optimized\nby maximizing the log-likelihood. The performance of the devised data-driven\nMaximum-Entropy closure is studied for a couple of test cases, including the\nrelaxation of non-equilibrium distributions governed by Bhatnagar-Gross-Krook\nand Boltzmann kinetic equations.\n","authors":["Mohsen Sadr","Manuel Torrilhon","M. Hossein Gorji"],"pdf_url":"https://arxiv.org/pdf/2308.06149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06144v1","updated":"2023-08-11T14:06:41Z","published":"2023-08-11T14:06:41Z","title":"Identification of the Relevance of Comments in Codes Using Bag of Words\n and Transformer Based Models","summary":" The Forum for Information Retrieval (FIRE) started a shared task this year\nfor the classification of comments on different code segments. This is a binary text\nclassification task where the objective is to identify whether comments given\nfor certain code segments are relevant or not. The BioNLP-IISERB group at the\nIndian Institute of Science Education and Research Bhopal (IISERB) participated\nin this task and submitted five runs for five different models. The paper\npresents an overview of the models and other significant findings on the\ntraining corpus. The methods involve different feature engineering schemes and\ntext classification techniques. The performance of the classical bag of words\nmodel and transformer-based models was explored to identify significant\nfeatures from the given training corpus. We explored different classifiers,\nviz., random forest, support vector machine and logistic regression, using the\nbag of words model. Furthermore, pre-trained transformer based models like\nBERT, RoBERT and ALBERT were also used by fine-tuning them on the given\ntraining corpus. The performance of these models over the training\ncorpus was reported and the best five models were implemented on the given\ntest corpus. The empirical results show that the bag of words model outperforms\nthe transformer based models; however, the performance of our runs is not\nreasonably good on either the training or the test corpus. 
This paper also addresses the\nlimitations of the models and scope for further improvement.\n","authors":["Sruthi S","Tanmay Basu"],"pdf_url":"https://arxiv.org/pdf/2308.06144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06142v1","updated":"2023-08-11T14:02:52Z","published":"2023-08-11T14:02:52Z","title":"CompTLL-UNet: Compressed Domain Text-Line Localization in Challenging\n Handwritten Documents using Deep Feature Learning from JPEG Coefficients","summary":" Automatic localization of text-lines in handwritten documents is still an\nopen and challenging research problem. Various writing issues such as uneven\nspacing between the lines, oscillating and touching text, and the presence of\nskew become much more challenging when the case of complex handwritten document\nimages are considered for segmentation directly in their respective compressed\nrepresentation. This is because, the conventional way of processing compressed\ndocuments is through decompression, but here in this paper, we propose an idea\nthat employs deep feature learning directly from the JPEG compressed\ncoefficients without full decompression to accomplish text-line localization in\nthe JPEG compressed domain. A modified U-Net architecture known as Compressed\nText-Line Localization Network (CompTLL-UNet) is designed to accomplish it. The\nmodel is trained and tested with JPEG compressed version of benchmark datasets\nincluding ICDAR2017 (cBAD) and ICDAR2019 (cBAD), reporting the state-of-the-art\nperformance with reduced storage and computational costs in the JPEG compressed\ndomain.\n","authors":["Bulla Rajesh","Sk Mahafuz Zaman","Mohammed Javed","P. Nagabhushan"],"pdf_url":"https://arxiv.org/pdf/2308.06142v1.pdf","comment":"Accepted in 7th Asian Conference on Pattern Recognition (ACPR 2023),\n 5-8 November 2023, Kitakyushu, Japan"},{"id":"http://arxiv.org/abs/2308.06138v1","updated":"2023-08-11T13:58:42Z","published":"2023-08-11T13:58:42Z","title":"Application of Artificial Neural Networks for Investigation of Pressure\n Filtration Performance, a Zinc Leaching Filter Cake Moisture Modeling","summary":" Machine Learning (ML) is a powerful tool for material science applications.\nArtificial Neural Network (ANN) is a machine learning technique that can\nprovide high prediction accuracy. This study aimed to develop an ANN model to\npredict the cake moisture of the pressure filtration process of zinc\nproduction. The cake moisture was influenced by seven parameters: temperature\n(35 and 65 Celsius), solid concentration (0.2 and 0.38 g/L), pH (2, 3.5, and\n5), air-blow time (2, 10, and 15 min), cake thickness (14, 20, 26, and 34 mm),\npressure, and filtration time. The study conducted 288 tests using two types of\nfabrics: polypropylene (S1) and polyester (S2). The ANN model was evaluated by\nthe Coefficient of determination (R2), the Mean Square Error (MSE), and the\nMean Absolute Error (MAE) metrics for both datasets. The results showed R2\nvalues of 0.88 and 0.83, MSE values of 6.243x10-07 and 1.086x10-06, and MAE\nvalues of 0.00056 and 0.00088 for S1 and S2, respectively. These results\nindicated that the ANN model could predict the cake moisture of pressure\nfiltration in the zinc leaching process with high accuracy.\n","authors":["Masoume Kazemi","Davood Moradkhani","Alireza A. 
Alipour"],"pdf_url":"https://arxiv.org/pdf/2308.06138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02374v2","updated":"2023-08-11T13:51:55Z","published":"2022-12-05T15:56:08Z","title":"On the Trade-off between Over-smoothing and Over-squashing in Deep Graph\n Neural Networks","summary":" Graph Neural Networks (GNNs) have succeeded in various computer science\napplications, yet deep GNNs underperform their shallow counterparts despite\ndeep learning's success in other domains. Over-smoothing and over-squashing are\nkey challenges when stacking graph convolutional layers, hindering deep\nrepresentation learning and information propagation from distant nodes. Our\nwork reveals that over-smoothing and over-squashing are intrinsically related\nto the spectral gap of the graph Laplacian, resulting in an inevitable\ntrade-off between these two issues, as they cannot be alleviated\nsimultaneously. To achieve a suitable compromise, we propose adding and\nremoving edges as a viable approach. We introduce the Stochastic Jost and Liu\nCurvature Rewiring (SJLR) algorithm, which is computationally efficient and\npreserves fundamental properties compared to previous curvature-based methods.\nUnlike existing approaches, SJLR performs edge addition and removal during GNN\ntraining while maintaining the graph unchanged during testing. Comprehensive\ncomparisons demonstrate SJLR's competitive performance in addressing\nover-smoothing and over-squashing.\n","authors":["Jhony H. Giraldo","Konstantinos Skianis","Thierry Bouwmans","Fragkiskos D. Malliaros"],"pdf_url":"https://arxiv.org/pdf/2212.02374v2.pdf","comment":"This paper has been accepted for publication at the 32nd ACM\n International Conference on Information and Knowledge Management (CIKM) 2023"},{"id":"http://arxiv.org/abs/2302.14057v2","updated":"2023-08-11T13:48:44Z","published":"2023-02-25T10:12:34Z","title":"Cross-modal Contrastive Learning for Multimodal Fake News Detection","summary":" Automatic detection of multimodal fake news has gained a widespread attention\nrecently. Many existing approaches seek to fuse unimodal features to produce\nmultimodal news representations. However, the potential of powerful cross-modal\ncontrastive learning methods for fake news detection has not been well\nexploited. Besides, how to aggregate features from different modalities to\nboost the performance of the decision-making process is still an open question.\nTo address that, we propose COOLANT, a cross-modal contrastive learning\nframework for multimodal fake news detection, aiming to achieve more accurate\nimage-text alignment. To further improve the alignment precision, we leverage\nan auxiliary task to soften the loss term of negative samples during the\ncontrast process. A cross-modal fusion module is developed to learn the\ncross-modality correlations. An attention mechanism with an attention guidance\nmodule is implemented to help effectively and interpretably aggregate the\naligned unimodal representations and the cross-modality correlations. Finally,\nwe evaluate the COOLANT and conduct a comparative study on two widely used\ndatasets, Twitter and Weibo. 
The experimental results demonstrate that our\nCOOLANT outperforms previous approaches by a large margin and achieves new\nstate-of-the-art results on the two datasets.\n","authors":["Longzheng Wang","Chuang Zhang","Hongbo Xu","Yongxiu Xu","Xiaohan Xu","Siqi Wang"],"pdf_url":"https://arxiv.org/pdf/2302.14057v2.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2303.14029v2","updated":"2023-08-11T13:40:46Z","published":"2023-03-24T14:42:42Z","title":"PENTACET data -- 23 Million Contextual Code Comments and 250,000 SATD\n comments","summary":" Most Self-Admitted Technical Debt (SATD) research utilizes explicit SATD\nfeatures such as 'TODO' and 'FIXME' for SATD detection. A closer look reveals\nseveral SATD research uses simple SATD ('Easy to Find') code comments without\nthe contextual data (preceding and succeeding source code context). This work\naddresses this gap through PENTACET (or 5C dataset) data. PENTACET is a large\nCurated Contextual Code Comments per Contributor and the most extensive SATD\ndata. We mine 9,096 Open Source Software Java projects with a total of 435\nmillion LOC. The outcome is a dataset with 23 million code comments, preceding\nand succeeding source code context for each comment, and more than 250,000\ncomments labeled as SATD, including both 'Easy to Find' and 'Hard to Find'\nSATD. We believe PENTACET data will further SATD research using Artificial\nIntelligence techniques.\n","authors":["Murali Sridharan","Leevi Rantala","Mika Mäntylä"],"pdf_url":"https://arxiv.org/pdf/2303.14029v2.pdf","comment":"Accepted in MSR 2023 Tools and Data Showcase"},{"id":"http://arxiv.org/abs/2308.06132v1","updated":"2023-08-11T13:39:21Z","published":"2023-08-11T13:39:21Z","title":"PDE Discovery for Soft Sensors Using Coupled Physics-Informed Neural\n Network with Akaike's Information Criterion","summary":" Soft sensors have been extensively used to monitor key variables using\neasy-to-measure variables and mathematical models. Partial differential\nequations (PDEs) are model candidates for soft sensors in industrial processes\nwith spatiotemporal dependence. However, gaps often exist between idealized\nPDEs and practical situations. Discovering proper structures of PDEs, including\nthe differential operators and source terms, can remedy the gaps. To this end,\na coupled physics-informed neural network with Akaike's criterion information\n(CPINN-AIC) is proposed for PDE discovery of soft sensors. First, CPINN is\nadopted for obtaining solutions and source terms satisfying PDEs. Then, we\npropose a data-physics-hybrid loss function for training CPINN, in which\nundetermined combinations of differential operators are involved. Consequently,\nAIC is used to discover the proper combination of differential operators.\nFinally, the artificial and practical datasets are used to verify the\nfeasibility and effectiveness of CPINN-AIC for soft sensors. The proposed\nCPINN-AIC is a data-driven method to discover proper PDE structures and neural\nnetwork-based solutions for soft sensors.\n","authors":["Aina Wang","Pan Qin","Xi-Ming Sun"],"pdf_url":"https://arxiv.org/pdf/2308.06132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04431v3","updated":"2023-08-11T13:39:06Z","published":"2023-06-07T13:41:55Z","title":"Faithful Knowledge Distillation","summary":" Knowledge distillation (KD) has received much attention due to its success in\ncompressing networks to allow for their deployment in resource-constrained\nsystems. 
While the problem of adversarial robustness has been studied before in\nthe KD setting, previous works overlook what we term the relative calibration\nof the student network with respect to its teacher in terms of soft\nconfidences. In particular, we focus on two crucial questions with regard to a\nteacher-student pair: (i) do the teacher and student disagree at points close\nto correctly classified dataset examples, and (ii) is the distilled student as\nconfident as the teacher around dataset examples? These are critical questions\nwhen considering the deployment of a smaller student network trained from a\nrobust teacher within a safety-critical setting. To address these questions, we\nintroduce a faithful imitation framework to discuss the relative calibration of\nconfidences and provide empirical and certified methods to evaluate the\nrelative calibration of a student w.r.t. its teacher. Further, to verifiably\nalign the relative calibration incentives of the student to those of its\nteacher, we introduce faithful distillation. Our experiments on the MNIST,\nFashion-MNIST and CIFAR-10 datasets demonstrate the need for such an analysis\nand the advantages of the increased verifiability of faithful distillation over\nalternative adversarial distillation methods.\n","authors":["Tom A. Lamb","Rudy Brunel","Krishnamurthy DJ Dvijotham","M. Pawan Kumar","Philip H. S. Torr","Francisco Eiras"],"pdf_url":"https://arxiv.org/pdf/2306.04431v3.pdf","comment":"7pgs (main content), 4 figures"},{"id":"http://arxiv.org/abs/2308.06129v1","updated":"2023-08-11T13:35:52Z","published":"2023-08-11T13:35:52Z","title":"Uncertainty Quantification for Image-based Traffic Prediction across\n Cities","summary":" Despite the strong predictive performance of deep learning models for traffic\nprediction, their widespread deployment in real-world intelligent\ntransportation systems has been restrained by a lack of interpretability.\nUncertainty quantification (UQ) methods provide an approach to induce\nprobabilistic reasoning, improve decision-making and enhance model deployment\npotential. To gain a comprehensive picture of the usefulness of existing UQ\nmethods for traffic prediction and the relation between obtained uncertainties\nand city-wide traffic dynamics, we investigate their application to a\nlarge-scale image-based traffic dataset spanning multiple cities and time\nperiods. We compare two epistemic and two aleatoric UQ methods on both temporal\nand spatio-temporal transfer tasks, and find that meaningful uncertainty\nestimates can be recovered. We further demonstrate how uncertainty estimates\ncan be employed for unsupervised outlier detection on changes in city traffic\ndynamics. We find that our approach can capture both temporal and spatial\neffects on traffic behaviour in a representative case study for the city of\nMoscow. Our work presents a further step towards boosting uncertainty awareness\nin traffic prediction tasks, and aims to highlight the value contribution of UQ\nmethods to a better understanding of city traffic dynamics.\n","authors":["Alexander Timans","Nina Wiedemann","Nishant Kumar","Ye Hong","Martin Raubal"],"pdf_url":"https://arxiv.org/pdf/2308.06129v1.pdf","comment":"39 pages, 22 figures. 
Code publicly available at:\n https://github.com/alextimans/traffic4cast-uncertainty"},{"id":"http://arxiv.org/abs/2308.06127v1","updated":"2023-08-11T13:33:59Z","published":"2023-08-11T13:33:59Z","title":"Learning Control Policies for Variable Objectives from Offline Data","summary":" Offline reinforcement learning provides a viable approach to obtain advanced\ncontrol strategies for dynamical systems, in particular when direct interaction\nwith the environment is not available. In this paper, we introduce a conceptual\nextension for model-based policy search methods, called variable objective\npolicy (VOP). With this approach, policies are trained to generalize\nefficiently over a variety of objectives, which parameterize the reward\nfunction. We demonstrate that by altering the objectives passed as input to the\npolicy, users gain the freedom to adjust its behavior or re-balance\noptimization targets at runtime, without need for collecting additional\nobservation batches or re-training.\n","authors":["Marc Weber","Phillip Swazinna","Daniel Hein","Steffen Udluft","Volkmar Sterzing"],"pdf_url":"https://arxiv.org/pdf/2308.06127v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2202.04936v4","updated":"2023-08-11T13:05:07Z","published":"2022-02-10T10:06:22Z","title":"Robust Graph Representation Learning for Local Corruption Recovery","summary":" The performance of graph representation learning is affected by the quality\nof graph input. While existing research usually pursues a globally smoothed\ngraph embedding, we believe the rarely observed anomalies are as well harmful\nto an accurate prediction. This work establishes a graph learning scheme that\nautomatically detects (locally) corrupted feature attributes and recovers\nrobust embedding for prediction tasks. The detection operation leverages a\ngraph autoencoder, which does not make any assumptions about the distribution\nof the local corruptions. It pinpoints the positions of the anomalous node\nattributes in an unbiased mask matrix, where robust estimations are recovered\nwith sparsity promoting regularizer. The optimizer approaches a new embedding\nthat is sparse in the framelet domain and conditionally close to input\nobservations. Extensive experiments are provided to validate our proposed model\ncan recover a robust graph representation from black-box poisoning and achieve\nexcellent performance.\n","authors":["Bingxin Zhou","Yuanhong Jiang","Yu Guang Wang","Jingwei Liang","Junbin Gao","Shirui Pan","Xiaoqun Zhang"],"pdf_url":"https://arxiv.org/pdf/2202.04936v4.pdf","comment":"WWW '23: Proceedings of the ACM Web Conference 2023"},{"id":"http://arxiv.org/abs/2308.06106v1","updated":"2023-08-11T12:43:43Z","published":"2023-08-11T12:43:43Z","title":"Hawkes Processes with Delayed Granger Causality","summary":" We aim to explicitly model the delayed Granger causal effects based on\nmultivariate Hawkes processes. The idea is inspired by the fact that a causal\nevent usually takes some time to exert an effect. Studying this time lag itself\nis of interest. Given the proposed model, we first prove the identifiability of\nthe delay parameter under mild conditions. We further investigate a model\nestimation method under a complex setting, where we want to infer the posterior\ndistribution of the time lags and understand how this distribution varies\nacross different scenarios. 
We treat the time lags as latent variables and\nformulate a Variational Auto-Encoder (VAE) algorithm to approximate the\nposterior distribution of the time lags. By explicitly modeling the time lags\nin Hawkes processes, we add flexibility to the model. The inferred time-lag\nposterior distributions are of scientific meaning and help trace the original\ncausal time that supports the root cause analysis. We empirically evaluate our\nmodel's event prediction and time-lag inference accuracy on synthetic and real\ndata, achieving promising results.\n","authors":["Chao Yang","Hengyuan Miao","Shuang Li"],"pdf_url":"https://arxiv.org/pdf/2308.06106v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2303.15109v2","updated":"2023-08-11T12:27:42Z","published":"2023-03-27T11:26:34Z","title":"Improving the Transferability of Adversarial Examples via Direction\n Tuning","summary":" In transfer-based adversarial attacks, adversarial examples are only\ngenerated by the surrogate models and achieve effective perturbation in the\nvictim models. Although considerable effort has been devoted to improving\nthe transferability of adversarial examples generated by transfer-based\nadversarial attacks, our investigation found that the large deviation between\nthe actual and steepest update directions of the current transfer-based\nadversarial attacks is caused by the large update step length, which prevents the\ngenerated adversarial examples from converging well. However, directly\nreducing the update step length will lead to serious update oscillation so that\nthe generated adversarial examples also cannot achieve great transferability\nto the victim models. To address these issues, a novel transfer-based attack,\nnamely the direction tuning attack, is proposed to not only decrease the update\ndeviation in the large step length, but also mitigate the update oscillation in\nthe small sampling step length, thereby making the generated adversarial\nexamples converge well to achieve great transferability on victim models. In\naddition, a network pruning method is proposed to smooth the decision boundary,\nthereby further decreasing the update oscillation and enhancing the\ntransferability of the generated adversarial examples. The experiment results\non ImageNet demonstrate that the average attack success rate (ASR) of the\nadversarial examples generated by our method can be improved from 87.9\\% to\n94.5\\% on five victim models without defenses, and from 69.1\\% to 76.2\\% on\neight advanced defense methods, in comparison with the latest\ngradient-based attacks.\n","authors":["Xiangyuan Yang","Jie Lin","Hanlin Zhang","Xinyu Yang","Peng Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.15109v2.pdf","comment":"Accepted by INS 2023"},{"id":"http://arxiv.org/abs/2308.06103v1","updated":"2023-08-11T12:27:22Z","published":"2023-08-11T12:27:22Z","title":"Composable Function-preserving Expansions for Transformer Architectures","summary":" Training state-of-the-art neural networks requires a high cost in terms of\ncompute and time. Model scale is recognized to be a critical factor to achieve\nand improve the state-of-the-art. Increasing the scale of a neural network\nnormally requires restarting from scratch by randomly initializing all the\nparameters of the model, as this implies a change of the architecture's parameters\nthat does not allow for a straightforward transfer of knowledge from smaller\nsize models.
In this work, we propose six composable transformations to\nincrementally increase the size of transformer-based neural networks while\npreserving functionality, allowing to expand the capacity of the model as\nneeded. We provide proof of exact function preservation under minimal\ninitialization constraints for each transformation. The proposed methods may\nenable efficient training pipelines for larger and more powerful models by\nprogressively expanding the architecture throughout training.\n","authors":["Andrea Gesmundo","Kaitlin Maile"],"pdf_url":"https://arxiv.org/pdf/2308.06103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06100v1","updated":"2023-08-11T12:22:37Z","published":"2023-08-11T12:22:37Z","title":"Diffusion-based Visual Counterfactual Explanations -- Towards Systematic\n Quantitative Evaluation","summary":" Latest methods for visual counterfactual explanations (VCE) harness the power\nof deep generative models to synthesize new examples of high-dimensional images\nof impressive quality. However, it is currently difficult to compare the\nperformance of these VCE methods as the evaluation procedures largely vary and\noften boil down to visual inspection of individual examples and small scale\nuser studies. In this work, we propose a framework for systematic, quantitative\nevaluation of the VCE methods and a minimal set of metrics to be used. We use\nthis framework to explore the effects of certain crucial design choices in the\nlatest diffusion-based generative models for VCEs of natural image\nclassification (ImageNet). We conduct a battery of ablation-like experiments,\ngenerating thousands of VCEs for a suite of classifiers of various complexity,\naccuracy and robustness. Our findings suggest multiple directions for future\nadvancements and improvements of VCE methods. By sharing our methodology and\nour approach to tackle the computational challenges of such a study on a\nlimited hardware setup (including the complete code base), we offer a valuable\nguidance for researchers in the field fostering consistency and transparency in\nthe assessment of counterfactual explanations.\n","authors":["Philipp Vaeth","Alexander M. Fruehwald","Benjamin Paassen","Magda Gregorova"],"pdf_url":"https://arxiv.org/pdf/2308.06100v1.pdf","comment":"Accepted at the 5th International Workshop on eXplainable Knowledge\n Discovery in Data Mining @ ECML 2023"},{"id":"http://arxiv.org/abs/2306.04542v2","updated":"2023-08-11T12:20:50Z","published":"2023-06-07T15:46:47Z","title":"On the Design Fundamentals of Diffusion Models: A Survey","summary":" Diffusion models are generative models, which gradually add and remove noise\nto learn the underlying distribution of training data for data generation. The\ncomponents of diffusion models have gained significant attention with many\ndesign choices proposed. Existing reviews have primarily focused on\nhigher-level solutions, thereby covering less on the design fundamentals of\ncomponents. 
This study seeks to address this gap by providing a comprehensive\nand coherent review on component-wise design choices in diffusion models.\nSpecifically, we organize this review according to their three key components,\nnamely the forward process, the reverse process, and the sampling procedure.\nThis allows us to provide a fine-grained perspective of diffusion models,\nbenefiting future studies in the analysis of individual components, the\napplicability of design choices, and the implementation of diffusion models.\n","authors":["Ziyi Chang","George Alex Koulieris","Hubert P. H. Shum"],"pdf_url":"https://arxiv.org/pdf/2306.04542v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06095v1","updated":"2023-08-11T12:07:45Z","published":"2023-08-11T12:07:45Z","title":"Neural Conversation Models and How to Rein Them in: A Survey of Failures\n and Fixes","summary":" Recent conditional language models are able to continue any kind of text\nsource in an often seemingly fluent way. This fact encouraged research in the\narea of open-domain conversational systems that are based on powerful language\nmodels and aim to imitate an interlocutor by generating appropriate\ncontributions to a written dialogue. From a linguistic perspective, however,\nthe complexity of contributing to a conversation is high. In this survey, we\ninterpret Grice's maxims of cooperative conversation from the perspective of\nthis specific research area and systematize the literature under the aspect of\nwhat makes a contribution appropriate: A neural conversation model has to be\nfluent, informative, consistent, coherent, and follow social norms. In order to\nensure these qualities, recent approaches try to tame the underlying language\nmodels at various intervention points, such as data, training regime or\ndecoding. Sorted by these categories and intervention points, we discuss\npromising attempts and suggest novel ways for future research.\n","authors":["Fabian Galetzka","Anne Beyer","David Schlangen"],"pdf_url":"https://arxiv.org/pdf/2308.06095v1.pdf","comment":"Represents the state of the field in 2022; partially based on the\n first authors 2022 PhD thesis"},{"id":"http://arxiv.org/abs/2308.06094v1","updated":"2023-08-11T12:05:32Z","published":"2023-08-11T12:05:32Z","title":"Reinforcement Logic Rule Learning for Temporal Point Processes","summary":" We propose a framework that can incrementally expand the explanatory temporal\nlogic rule set to explain the occurrence of temporal events. Leveraging the\ntemporal point process modeling and learning framework, the rule content and\nweights will be gradually optimized until the likelihood of the observational\nevent sequences is optimal. The proposed algorithm alternates between a master\nproblem, where the current rule set weights are updated, and a subproblem,\nwhere a new rule is searched and included to best increase the likelihood. The\nformulated master problem is convex and relatively easy to solve using\ncontinuous optimization, whereas the subproblem requires searching the huge\ncombinatorial rule predicate and relationship space. To tackle this challenge,\nwe propose a neural search policy to learn to generate the new rule content as\na sequence of actions. The policy parameters will be trained end-to-end using\nthe reinforcement learning framework, where the reward signals can be\nefficiently queried by evaluating the subproblem objective. The trained policy\ncan be used to generate new rules in a controllable way. 
We evaluate our\nmethods on both synthetic and real healthcare datasets, obtaining promising\nresults.\n","authors":["Chao Yang","Lu Wang","Kun Gao","Shuang Li"],"pdf_url":"https://arxiv.org/pdf/2308.06094v1.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2308.06093v1","updated":"2023-08-11T12:05:12Z","published":"2023-08-11T12:05:12Z","title":"Experts Weights Averaging: A New General Training Scheme for Vision\n Transformers","summary":" Structural re-parameterization is a general training scheme for Convolutional\nNeural Networks (CNNs), which achieves performance improvement without\nincreasing inference cost. As Vision Transformers (ViTs) are gradually\nsurpassing CNNs in various visual tasks, one may question: if a training scheme\nspecifically for ViTs exists that can also achieve performance improvement\nwithout increasing inference cost? Recently, Mixture-of-Experts (MoE) has\nattracted increasing attention, as it can efficiently scale up the capacity of\nTransformers at a fixed cost through sparsely activated experts. Considering\nthat MoE can also be viewed as a multi-branch structure, can we utilize MoE to\nimplement a ViT training scheme similar to structural re-parameterization? In\nthis paper, we affirmatively answer these questions, with a new general\ntraining strategy for ViTs. Specifically, we decouple the training and\ninference phases of ViTs. During training, we replace some Feed-Forward\nNetworks (FFNs) of the ViT with specially designed, more efficient MoEs that\nassign tokens to experts by random uniform partition, and perform Experts\nWeights Averaging (EWA) on these MoEs at the end of each iteration. After\ntraining, we convert each MoE into an FFN by averaging the experts,\ntransforming the model back into original ViT for inference. We further provide\na theoretical analysis to show why and how it works. Comprehensive experiments\nacross various 2D and 3D visual tasks, ViT architectures, and datasets validate\nthe effectiveness and generalizability of the proposed training scheme.\nBesides, our training scheme can also be applied to improve performance when\nfine-tuning ViTs. Lastly, but equally important, the proposed EWA technique can\nsignificantly improve the effectiveness of naive MoE in various 2D visual small\ndatasets and 3D visual tasks.\n","authors":["Yongqi Huang","Peng Ye","Xiaoshui Huang","Sheng Li","Tao Chen","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2308.06093v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.06091v1","updated":"2023-08-11T12:04:36Z","published":"2023-08-11T12:04:36Z","title":"Toward a Better Understanding of Loss Functions for Collaborative\n Filtering","summary":" Collaborative filtering (CF) is a pivotal technique in modern recommender\nsystems. The learning process of CF models typically consists of three\ncomponents: interaction encoder, loss function, and negative sampling. Although\nmany existing studies have proposed various CF models to design sophisticated\ninteraction encoders, recent work shows that simply reformulating the loss\nfunctions can achieve significant performance gains. This paper delves into\nanalyzing the relationship among existing loss functions. 
Our mathematical\nanalysis reveals that the previous loss functions can be interpreted as\nalignment and uniformity functions: (i) the alignment matches user and item\nrepresentations, and (ii) the uniformity disperses user and item distributions.\nInspired by this analysis, we propose Margin-aware Alignment and Weighted Uniformity (MAWU), a novel loss function that improves the\ndesign of alignment and uniformity by considering the unique patterns of each dataset. The key novelty\nof MAWU is two-fold: (i) margin-aware alignment (MA) mitigates\nuser/item-specific popularity biases, and (ii) weighted uniformity (WU) adjusts\nthe significance between user and item uniformities to reflect the inherent\ncharacteristics of datasets. Extensive experimental results show that MF and\nLightGCN equipped with MAWU are comparable or superior to state-of-the-art CF\nmodels with various loss functions on three public datasets.\n","authors":["Seongmin Park","Mincheol Yoon","Jae-woong Lee","Hogun Park","Jongwuk Lee"],"pdf_url":"https://arxiv.org/pdf/2308.06091v1.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2210.16192v3","updated":"2023-08-11T11:58:28Z","published":"2022-10-27T12:59:00Z","title":"Pretraining Respiratory Sound Representations using Metadata and\n Contrastive Learning","summary":" Methods based on supervised learning using annotations in an end-to-end\nfashion have been the state-of-the-art for classification problems. However,\nthey may be limited in their generalization capability, especially in the low\ndata regime. In this study, we address this issue using supervised contrastive\nlearning combined with available metadata to solve multiple pretext tasks that\nlearn a good representation of data. We apply our approach to respiratory sound\nclassification. This task is suited for this setting as demographic information\nsuch as sex and age are correlated with the presence of lung diseases, and learning\na system that implicitly encodes this information may better detect anomalies.\nSupervised contrastive learning is a paradigm that learns similar\nrepresentations for samples sharing the same class labels and dissimilar\nrepresentations for samples with different class labels. The feature extractor\nlearned using this paradigm extracts useful features from the data, and we show\nthat it outperforms cross-entropy in classifying respiratory anomalies in two\ndifferent datasets. We also show that learning representations using only\nmetadata, without class labels, obtains similar performance to using cross\nentropy with those labels only. In addition, when combining class labels with\nmetadata using multiple supervised contrastive learning, an extension of\nsupervised contrastive learning solving an additional task of grouping patients\nwithin the same sex and age group, more informative features are learned. This\nwork suggests the potential of using multiple metadata sources in supervised\ncontrastive settings, in particular in settings with class imbalance and few\ndata. Our code is released at https://github.com/ilyassmoummad/scl_icbhi2017\n","authors":["Ilyass Moummad","Nicolas Farrugia"],"pdf_url":"https://arxiv.org/pdf/2210.16192v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.07831v3","updated":"2023-08-11T11:50:02Z","published":"2022-03-15T12:40:10Z","title":"Graph Neural Network Sensitivity Under Probabilistic Error Model","summary":" Graph convolutional networks (GCNs) can successfully learn the graph signal\nrepresentation by graph convolution.
The graph convolution depends on the graph\nfilter, which contains the topological dependency of data and propagates data\nfeatures. However, the estimation errors in the propagation matrix (e.g., the\nadjacency matrix) can have a significant impact on graph filters and GCNs. In\nthis paper, we study the effect of a probabilistic graph error model on the\nperformance of the GCNs. We prove that the adjacency matrix under the error\nmodel is bounded by a function of graph size and error probability. We further\nanalytically specify the upper bound of a normalized adjacency matrix with\nself-loop added. Finally, we illustrate the error bounds by running experiments\non a synthetic dataset and study the sensitivity of a simple GCN under this\nprobabilistic error model on accuracy.\n","authors":["Xinjue Wang","Esa Ollila","Sergiy A. Vorobyov"],"pdf_url":"https://arxiv.org/pdf/2203.07831v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06069v1","updated":"2023-08-11T11:09:06Z","published":"2023-08-11T11:09:06Z","title":"Safeguarding Learning-based Control for Smart Energy Systems with\n Sampling Specifications","summary":" We study challenges using reinforcement learning in controlling energy\nsystems, where apart from performance requirements, one has additional safety\nrequirements such as avoiding blackouts. We detail how these safety\nrequirements in real-time temporal logic can be strengthened via discretization\ninto linear temporal logic (LTL), such that the satisfaction of the LTL\nformulae implies the satisfaction of the original safety requirements. The\ndiscretization enables advanced engineering methods such as synthesizing\nshields for safe reinforcement learning as well as formal verification, where\nfor statistical model checking, the probabilistic guarantee acquired by LTL\nmodel checking forms a lower bound for the satisfaction of the original\nreal-time safety requirements.\n","authors":["Chih-Hong Cheng","Venkatesh Prasad Venkataramanan","Pragya Kirti Gupta","Yun-Fei Hsu","Simon Burton"],"pdf_url":"https://arxiv.org/pdf/2308.06069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.15157v3","updated":"2023-08-11T11:06:09Z","published":"2022-06-30T09:40:05Z","title":"HRFuser: A Multi-resolution Sensor Fusion Architecture for 2D Object\n Detection","summary":" Besides standard cameras, autonomous vehicles typically include multiple\nadditional sensors, such as lidars and radars, which help acquire richer\ninformation for perceiving the content of the driving scene. While several\nrecent works focus on fusing certain pairs of sensors - such as camera with\nlidar or radar - by using architectural components specific to the examined\nsetting, a generic and modular sensor fusion architecture is missing from the\nliterature. In this work, we propose HRFuser, a modular architecture for\nmulti-modal 2D object detection. It fuses multiple sensors in a\nmulti-resolution fashion and scales to an arbitrary number of input modalities.\nThe design of HRFuser is based on state-of-the-art high-resolution networks for\nimage-only dense prediction and incorporates a novel multi-window\ncross-attention block as the means to perform fusion of multiple modalities at\nmultiple resolutions. 
We demonstrate via extensive experiments on nuScenes and\nthe adverse conditions DENSE datasets that our model effectively leverages\ncomplementary features from additional modalities, substantially improving upon\ncamera-only performance and consistently outperforming state-of-the-art 3D and\n2D fusion methods evaluated on 2D object detection metrics. The source code is\npublicly available.\n","authors":["Tim Broedermann","Christos Sakaridis","Dengxin Dai","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2206.15157v3.pdf","comment":"IEEE International Conference on Intelligent Transportation Systems\n (ITSC) 2023"},{"id":"http://arxiv.org/abs/2305.07500v2","updated":"2023-08-11T10:49:52Z","published":"2023-05-12T14:14:39Z","title":"Learning representations that are closed-form Monge mapping optimal with\n application to domain adaptation","summary":" Optimal transport (OT) is a powerful geometric tool used to compare and align\nprobability measures following the least effort principle. Despite its\nwidespread use in machine learning (ML), the OT problem still bears a significant\ncomputational burden, while at the same time suffering from the curse of\ndimensionality for measures supported on general high-dimensional spaces. In\nthis paper, we propose to tackle these challenges using representation\nlearning. In particular, we seek to learn an embedding space such that the\nsamples of the two input measures become alignable in it with a simple affine\nmapping that can be calculated efficiently in closed form. We then show that\nsuch an approach leads to results that are comparable to solving the original OT\nproblem when applied to the transfer learning task on which many OT baselines\nwere previously evaluated, in both homogeneous and heterogeneous DA settings.\nThe code for our contribution is available at\n\\url{https://github.com/Oleffa/LaOT}.\n","authors":["Oliver Struckmeier","Ievgen Redko","Anton Mallasto","Karol Arndt","Markus Heinonen","Ville Kyrki"],"pdf_url":"https://arxiv.org/pdf/2305.07500v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.07306v6","updated":"2023-08-11T10:46:29Z","published":"2021-06-14T11:23:59Z","title":"Constraining Linear-chain CRFs to Regular Languages","summary":" A major challenge in structured prediction is to represent the\ninterdependencies within output structures. When outputs are structured as\nsequences, linear-chain conditional random fields (CRFs) are a widely used\nmodel class which can learn \\textit{local} dependencies in the output. However,\nthe CRF's Markov assumption makes it impossible for CRFs to represent\ndistributions with \\textit{nonlocal} dependencies, and standard CRFs are unable\nto respect nonlocal constraints of the data (such as global arity constraints\non output labels). We present a generalization of CRFs that can enforce a broad\nclass of constraints, including nonlocal ones, by specifying the space of\npossible output structures as a regular language $\\mathcal{L}$. The resulting\nregular-constrained CRF (RegCCRF) has the same formal properties as a standard\nCRF, but assigns zero probability to all label sequences not in $\\mathcal{L}$.\nNotably, RegCCRFs can incorporate their constraints during training, while\nrelated models only enforce constraints during decoding. We prove that\nconstrained training is never worse than constrained decoding, and show\nempirically that it can be substantially better in practice.
Additionally, we\ndemonstrate a practical benefit on downstream tasks by incorporating a RegCCRF\ninto a deep neural model for semantic role labeling, exceeding state-of-the-art\nresults on a standard dataset.\n","authors":["Sean Papay","Roman Klinger","Sebastian Padó"],"pdf_url":"https://arxiv.org/pdf/2106.07306v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06058v1","updated":"2023-08-11T10:17:29Z","published":"2023-08-11T10:17:29Z","title":"Adaptive SGD with Polyak stepsize and Line-search: Robust Convergence\n and Variance Reduction","summary":" The recently proposed stochastic Polyak stepsize (SPS) and stochastic\nline-search (SLS) for SGD have shown remarkable effectiveness when training\nover-parameterized models. However, in non-interpolation settings, both\nalgorithms only guarantee convergence to a neighborhood of a solution which may\nresult in a worse output than the initial guess. While artificially decreasing\nthe adaptive stepsize has been proposed to address this issue (Orvieto et al.\n[2022]), this approach results in slower convergence rates for convex and\nover-parameterized models. In this work, we make two contributions: Firstly, we\npropose two new variants of SPS and SLS, called AdaSPS and AdaSLS, which\nguarantee convergence in non-interpolation settings and maintain sub-linear and\nlinear convergence rates for convex and strongly convex functions when training\nover-parameterized models. AdaSLS requires no knowledge of problem-dependent\nparameters, and AdaSPS requires only a lower bound of the optimal function\nvalue as input. Secondly, we equip AdaSPS and AdaSLS with a novel variance\nreduction technique and obtain algorithms that require\n$\\smash{\\widetilde{\\mathcal{O}}}(n+1/\\epsilon)$ gradient evaluations to achieve\nan $\\mathcal{O}(\\epsilon)$-suboptimality for convex functions, which improves\nupon the slower $\\mathcal{O}(1/\\epsilon^2)$ rates of AdaSPS and AdaSLS without\nvariance reduction in the non-interpolation regimes. Moreover, our result\nmatches the fast rates of AdaSVRG but removes the inner-outer-loop structure,\nwhich is easier to implement and analyze. Finally, numerical experiments on\nsynthetic and real datasets validate our theory and demonstrate the\neffectiveness and robustness of our algorithms.\n","authors":["Xiaowen Jiang","Sebastian U. Stich"],"pdf_url":"https://arxiv.org/pdf/2308.06058v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15939v2","updated":"2023-08-11T10:11:25Z","published":"2023-03-28T12:52:40Z","title":"Generating artificial digital image correlation data using\n physics-guided adversarial networks","summary":" Digital image correlation (DIC) has become a valuable tool in the evaluation\nof mechanical experiments, particularly fatigue crack growth experiments. The\nevaluation requires accurate information of the crack path and crack tip\nposition, which is difficult to obtain due to inherent noise and artefacts.\nMachine learning models have been extremely successful in recognizing this\nrelevant information. But for the training of robust models, which generalize\nwell, big data is needed. However, data is typically scarce in the field of\nmaterial science and engineering because experiments are expensive and\ntime-consuming. We present a method to generate synthetic DIC data using\ngenerative adversarial networks with a physics-guided discriminator. To decide\nwhether data samples are real or fake, this discriminator additionally receives\nthe derived von Mises equivalent strain. 
We show that this physics-guided\napproach leads to improved results in terms of visual quality of samples,\nsliced Wasserstein distance, and geometry score.\n","authors":["David Melching","Erik Schultheis","Eric Breitbarth"],"pdf_url":"https://arxiv.org/pdf/2303.15939v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06053v1","updated":"2023-08-11T10:05:53Z","published":"2023-08-11T10:05:53Z","title":"Cost-effective On-device Continual Learning over Memory Hierarchy with\n Miro","summary":" Continual learning (CL) trains NN models incrementally from a continuous\nstream of tasks. To remember previously learned knowledge, prior studies store\nold samples over a memory hierarchy and replay them when new tasks arrive. Edge\ndevices that adopt CL to preserve data privacy are typically energy-sensitive\nand thus require high model accuracy while not compromising energy efficiency,\ni.e., cost-effectiveness. Our work is the first to explore the design space of\nhierarchical memory replay-based CL to gain insights into achieving\ncost-effectiveness on edge devices. We present Miro, a novel system runtime\nthat carefully integrates our insights into the CL framework by enabling it to\ndynamically configure the CL system based on resource states for the best\ncost-effectiveness. To reach this goal, Miro also performs online profiling on\nparameters with clear accuracy-energy trade-offs and adapts to optimal values\nwith low overhead. Extensive evaluations show that Miro significantly\noutperforms baseline systems we build for comparison, consistently achieving\nhigher cost-effectiveness.\n","authors":["Xinyue Ma","Suyeon Jeong","Minjia Zhang","Di Wang","Jonghyun Choi","Myeongjae Jeon"],"pdf_url":"https://arxiv.org/pdf/2308.06053v1.pdf","comment":"This paper is submitted for publication to MobiCom 2023"},{"id":"http://arxiv.org/abs/2303.16618v2","updated":"2023-08-11T10:01:35Z","published":"2023-03-29T12:19:23Z","title":"Personalised Language Modelling of Screen Characters Using Rich Metadata\n Annotations","summary":" Language models that are sensitive to external context can more effectively\ncapture the speaking patterns of individuals with specific characteristics or\nin particular environments. However, obtaining and leveraging such annotations\ncan be challenging. In this work, we show how to leverage rich character and\nfilm annotations to personalise language models in a scalable manner. Our best\nmodel can reduce perplexity by up to 6.5% compared to a parameter-matched\nlanguage model. Our approach performs on par with speaker-specific fine-tuning\nwhen the fine-tuning data (i.e. past dialogue) for individual speakers is\navailable. On top of that, it also generalises well to a scenario with no such\ndata, relying on combinations of demographic characteristics expressed via\nmetadata. Our findings are consistent across two corpora, one of which is also\na contribution of this paper: Cornell-rich contains rich manual annotations for\n863 speaking characters from the Cornell Movie Dialog Corpus, including\nfeatures such as characteristic quotes and character descriptions, along with\nsix automatically extracted metadata features for over 95% of the featured\nfilms. 
Finally, we also present a cost-benefit analysis highlighting which\nannotations are most cost-effective in reducing perplexity.\n","authors":["Sebastian Vincent","Rowanne Sumner","Alice Dowek","Charlotte Blundell","Emily Preston","Chris Bayliss","Chris Oakley","Carolina Scarton"],"pdf_url":"https://arxiv.org/pdf/2303.16618v2.pdf","comment":"9 pages; 4 figures; 6 tables. Preprint"},{"id":"http://arxiv.org/abs/2303.08032v2","updated":"2023-08-11T09:59:07Z","published":"2023-03-14T16:11:47Z","title":"Verifying the Robustness of Automatic Credibility Assessment","summary":" Text classification methods have been widely investigated as a way to detect\ncontent of low credibility: fake news, social media bots, propaganda, etc.\nQuite accurate models (likely based on deep neural networks) help in moderating\npublic electronic platforms and often cause content creators to face rejection\nof their submissions or removal of already published texts. Having the\nincentive to evade further detection, content creators try to come up with a\nslightly modified version of the text (known as an attack with an adversarial\nexample) that exploits the weaknesses of classifiers and results in a different\noutput. Here we systematically test the robustness of popular text classifiers\nagainst available attacking techniques and discover that, indeed, in some cases\ninsignificant changes in input text can mislead the models. We also introduce\nBODEGA: a benchmark for testing both victim models and attack methods on four\nmisinformation detection tasks in an evaluation framework designed to simulate\nreal use-cases of content moderation. Finally, we manually analyse a subset of\nadversarial examples and check what kinds of modifications are used in\nsuccessful attacks. The BODEGA code and data are openly shared in the hope of\nenhancing the comparability and replicability of further research in this area.\n","authors":["Piotr Przybyła","Alexander Shvets","Horacio Saggion"],"pdf_url":"https://arxiv.org/pdf/2303.08032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06051v1","updated":"2023-08-11T09:58:47Z","published":"2023-08-11T09:58:47Z","title":"Towards Instance-adaptive Inference for Federated Learning","summary":" Federated learning (FL) is a distributed learning paradigm that enables\nmultiple clients to learn a powerful global model by aggregating local\ntraining. However, the performance of the global model is often hampered by\nnon-i.i.d. distribution among the clients, requiring extensive efforts to\nmitigate inter-client data heterogeneity. Going beyond inter-client data\nheterogeneity, we note that intra-client heterogeneity can also be observed on\ncomplex real-world data and seriously deteriorate FL performance. In this\npaper, we present a novel FL algorithm, i.e., FedIns, to handle intra-client\ndata heterogeneity by enabling instance-adaptive inference in the FL framework.\nInstead of huge instance-adaptive models, we resort to a parameter-efficient\nfine-tuning method, i.e., scale and shift deep features (SSF), upon a\npre-trained model. Specifically, we first train an SSF pool for each client,\nand aggregate these SSF pools on the server side, thus still maintaining a low\ncommunication cost. To enable instance-adaptive inference, for a given\ninstance, we dynamically find the best-matched SSF subsets from the pool and\naggregate them to generate an adaptive SSF specified for the instance, thereby\nreducing the intra-client as well as the inter-client heterogeneity.
Extensive\nexperiments show that our FedIns outperforms state-of-the-art FL algorithms,\ne.g., a 6.64\\% improvement against the top-performing method with less than\n15\\% communication cost on Tiny-ImageNet. Our code and models will be publicly\nreleased.\n","authors":["Chun-Mei Feng","Kai Yu","Nian Liu","Xinxing Xu","Salman Khan","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.06051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.03063v2","updated":"2023-08-11T09:31:23Z","published":"2022-08-05T09:36:55Z","title":"Enhancing the Robustness via Adversarial Learning and Joint\n Spatial-Temporal Embeddings in Traffic Forecasting","summary":" Traffic forecasting is an essential problem in urban planning and computing.\nThe complex dynamic spatial-temporal dependencies among traffic objects (e.g.,\nsensors and road segments) have been calling for highly flexible models;\nunfortunately, sophisticated models may suffer from poor robustness especially\nin capturing the trend of the time series (1st-order derivatives with time),\nleading to unrealistic forecasts. To address the challenge of balancing\ndynamics and robustness, we propose TrendGCN, a new scheme that extends the\nflexibility of GCNs and the distribution-preserving capacity of generative and\nadversarial loss for handling sequential data with inherent statistical\ncorrelations. On the one hand, our model simultaneously incorporates spatial\n(node-wise) embeddings and temporal (time-wise) embeddings to account for\nheterogeneous space-and-time convolutions; on the other hand, it uses GAN\nstructure to systematically evaluate statistical consistencies between the real\nand the predicted time series in terms of both the temporal trending and the\ncomplex spatial-temporal dependencies. Compared with traditional approaches\nthat handle step-wise predictive errors independently, our approach can produce\nmore realistic and robust forecasts. Experiments on six benchmark traffic\nforecasting datasets and theoretical analysis both demonstrate the superiority\nand the state-of-the-art performance of TrendGCN. Source code is available at\nhttps://github.com/juyongjiang/TrendGCN.\n","authors":["Juyong Jiang","Binqing Wu","Ling Chen","Kai Zhang","Sunghun Kim"],"pdf_url":"https://arxiv.org/pdf/2208.03063v2.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2204.08247v3","updated":"2023-08-11T09:30:41Z","published":"2022-04-18T10:50:03Z","title":"Joint Multi-view Unsupervised Feature Selection and Graph Learning","summary":" Despite significant progress, previous multi-view unsupervised feature\nselection methods mostly suffer from two limitations. First, they generally\nutilize either cluster structure or similarity structure to guide the feature\nselection, which neglect the possibility of a joint formulation with mutual\nbenefits. Second, they often learn the similarity structure by either global\nstructure learning or local structure learning, which lack the capability of\ngraph learning with both global and local structural awareness. In light of\nthis, this paper presents a joint multi-view unsupervised feature selection and\ngraph learning (JMVFG) approach. Particularly, we formulate the multi-view\nfeature selection with orthogonal decomposition, where each target matrix is\ndecomposed into a view-specific basis matrix and a view-consistent cluster\nindicator. 
The cross-space locality preservation is incorporated to bridge the\ncluster structure learning in the projected space and the similarity learning\n(i.e., graph learning) in the original space. Further, a unified objective\nfunction is presented to enable the simultaneous learning of the cluster\nstructure, the global and local similarity structures, and the multi-view\nconsistency and inconsistency, upon which an alternating optimization algorithm\nis developed with theoretically proved convergence. Extensive experiments on a\nvariety of real-world multi-view datasets demonstrate the superiority of our\napproach for both the multi-view feature selection and graph learning tasks.\nThe code is available at https://github.com/huangdonghere/JMVFG.\n","authors":["Si-Guo Fang","Dong Huang","Chang-Dong Wang","Yong Tang"],"pdf_url":"https://arxiv.org/pdf/2204.08247v3.pdf","comment":"To appear in IEEE Transactions on Emerging Topics in Computational\n Intelligence"},{"id":"http://arxiv.org/abs/2305.02640v4","updated":"2023-08-11T09:30:08Z","published":"2023-05-04T08:20:37Z","title":"Towards Causal Representation Learning and Deconfounding from Indefinite\n Data","summary":" Owing to the cross-pollination between causal discovery and deep learning,\nnon-statistical data (e.g., images, text, etc.) encounters significant\nconflicts in terms of properties and methods with traditional causal data. To\nunify these data types of varying forms, we redefine causal data from two novel\nperspectives and then propose three data paradigms. Among them, the indefinite\ndata (like dialogues or video sources) induce low sample utilization and\nincapability of the distribution assumption, both leading to the fact that\nlearning causal representation from indefinite data is, as of yet, largely\nunexplored. We design the causal strength variational model to address\nthese two problems. Specifically, we leverage the causal strength instead of\nindependent noise as the latent variable to construct the evidence lower bound. By\nthis design ethos, the causal strengths of different structures are regarded as\na distribution and can be expressed as a 2D matrix. Moreover, considering the\nlatent confounders, we disentangle the causal graph G into two relation\nsubgraphs O and C. O contains pure relations between observed variables, while\nC represents the relations from latent variables to observed variables. We\nimplement the above designs as a dynamic variational inference model, tailored\nto learn causal representation from indefinite data under latent confounding.\nFinally, we conduct comprehensive experiments on synthetic and real-world data\nto demonstrate the effectiveness of our method.\n","authors":["Hang Chen","Xinyu Yang","Qing Yang"],"pdf_url":"https://arxiv.org/pdf/2305.02640v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.01558v2","updated":"2023-08-11T09:20:55Z","published":"2022-04-04T15:05:45Z","title":"Con$^{2}$DA: Simplifying Semi-supervised Domain Adaptation by Learning\n Consistent and Contrastive Feature Representations","summary":" In this work, we present Con$^{2}$DA, a simple framework that extends recent\nadvances in semi-supervised learning to the semi-supervised domain adaptation\n(SSDA) problem. Our framework generates pairs of associated samples by\nperforming stochastic data transformations on a given input.
Associated data\npairs are mapped to a feature representation space using a feature extractor.\nWe use different loss functions to enforce consistency between the feature\nrepresentations of associated data pairs of samples. We show that these learned\nrepresentations are useful to deal with differences in data distributions in\nthe domain adaptation problem. We performed experiments to study the main\ncomponents of our model and we show that (i) learning of the consistent and\ncontrastive feature representations is crucial to extract good discriminative\nfeatures across different domains, and ii) our model benefits from the use of\nstrong augmentation policies. With these findings, our method achieves\nstate-of-the-art performances in three benchmark datasets for SSDA.\n","authors":["Manuel Pérez-Carrasco","Pavlos Protopapas","Guillermo Cabrera-Vives"],"pdf_url":"https://arxiv.org/pdf/2204.01558v2.pdf","comment":"Accepted to NeurIPS 2021 Workshop on Distribution Shifts: Connecting\n Methods and Applications"},{"id":"http://arxiv.org/abs/2206.04531v3","updated":"2023-08-11T09:11:59Z","published":"2022-06-09T14:25:23Z","title":"ECLAD: Extracting Concepts with Local Aggregated Descriptors","summary":" Convolutional neural networks (CNNs) are increasingly being used in critical\nsystems, where robustness and alignment are crucial. In this context, the field\nof explainable artificial intelligence has proposed the generation of\nhigh-level explanations of the prediction process of CNNs through concept\nextraction. While these methods can detect whether or not a concept is present\nin an image, they are unable to determine its location. What is more, a fair\ncomparison of such approaches is difficult due to a lack of proper validation\nprocedures. To address these issues, we propose a novel method for automatic\nconcept extraction and localization based on representations obtained through\npixel-wise aggregations of CNN activation maps. Further, we introduce a process\nfor the validation of concept-extraction techniques based on synthetic datasets\nwith pixel-wise annotations of their main components, reducing the need for\nhuman intervention. Extensive experimentation on both synthetic and real-world\ndatasets demonstrates that our method outperforms state-of-the-art\nalternatives.\n","authors":["Andres Felipe Posada-Moreno","Nikita Surya","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2206.04531v3.pdf","comment":"34 pages, under review"},{"id":"http://arxiv.org/abs/2205.13104v3","updated":"2023-08-11T09:09:45Z","published":"2022-05-26T01:54:48Z","title":"Trainable Weight Averaging: A General Approach for Subspace Training","summary":" Training deep neural networks (DNNs) in low-dimensional subspaces is a\npromising direction for achieving efficient training and better generalization\nperformance. Our previous work extracts the subspaces by performing the\ndimension reduction method over the training trajectory, which verifies that\nDNN could be well-trained in a tiny subspace. However, that method is\ninefficient for subspace extraction and numerically unstable, limiting its\napplicability to more general tasks. In this paper, we connect subspace\ntraining to weight averaging and propose \\emph{Trainable Weight Averaging}\n(TWA), a general approach for subspace training. TWA is efficient in terms of\nsubspace extraction and easy to use, making it a promising new optimizer for\nDNN's training. 
Our design also includes an efficient scheme that allows\nparallel training across multiple nodes to handle large-scale problems and\nevenly distribute the memory and computation burden to each node. TWA can be\nused for both efficient training and generalization enhancement, for different\nneural network architectures, and for various tasks from image classification\nand object detection, to neural language processing. The code of implementation\nis available at https://github.com/nblt/TWA, which includes extensive\nexperiments covering various benchmark computer vision and neural language\nprocessing tasks with various architectures.\n","authors":["Tao Li","Zhehao Huang","Yingwen Wu","Zhengbao He","Qinghua Tao","Xiaolin Huang","Chih-Jen Lin"],"pdf_url":"https://arxiv.org/pdf/2205.13104v3.pdf","comment":"Journal version in progress. Previously accepted to ICLR 2023"},{"id":"http://arxiv.org/abs/2211.12461v2","updated":"2023-08-11T09:09:10Z","published":"2022-11-22T18:19:10Z","title":"A Neural-Network-Based Convex Regularizer for Image Reconstruction","summary":" The emergence of deep-learning-based methods to solve image-reconstruction\nproblems has enabled a significant increase in reconstruction quality.\nUnfortunately, these new methods often lack reliability and explainability, and\nthere is a growing interest to address these shortcomings while retaining the\nboost in performance. In this work, we tackle this issue by revisiting\nregularizers that are the sum of convex-ridge functions. The gradient of such\nregularizers is parameterized by a neural network that has a single hidden\nlayer with increasing and learnable activation functions. This neural network\nis trained within a few minutes as a multistep Gaussian denoiser. The numerical\nexperiments for denoising, CT, and MRI reconstruction show improvements over\nmethods that offer similar reliability guarantees.\n","authors":["Alexis Goujon","Sebastian Neumayer","Pakshal Bohra","Stanislas Ducotterd","Michael Unser"],"pdf_url":"https://arxiv.org/pdf/2211.12461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06025v1","updated":"2023-08-11T09:07:38Z","published":"2023-08-11T09:07:38Z","title":"Controlling Character Motions without Observable Driving Source","summary":" How to generate diverse, life-like, and unlimited long head/body sequences\nwithout any driving source? We argue that this under-investigated research\nproblem is non-trivial at all, and has unique technical challenges behind it.\nWithout semantic constraints from the driving sources, using the standard\nautoregressive model to generate infinitely long sequences would easily result\nin 1) out-of-distribution (OOD) issue due to the accumulated error, 2)\ninsufficient diversity to produce natural and life-like motion sequences and 3)\nundesired periodic patterns along the time. To tackle the above challenges, we\npropose a systematic framework that marries the benefits of VQ-VAE and a novel\ntoken-level control policy trained with reinforcement learning using carefully\ndesigned reward functions. A high-level prior model can be easily injected on\ntop to generate unlimited long and diverse sequences. Although we focus on no\ndriving sources now, our framework can be generalized for controlled synthesis\nwith explicit driving sources. 
Through comprehensive evaluations, we conclude\nthat our proposed framework can address all the above-mentioned challenges and\noutperform other strong baselines very significantly.\n","authors":["Weiyuan Li","Bin Dai","Ziyi Zhou","Qi Yao","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.06025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06013v1","updated":"2023-08-11T08:41:00Z","published":"2023-08-11T08:41:00Z","title":"Large Language Models for Telecom: Forthcoming Impact on the Industry","summary":" Large Language Models (LLMs) have emerged as a transformative force,\nrevolutionizing numerous fields well beyond the conventional domain of Natural\nLanguage Processing (NLP) and garnering unprecedented attention. As LLM\ntechnology continues to progress, the telecom industry is facing the prospect\nof its potential impact on its landscape. To elucidate these implications, we\ndelve into the inner workings of LLMs, providing insights into their current\ncapabilities and limitations. We also examine the use cases that can be readily\nimplemented in the telecom industry, streamlining numerous tasks that currently\nhinder operational efficiency and demand significant manpower and engineering\nexpertise. Furthermore, we uncover essential research directions that deal with\nthe distinctive challenges of utilizing the LLMs within the telecom domain.\nAddressing these challenges represents a significant stride towards fully\nharnessing the potential of LLMs and unlocking their capabilities to the\nfullest extent within the telecom domain.\n","authors":["Ali Maatouk","Nicola Piovesan","Fadhel Ayed","Antonio De Domenico","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2308.06013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17271v2","updated":"2023-08-11T08:35:06Z","published":"2023-05-26T21:36:08Z","title":"Robust Lane Detection through Self Pre-training with Masked Sequential\n Autoencoders and Fine-tuning with Customized PolyLoss","summary":" Lane detection is crucial for vehicle localization which makes it the\nfoundation for automated driving and many intelligent and advanced driving\nassistant systems. Available vision-based lane detection methods do not make\nfull use of the valuable features and aggregate contextual information,\nespecially the interrelationships between lane lines and other regions of the\nimages in continuous frames. To fill this research gap and upgrade lane\ndetection performance, this paper proposes a pipeline consisting of self\npre-training with masked sequential autoencoders and fine-tuning with\ncustomized PolyLoss for the end-to-end neural network models using\nmulti-continuous image frames. The masked sequential autoencoders are adopted\nto pre-train the neural network models with reconstructing the missing pixels\nfrom a random masked image as the objective. Then, in the fine-tuning\nsegmentation phase where lane detection segmentation is performed, the\ncontinuous image frames are served as the inputs, and the pre-trained model\nweights are transferred and further updated using the backpropagation mechanism\nwith customized PolyLoss calculating the weighted errors between the output\nlane detection results and the labeled ground truth. 
Extensive experiment\nresults demonstrate that, with the proposed pipeline, the lane detection model\nperformance on both normal and challenging scenes can be advanced beyond the\nstate-of-the-art, delivering the best testing accuracy (98.38%), precision\n(0.937), and F1-measure (0.924) on the normal scene testing set, together with\nthe best overall accuracy (98.36%) and precision (0.844) in the challenging\nscene test set, while the training time can be substantially shortened.\n","authors":["Ruohan Li","Yongqi Dong"],"pdf_url":"https://arxiv.org/pdf/2305.17271v2.pdf","comment":"12 pages, 8 figures, accepted by journal of IEEE Transactions on\n Intelligent Transportation Systems"},{"id":"http://arxiv.org/abs/2011.07089v3","updated":"2023-08-11T08:30:55Z","published":"2020-11-13T19:04:24Z","title":"Robust Quadruped Jumping via Deep Reinforcement Learning","summary":" In this paper, we consider a general task of jumping varying distances and\nheights for a quadrupedal robot in noisy environments, such as off of uneven\nterrain and with variable robot dynamics parameters. To accurately jump in such\nconditions, we propose a framework using deep reinforcement learning that\nleverages and augments the complex solution of nonlinear trajectory\noptimization for quadrupedal jumping. While the standalone optimization limits\njumping to take-off from flat ground and requires accurate assumptions of robot\ndynamics, our proposed approach improves the robustness to allow jumping off of\nsignificantly uneven terrain with variable robot dynamical parameters and\nenvironmental conditions. Compared with walking and running, the realization of\naggressive jumping on hardware necessitates accounting for the motors'\ntorque-speed relationship as well as the robot's total power limits. By\nincorporating these constraints into our learning framework, we successfully\ndeploy our policy sim-to-real without further tuning, fully exploiting the\navailable onboard power supply and motors. We demonstrate robustness to\nenvironment noise of foot disturbances of up to 6 cm in height, or 33% of the\nrobot's nominal standing height, while jumping 2x the body length in distance.\n","authors":["Guillaume Bellegarda","Chuong Nguyen","Quan Nguyen"],"pdf_url":"https://arxiv.org/pdf/2011.07089v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03045v2","updated":"2023-08-11T08:22:07Z","published":"2023-08-06T08:14:35Z","title":"Machine learning methods for the search for L&T brown dwarfs in the data\n of modern sky surveys","summary":" According to various estimates, brown dwarfs (BD) should account for up to 25\npercent of all objects in the Galaxy. However, few of them are discovered and\nwell-studied, both individually and as a population. Homogeneous and complete\nsamples of brown dwarfs are needed for these kinds of studies. Due to their\nweakness, spectral studies of brown dwarfs are rather laborious. For this\nreason, creating a significant reliable sample of brown dwarfs, confirmed by\nspectroscopic observations, seems unattainable at the moment. Numerous attempts\nhave been made to search for and create a set of brown dwarfs using their\ncolours as a decision rule applied to a vast amount of survey data. In this\nwork, we use machine learning methods such as Random Forest Classifier,\nXGBoost, SVM Classifier and TabNet on PanStarrs DR1, 2MASS and WISE data to\ndistinguish L and T brown dwarfs from objects of other spectral and luminosity\nclasses. The explanation of the models is discussed. 
We also compare our models\nwith classical decision rules, proving their efficiency and relevance.\n","authors":["Aleksandra Avdeeva"],"pdf_url":"https://arxiv.org/pdf/2308.03045v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2308.05999v1","updated":"2023-08-11T08:06:58Z","published":"2023-08-11T08:06:58Z","title":"Does AI for science need another ImageNet Or totally different\n benchmarks? A case study of machine learning force fields","summary":" AI for science (AI4S) is an emerging research field that aims to enhance the\naccuracy and speed of scientific computing tasks using machine learning\nmethods. Traditional AI benchmarking methods struggle to adapt to the unique\nchallenges posed by AI4S because they assume data in training, testing, and\nfuture real-world queries are independent and identically distributed, while\nAI4S workloads anticipate out-of-distribution problem instances. This paper\ninvestigates the need for a novel approach to effectively benchmark AI for\nscience, using the machine learning force field (MLFF) as a case study. MLFF is\na method to accelerate molecular dynamics (MD) simulation with low\ncomputational cost and high accuracy. We identify various missed opportunities\nin scientifically meaningful benchmarking and propose solutions to evaluate\nMLFF models, specifically in the aspects of sample efficiency, time domain\nsensitivity, and cross-dataset generalization capabilities. By setting up the\nproblem instantiation similar to the actual scientific applications, more\nmeaningful performance metrics from the benchmark can be achieved. This suite\nof metrics has demonstrated a better ability to assess a model's performance in\nreal-world scientific applications, in contrast to traditional AI benchmarking\nmethodologies. This work is a component of the SAIBench project, an AI4S\nbenchmarking suite. The project homepage is\nhttps://www.computercouncil.org/SAIBench.\n","authors":["Yatao Li","Wanling Gao","Lei Wang","Lixin Sun","Zun Wang","Jianfeng Zhan"],"pdf_url":"https://arxiv.org/pdf/2308.05999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05481v2","updated":"2023-08-11T07:55:19Z","published":"2023-08-10T10:12:43Z","title":"LLM As DBA","summary":" Database administrators (DBAs) play a crucial role in managing, maintaining\nand optimizing a database system to ensure data availability, performance, and\nreliability. However, it is hard and tedious for DBAs to manage a large number\nof database instances (e.g., millions of instances on the cloud databases).\nRecently large language models (LLMs) have shown great potential to understand\nvaluable documents and accordingly generate reasonable answers. Thus, we\npropose D-Bot, a LLM-based database administrator that can continuously acquire\ndatabase maintenance experience from textual sources, and provide reasonable,\nwell-founded, in-time diagnosis and optimization advice for target databases.\nThis paper presents a revolutionary LLM-centric framework for database\nmaintenance, including (i) database maintenance knowledge detection from\ndocuments and tools, (ii) tree of thought reasoning for root cause analysis,\nand (iii) collaborative diagnosis among multiple LLMs. 
Our preliminary\nexperimental results show that D-Bot can efficiently and effectively diagnose the\nroot causes and our code is available at\ngithub.com/TsinghuaDatabaseGroup/DB-GPT.\n","authors":["Xuanhe Zhou","Guoliang Li","Zhiyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.05481v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05986v1","updated":"2023-08-11T07:50:40Z","published":"2023-08-11T07:50:40Z","title":"Fast and Accurate Transferability Measurement by Evaluating Intra-class\n Feature Variance","summary":" Given a set of pre-trained models, how can we quickly and accurately find the\nmost useful pre-trained model for a downstream task? Transferability\nmeasurement is to quantify how transferable is a pre-trained model learned on a\nsource task to a target task. It is used for quickly ranking pre-trained models\nfor a given task and thus becomes a crucial step for transfer learning.\nExisting methods measure transferability as the discrimination ability of a\nsource model for a target data before transfer learning, which cannot\naccurately estimate the fine-tuning performance. Some of them restrict the\napplication of transferability measurement in selecting the best supervised\npre-trained models that have classifiers. It is important to have a general\nmethod for measuring transferability that can be applied in a variety of\nsituations, such as selecting the best self-supervised pre-trained models that\ndo not have classifiers, and selecting the best transferring layer for a target\ntask. In this work, we propose TMI (TRANSFERABILITY MEASUREMENT WITH\nINTRA-CLASS FEATURE VARIANCE), a fast and accurate algorithm to measure\ntransferability. We view transferability as the generalization of a pre-trained\nmodel on a target task by measuring intra-class feature variance. Intra-class\nvariance evaluates the adaptability of the model to a new task, which measures\nhow transferable the model is. Compared to previous studies that estimate how\ndiscriminative the models are, intra-class variance is more accurate than those\nas it does not require an optimal feature extractor and classifier. Extensive\nexperiments on real-world datasets show that TMI outperforms competitors for\nselecting the top-5 best models, and exhibits consistently better correlation\nin 13 out of 17 cases.\n","authors":["Huiwen Xu","U Kang"],"pdf_url":"https://arxiv.org/pdf/2308.05986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01118v2","updated":"2023-08-11T07:43:27Z","published":"2023-08-02T12:58:11Z","title":"A Survey on Popularity Bias in Recommender Systems","summary":" Recommender systems help people find relevant content in a personalized way.\nOne main promise of such systems is that they are able to increase the\nvisibility of items in the long tail, i.e., the lesser-known items in a\ncatalogue. Existing research, however, suggests that in many situations today's\nrecommendation algorithms instead exhibit a popularity bias, meaning that they\noften focus on rather popular items in their recommendations. Such a bias may\nnot only lead to limited value of the recommendations for consumers and\nproviders in the short run, but it may also cause undesired reinforcement\neffects over time. In this paper, we discuss the potential reasons for\npopularity bias and we review existing approaches to detect, quantify and\nmitigate popularity bias in recommender systems. 
Our survey therefore includes\nboth an overview of the computational metrics used in the literature as well as\na review of the main technical approaches to reduce the bias. We furthermore\ncritically discuss today's literature, where we observe that the research is\nalmost entirely based on computational experiments and on certain assumptions\nregarding the practical effects of including long-tail items in the\nrecommendations.\n","authors":["Anastasiia Klimashevskaia","Dietmar Jannach","Mehdi Elahi","Christoph Trattner"],"pdf_url":"https://arxiv.org/pdf/2308.01118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14150v2","updated":"2023-08-11T07:42:04Z","published":"2022-12-29T02:11:19Z","title":"A Dynamics Theory of Implicit Regularization in Deep Low-Rank Matrix\n Factorization","summary":" Implicit regularization is an important way to interpret neural networks.\nRecent theory starts to explain implicit regularization with the model of deep\nmatrix factorization (DMF) and analyze the trajectory of discrete gradient\ndynamics in the optimization process. These discrete gradient dynamics are\nrelatively small but not infinitesimal, thus fitting well with the practical\nimplementation of neural networks. Currently, discrete gradient dynamics\nanalysis has been successfully applied to shallow networks but encounters the\ndifficulty of complex computation for deep networks. In this work, we introduce\nanother discrete gradient dynamics approach to explain implicit regularization,\ni.e. landscape analysis. It mainly focuses on gradient regions, such as saddle\npoints and local minima. We theoretically establish the connection between\nsaddle point escaping (SPE) stages and the matrix rank in DMF. We prove that,\nfor a rank-R matrix reconstruction, DMF will converge to a second-order\ncritical point after R stages of SPE. This conclusion is further experimentally\nverified on a low-rank matrix reconstruction problem. This work provides a new\ntheory to analyze implicit regularization in deep learning.\n","authors":["Jian Cao","Chen Qian","Yihui Huang","Dicheng Chen","Yuncheng Gao","Jiyang Dong","Di Guo","Xiaobo Qu"],"pdf_url":"https://arxiv.org/pdf/2212.14150v2.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.05969v1","updated":"2023-08-11T07:07:21Z","published":"2023-08-11T07:07:21Z","title":"Learning nonparametric DAGs with incremental information via high-order\n HSIC","summary":" Score-based methods for learning Bayesian networks (BN) aim to maximize the\nglobal score functions. However, if local variables have direct and indirect\ndependence simultaneously, the global optimization on score functions misses\nedges between variables with indirect dependent relationship, of which scores\nare smaller than those with direct dependent relationship. In this paper, we\npresent an identifiability condition based on a determined subset of parents to\nidentify the underlying DAG. By the identifiability condition, we develop a\ntwo-phase algorithm namely optimal-tuning (OT) algorithm to locally amend the\nglobal optimization. In the optimal phase, an optimization problem based on\nfirst-order Hilbert-Schmidt independence criterion (HSIC) gives an estimated\nskeleton as the initial determined parents subset. In the tuning phase, the\nskeleton is locally tuned by deletion, addition and DAG-formalization\nstrategies using the theoretically proved incremental properties of high-order\nHSIC. 
Numerical experiments for different synthetic datasets and real-world\ndatasets show that the OT algorithm outperforms existing methods. Especially in\nSigmoid Mix model with the size of the graph being ${\\rm\\bf d=40}$, the\nstructure intervention distance (SID) of the OT algorithm is 329.7 smaller than\nthe one obtained by CAM, which indicates that the graph estimated by the OT\nalgorithm misses fewer edges compared with CAM.\n","authors":["Yafei Wang","Jianguo Liu"],"pdf_url":"https://arxiv.org/pdf/2308.05969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05959v1","updated":"2023-08-11T06:28:19Z","published":"2023-08-11T06:28:19Z","title":"Learned Point Cloud Compression for Classification","summary":" Deep learning is increasingly being used to perform machine vision tasks such\nas classification, object detection, and segmentation on 3D point cloud data.\nHowever, deep learning inference is computationally expensive. The limited\ncomputational capabilities of end devices thus necessitate a codec for\ntransmitting point cloud data over the network for server-side processing. Such\na codec must be lightweight and capable of achieving high compression ratios\nwithout sacrificing accuracy. Motivated by this, we present a novel point cloud\ncodec that is highly specialized for the machine task of classification. Our\ncodec, based on PointNet, achieves a significantly better rate-accuracy\ntrade-off in comparison to alternative methods. In particular, it achieves a\n94% reduction in BD-bitrate over non-specialized codecs on the ModelNet40\ndataset. For low-resource end devices, we also propose two lightweight\nconfigurations of our encoder that achieve similar BD-bitrate reductions of 93%\nand 92% with 3% and 5% drops in top-1 accuracy, while consuming only 0.470 and\n0.048 encoder-side kMACs/point, respectively. Our codec demonstrates the\npotential of specialized codecs for machine analysis of point clouds, and\nprovides a basis for extension to more complex tasks and datasets in the\nfuture.\n","authors":["Mateen Ulhaq","Ivan V. Bajić"],"pdf_url":"https://arxiv.org/pdf/2308.05959v1.pdf","comment":"6 pages, 4 figures, IEEE MMSP 2023"},{"id":"http://arxiv.org/abs/2308.05957v1","updated":"2023-08-11T06:19:23Z","published":"2023-08-11T06:19:23Z","title":"Node Embedding for Homophilous Graphs with ARGEW: Augmentation of Random\n walks by Graph Edge Weights","summary":" Representing nodes in a network as dense vectors node embeddings is important\nfor understanding a given network and solving many downstream tasks. In\nparticular, for weighted homophilous graphs where similar nodes are connected\nwith larger edge weights, we desire node embeddings where node pairs with\nstrong weights have closer embeddings. Although random walk based node\nembedding methods like node2vec and node2vec+ do work for weighted networks via\nincluding edge weights in the walk transition probabilities, our experiments\nshow that the embedding result does not adequately reflect edge weights. In\nthis paper, we propose ARGEW (Augmentation of Random walks by Graph Edge\nWeights), a novel augmentation method for random walks that expands the corpus\nin such a way that nodes with larger edge weights end up with closer\nembeddings. ARGEW can work with any random walk based node embedding method,\nbecause it is independent of the random sampling strategy itself and works on\ntop of the already-performed walks. 
With several real-world networks, we\ndemonstrate that with ARGEW, compared to not using it, the desired pattern that\nnode pairs with larger edge weights have closer embeddings is much clearer. We\nalso examine ARGEW's performance in node classification: node2vec with ARGEW\noutperforms pure node2vec and is not sensitive to hyperparameters (i.e.\nconsistently good). In fact, it achieves similarly good results as supervised\nGCN, even without any node feature or label information during training.\nFinally, we explain why ARGEW works consistently well by exploring the\ncoappearance distributions using a synthetic graph with clear structural roles.\n","authors":["Jun Hee Kim","Jaeman Son","Hyunsoo Kim","Eunjo Lee"],"pdf_url":"https://arxiv.org/pdf/2308.05957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01097v3","updated":"2023-08-11T05:30:10Z","published":"2023-08-02T12:04:28Z","title":"Spatio-Temporal Branching for Motion Prediction using Motion Increments","summary":" Human motion prediction (HMP) has emerged as a popular research topic due to\nits diverse applications, but it remains a challenging task due to the\nstochastic and aperiodic nature of future poses. Traditional methods rely on\nhand-crafted features and machine learning techniques, which often struggle to\nmodel the complex dynamics of human motion. Recent deep learning-based methods\nhave achieved success by learning spatio-temporal representations of motion,\nbut these models often overlook the reliability of motion data. Additionally,\nthe temporal and spatial dependencies of skeleton nodes are distinct. The\ntemporal relationship captures motion information over time, while the spatial\nrelationship describes body structure and the relationships between different\nnodes. In this paper, we propose a novel spatio-temporal branching network\nusing incremental information for HMP, which decouples the learning of\ntemporal-domain and spatial-domain features, extracts more motion information,\nand achieves complementary cross-domain knowledge learning through knowledge\ndistillation. Our approach effectively reduces noise interference and provides\nmore expressive information for characterizing motion by separately extracting\ntemporal and spatial features. We evaluate our approach on standard HMP\nbenchmarks and outperform state-of-the-art methods in terms of prediction\naccuracy.\n","authors":["Jiexin Wang","Yujie Zhou","Wenwen Qiang","Ying Ba","Bing Su","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.01097v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04902v2","updated":"2023-08-11T05:27:05Z","published":"2023-06-08T03:09:49Z","title":"A Cover Time Study of a non-Markovian Algorithm","summary":" Given a traversal algorithm, cover time is the expected number of steps\nneeded to visit all nodes in a given graph. A smaller cover time means a higher\nexploration efficiency of traversal algorithm. Although random walk algorithms\nhave been studied extensively in the existing literature, there has been no\ncover time result for any non-Markovian method. In this work, we stand on a\ntheoretical perspective and show that the negative feedback strategy (a\ncount-based exploration method) is better than the naive random walk search. In\nparticular, the former strategy can locally improve the search efficiency for\nan arbitrary graph. It also achieves smaller cover times for special but\nimportant graphs, including clique graphs, tree graphs, etc. 
Moreover, we make\nconnections between our results and reinforcement learning literature to give\nnew insights on why classical UCB and MCTS algorithms are so useful. Various\nnumerical results corroborate our theoretical findings.\n","authors":["Guanhua Fang","Gennady Samorodnitsky","Zhiqiang Xu"],"pdf_url":"https://arxiv.org/pdf/2306.04902v2.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2308.00284v2","updated":"2023-08-11T04:43:16Z","published":"2023-08-01T04:46:35Z","title":"CLAMS: A Cluster Ambiguity Measure for Estimating Perceptual Variability\n in Visual Clustering","summary":" Visual clustering is a common perceptual task in scatterplots that supports\ndiverse analytics tasks (e.g., cluster identification). However, even with the\nsame scatterplot, the ways of perceiving clusters (i.e., conducting visual\nclustering) can differ due to the differences among individuals and ambiguous\ncluster boundaries. Although such perceptual variability casts doubt on the\nreliability of data analysis based on visual clustering, we lack a systematic\nway to efficiently assess this variability. In this research, we study\nperceptual variability in conducting visual clustering, which we call Cluster\nAmbiguity. To this end, we introduce CLAMS, a data-driven visual quality\nmeasure for automatically predicting cluster ambiguity in monochrome\nscatterplots. We first conduct a qualitative study to identify key factors that\naffect the visual separation of clusters (e.g., proximity or size difference\nbetween clusters). Based on study findings, we deploy a regression module that\nestimates the human-judged separability of two clusters. Then, CLAMS predicts\ncluster ambiguity by analyzing the aggregated results of all pairwise\nseparability between clusters that are generated by the module. CLAMS\noutperforms widely-used clustering techniques in predicting ground truth\ncluster ambiguity. Meanwhile, CLAMS exhibits performance on par with human\nannotators. We conclude our work by presenting two applications for optimizing\nand benchmarking data mining techniques using CLAMS. The interactive demo of\nCLAMS is available at clusterambiguity.dev.\n","authors":["Hyeon Jeon","Ghulam Jilani Quadri","Hyunwook Lee","Paul Rosen","Danielle Albers Szafir","Jinwook Seo"],"pdf_url":"https://arxiv.org/pdf/2308.00284v2.pdf","comment":"IEEE Transactions on Visualization and Computer Graphics (TVCG)\n (Proc. IEEE VIS 2023); equally contributed by Hyeon Jeon and Ghulam Jilani\n Quadri"},{"id":"http://arxiv.org/abs/2308.00282v2","updated":"2023-08-11T04:39:33Z","published":"2023-08-01T04:38:15Z","title":"ZADU: A Python Library for Evaluating the Reliability of Dimensionality\n Reduction Embeddings","summary":" Dimensionality reduction (DR) techniques inherently distort the original\nstructure of input high-dimensional data, producing imperfect low-dimensional\nembeddings. Diverse distortion measures have thus been proposed to evaluate the\nreliability of DR embeddings. However, implementing and executing distortion\nmeasures in practice has so far been time-consuming and tedious. To address\nthis issue, we present ZADU, a Python library that provides distortion\nmeasures. ZADU is not only easy to install and execute but also enables\ncomprehensive evaluation of DR embeddings through three key features. First,\nthe library covers a wide range of distortion measures. 
Second, it\nautomatically optimizes the execution of distortion measures, substantially\nreducing the running time required to execute multiple measures. Last, the\nlibrary informs how individual points contribute to the overall distortions,\nfacilitating the detailed analysis of DR embeddings. By simulating a real-world\nscenario of optimizing DR embeddings, we verify that our optimization scheme\nsubstantially reduces the time required to execute distortion measures.\nFinally, as an application of ZADU, we present another library called ZADUVis\nthat allows users to easily create distortion visualizations that depict the\nextent to which each region of an embedding suffers from distortions.\n","authors":["Hyeon Jeon","Aeri Cho","Jinhwa Jang","Soohyun Lee","Jake Hyun","Hyung-Kwon Ko","Jaemin Jo","Jinwook Seo"],"pdf_url":"https://arxiv.org/pdf/2308.00282v2.pdf","comment":"2023 IEEE Visualization and Visual Analytics (IEEE VIS 2023) Short\n paper"},{"id":"http://arxiv.org/abs/2308.00278v2","updated":"2023-08-11T04:35:47Z","published":"2023-08-01T04:33:16Z","title":"Classes are not Clusters: Improving Label-based Evaluation of\n Dimensionality Reduction","summary":" A common way to evaluate the reliability of dimensionality reduction (DR)\nembeddings is to quantify how well labeled classes form compact, mutually\nseparated clusters in the embeddings. This approach is based on the assumption\nthat the classes stay as clear clusters in the original high-dimensional space.\nHowever, in reality, this assumption can be violated; a single class can be\nfragmented into multiple separated clusters, and multiple classes can be merged\ninto a single cluster. We thus cannot always assure the credibility of the\nevaluation using class labels. In this paper, we introduce two novel quality\nmeasures -- Label-Trustworthiness and Label-Continuity (Label-T&C) -- advancing\nthe process of DR evaluation based on class labels. Instead of assuming that\nclasses are well-clustered in the original space, Label-T&C work by (1)\nestimating the extent to which classes form clusters in the original and\nembedded spaces and (2) evaluating the difference between the two. A\nquantitative evaluation showed that Label-T&C outperform widely used DR\nevaluation measures (e.g., Trustworthiness and Continuity, Kullback-Leibler\ndivergence) in terms of the accuracy in assessing how well DR embeddings\npreserve the cluster structure, and are also scalable. Moreover, we present\ncase studies demonstrating that Label-T&C can be successfully used for\nrevealing the intrinsic characteristics of DR techniques and their\nhyperparameters.\n","authors":["Hyeon Jeon","Yun-Hsin Kuo","Michaël Aupetit","Kwan-Liu Ma","Jinwook Seo"],"pdf_url":"https://arxiv.org/pdf/2308.00278v2.pdf","comment":"IEEE Transactions on Visualization and Computer Graphics (TVCG)\n (Proc. IEEE VIS 2023)"},{"id":"http://arxiv.org/abs/2307.15980v3","updated":"2023-08-11T04:32:04Z","published":"2023-07-29T13:02:45Z","title":"Initial State Interventions for Deconfounded Imitation Learning","summary":" Imitation learning suffers from causal confusion. This phenomenon occurs when\nlearned policies attend to features that do not causally influence the expert\nactions but are instead spuriously correlated. Causally confused agents produce\nlow open-loop supervised loss but poor closed-loop performance upon deployment.\nWe consider the problem of masking observed confounders in a disentangled\nrepresentation of the observation space. 
Our novel masking algorithm leverages\nthe usual ability to intervene in the initial system state, avoiding any\nrequirement involving expert querying, expert reward functions, or causal graph\nspecification. Under certain assumptions, we theoretically prove that this\nalgorithm is conservative in the sense that it does not incorrectly mask\nobservations that causally influence the expert; furthermore, intervening on\nthe initial state serves to strictly reduce excess conservatism. The masking\nalgorithm is applied to behavior cloning for two illustrative control systems:\nCartPole and Reacher.\n","authors":["Samuel Pfrommer","Yatong Bai","Hyunin Lee","Somayeh Sojoudi"],"pdf_url":"https://arxiv.org/pdf/2307.15980v3.pdf","comment":"62nd IEEE Conference on Decision and Control"},{"id":"http://arxiv.org/abs/2209.05483v2","updated":"2023-08-11T04:28:51Z","published":"2022-09-12T06:14:04Z","title":"Self-Supervised Coordinate Projection Network for Sparse-View Computed\n Tomography","summary":" In the present work, we propose a Self-supervised COordinate Projection\nnEtwork (SCOPE) to reconstruct the artifacts-free CT image from a single SV\nsinogram by solving the inverse tomography imaging problem. Compared with\nrecent related works that solve similar problems using implicit neural\nrepresentation network (INR), our essential contribution is an effective and\nsimple re-projection strategy that pushes the tomography image reconstruction\nquality over supervised deep learning CT reconstruction works. The proposed\nstrategy is inspired by the simple relationship between linear algebra and\ninverse problems. To solve the under-determined linear equation system, we\nfirst introduce INR to constrain the solution space via image continuity prior\nand achieve a rough solution. And secondly, we propose to generate a dense view\nsinogram that improves the rank of the linear equation system and produces a\nmore stable CT image solution space. Our experiment results demonstrate that\nthe re-projection strategy significantly improves the image reconstruction\nquality (+3 dB for PSNR at least). Besides, we integrate the recent hash\nencoding into our SCOPE model, which greatly accelerates the model training.\nFinally, we evaluate SCOPE in parallel and fan X-ray beam SVCT reconstruction\ntasks. Experimental results indicate that the proposed SCOPE model outperforms\ntwo latest INR-based methods and two well-popular supervised DL methods\nquantitatively and qualitatively.\n","authors":["Qing Wu","Ruimin Feng","Hongjiang Wei","Jingyi Yu","Yuyao Zhang"],"pdf_url":"https://arxiv.org/pdf/2209.05483v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2308.05930v1","updated":"2023-08-11T04:24:39Z","published":"2023-08-11T04:24:39Z","title":"INR-Arch: A Dataflow Architecture and Compiler for Arbitrary-Order\n Gradient Computations in Implicit Neural Representation Processing","summary":" An increasing number of researchers are finding use for nth-order gradient\ncomputations for a wide variety of applications, including graphics,\nmeta-learning (MAML), scientific computing, and most recently, implicit neural\nrepresentations (INRs). Recent work shows that the gradient of an INR can be\nused to edit the data it represents directly without needing to convert it back\nto a discrete representation. 
However, given a function represented as a\ncomputation graph, traditional architectures face challenges in efficiently\ncomputing its nth-order gradient due to the higher demand for computing power\nand higher complexity in data movement. This makes it a promising target for\nFPGA acceleration. In this work, we introduce INR-Arch, a framework that\ntransforms the computation graph of an nth-order gradient into a\nhardware-optimized dataflow architecture. We address this problem in two\nphases. First, we design a dataflow architecture that uses FIFO streams and an\noptimized computation kernel library, ensuring high memory efficiency and\nparallel computation. Second, we propose a compiler that extracts and optimizes\ncomputation graphs, automatically configures hardware parameters such as\nlatency and stream depths to optimize throughput, while ensuring deadlock-free\noperation, and outputs High-Level Synthesis (HLS) code for FPGA implementation.\nWe utilize INR editing as our benchmark, presenting results that demonstrate\n1.8-4.8x and 1.5-3.6x speedup compared to CPU and GPU baselines respectively.\nFurthermore, we obtain 3.1-8.9x and 1.7-4.3x lower memory usage, and 1.7-11.3x\nand 5.5-32.8x lower energy-delay product. Our framework will be made\nopen-source and available on GitHub.\n","authors":["Stefan Abi-Karam","Rishov Sarkar","Dejia Xu","Zhiwen Fan","Zhangyang Wang","Cong Hao"],"pdf_url":"https://arxiv.org/pdf/2308.05930v1.pdf","comment":"9 pages, 8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2301.13349v4","updated":"2023-08-11T04:20:33Z","published":"2023-01-31T00:52:14Z","title":"Unconstrained Dynamic Regret via Sparse Coding","summary":" Motivated by the challenge of nonstationarity in sequential decision making,\nwe study Online Convex Optimization (OCO) under the coupling of two problem\nstructures: the domain is unbounded, and the comparator sequence\n$u_1,\\ldots,u_T$ is arbitrarily time-varying. As no algorithm can guarantee low\nregret simultaneously against all comparator sequences, handling this setting\nrequires moving from minimax optimality to comparator adaptivity. That is,\nsensible regret bounds should depend on certain complexity measures of the\ncomparator relative to one's prior knowledge.\n This paper achieves a new type of these adaptive regret bounds via a sparse\ncoding framework. The complexity of the comparator is measured by its energy\nand its sparsity on a user-specified dictionary, which offers considerable\nversatility. Equipped with a wavelet dictionary for example, our framework\nimproves the state-of-the-art bound (Jacobsen & Cutkosky, 2022) by adapting to\nboth ($i$) the magnitude of the comparator average $||\\bar\nu||=||\\sum_{t=1}^Tu_t/T||$, rather than the maximum $\\max_t||u_t||$; and ($ii$)\nthe comparator variability $\\sum_{t=1}^T||u_t-\\bar u||$, rather than the\nuncentered sum $\\sum_{t=1}^T||u_t||$. Furthermore, our analysis is simpler due\nto decoupling function approximation from regret minimization.\n","authors":["Zhiyu Zhang","Ashok Cutkosky","Ioannis Ch. Paschalidis"],"pdf_url":"https://arxiv.org/pdf/2301.13349v4.pdf","comment":"Small technical improvements + fixing typos"},{"id":"http://arxiv.org/abs/2306.15932v2","updated":"2023-08-11T04:17:11Z","published":"2023-06-28T05:33:11Z","title":"NIPD: A Federated Learning Person Detection Benchmark Based on\n Real-World Non-IID Data","summary":" Federated learning (FL), a privacy-preserving distributed machine learning,\nhas been rapidly applied in wireless communication networks. 
FL enables\nInternet of Things (IoT) clients to obtain well-trained models while preventing\nprivacy leakage. Person detection can be deployed on edge devices with limited\ncomputing power if combined with FL to process the video data directly at the\nedge. However, due to the different hardware and deployment scenarios of\ndifferent cameras, the data collected by the camera present non-independent and\nidentically distributed (non-IID), and the global model derived from FL\naggregation is less effective. Meanwhile, existing research lacks public data\nset for real-world FL object detection, which is not conducive to studying the\nnon-IID problem on IoT cameras. Therefore, we open source a non-IID IoT person\ndetection (NIPD) data set, which is collected from five different cameras. To\nour knowledge, this is the first true device-based non-IID person detection\ndata set. Based on this data set, we explain how to establish a FL experimental\nplatform and provide a benchmark for non-IID person detection. NIPD is expected\nto promote the application of FL and the security of smart city.\n","authors":["Kangning Yin","Zhen Ding","Zhihua Dong","Dongsheng Chen","Jie Fu","Xinhui Ji","Guangqiang Yin","Zhiguo Wang"],"pdf_url":"https://arxiv.org/pdf/2306.15932v2.pdf","comment":"8 pages, 5 figures, 3 tables, FL-IJCAI 23 conference"},{"id":"http://arxiv.org/abs/2306.14701v2","updated":"2023-08-11T03:39:46Z","published":"2023-06-26T13:47:38Z","title":"Hard Sample Mining Enabled Supervised Contrastive Feature Learning for\n Wind Turbine Pitch System Fault Diagnosis","summary":" The efficient utilization of wind power by wind turbines relies on the\nability of their pitch systems to adjust blade pitch angles in response to\nvarying wind speeds. However, the presence of multiple health conditions in the\npitch system due to the long-term wear and tear poses challenges in accurately\nclassifying them, thus increasing the maintenance cost of wind turbines or even\ndamaging them. This paper proposes a novel method based on hard sample\nmining-enabled supervised contrastive learning (HSMSCL) to address this\nproblem. The proposed method employs cosine similarity to identify hard samples\nand subsequently, leverages supervised contrastive learning to learn more\ndiscriminative representations by constructing hard sample pairs. Furthermore,\nthe hard sample mining framework in the proposed method also constructs hard\nsamples with learned representations to make the training process of the\nmultilayer perceptron (MLP) more challenging and make it a more effective\nclassifier. The proposed approach progressively improves the fault diagnosis\nmodel by introducing hard samples in the SCL and MLP phases, thus enhancing its\nperformance in complex multi-class fault diagnosis tasks.\n To evaluate the effectiveness of the proposed method, two real datasets\ncomprising wind turbine pitch system cog belt fracture data are utilized. The\nfault diagnosis performance of the proposed method is compared against existing\nmethods, and the results demonstrate its superior performance. The proposed\napproach exhibits significant improvements in fault diagnosis performance,\nproviding promising prospects for enhancing the reliability and efficiency of\nwind turbine pitch system fault diagnosis.\n","authors":["Zixuan Wang","Bo Qin","Mengxuan Li","Chenlu Zhan","Mark D. 
Butala","Peng Peng","Hongwei Wang"],"pdf_url":"https://arxiv.org/pdf/2306.14701v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14770v2","updated":"2023-08-11T03:20:07Z","published":"2023-02-28T17:11:42Z","title":"Completeness of Atomic Structure Representations","summary":" In this paper, we address the challenge of obtaining a comprehensive and\nsymmetric representation of point particle groups, such as atoms in a molecule,\nwhich is crucial in physics and theoretical chemistry. The problem has become\neven more important with the widespread adoption of machine-learning techniques\nin science, as it underpins the capacity of models to accurately reproduce\nphysical relationships while being consistent with fundamental symmetries and\nconservation laws. However, the descriptors that are commonly used to represent\npoint clouds -- most notably those adopted to describe matter at the atomic\nscale -- are unable to distinguish between special arrangements of particles.\nThis makes it impossible to machine learn their properties. Frameworks that are\nprovably complete exist but are only so in the limit in which they\nsimultaneously describe the mutual relationship between all atoms, which is\nimpractical. We present a novel approach to construct descriptors of finite\ncorrelations based on the relative arrangement of particle triplets, which can\nbe employed to create symmetry-adapted models with universal approximation\ncapabilities. Our strategy is demonstrated on a class of atomic arrangements\nthat are specifically built to defy a broad class of conventional symmetric\ndescriptors, showcasing its potential for addressing their limitations.\n","authors":["Jigyasa Nigam","Sergey N. Pozdnyakov","Kevin K. Huguenin-Dumittan","Michele Ceriotti"],"pdf_url":"https://arxiv.org/pdf/2302.14770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.10510v3","updated":"2023-08-11T03:07:28Z","published":"2022-09-21T17:15:58Z","title":"Learning to Relight Portrait Images via a Virtual Light Stage and\n Synthetic-to-Real Adaptation","summary":" Given a portrait image of a person and an environment map of the target\nlighting, portrait relighting aims to re-illuminate the person in the image as\nif the person appeared in an environment with the target lighting. To achieve\nhigh-quality results, recent methods rely on deep learning. An effective\napproach is to supervise the training of deep neural networks with a\nhigh-fidelity dataset of desired input-output pairs, captured with a light\nstage. However, acquiring such data requires an expensive special capture rig\nand time-consuming efforts, limiting access to only a few resourceful\nlaboratories. To address the limitation, we propose a new approach that can\nperform on par with the state-of-the-art (SOTA) relighting methods without\nrequiring a light stage. Our approach is based on the realization that a\nsuccessful relighting of a portrait image depends on two conditions. First, the\nmethod needs to mimic the behaviors of physically-based relighting. Second, the\noutput has to be photorealistic. To meet the first condition, we propose to\ntrain the relighting network with training data generated by a virtual light\nstage that performs physically-based rendering on various 3D synthetic humans\nunder different environment maps. To meet the second condition, we develop a\nnovel synthetic-to-real approach to bring photorealism to the relighting\nnetwork output. 
In addition to achieving SOTA results, our approach offers\nseveral advantages over the prior methods, including controllable glares on\nglasses and more temporally-consistent results for relighting videos.\n","authors":["Yu-Ying Yeh","Koki Nagano","Sameh Khamis","Jan Kautz","Ming-Yu Liu","Ting-Chun Wang"],"pdf_url":"https://arxiv.org/pdf/2209.10510v3.pdf","comment":"To appear in ACM Transactions on Graphics (SIGGRAPH Asia 2022). 21\n pages, 25 figures, 7 tables. Project page:\n https://research.nvidia.com/labs/dir/lumos/"},{"id":"http://arxiv.org/abs/2308.05476v2","updated":"2023-08-11T02:50:00Z","published":"2023-08-10T10:07:00Z","title":"Exploring Machine Learning and Transformer-based Approaches for\n Deceptive Text Classification: A Comparative Analysis","summary":" Deceptive text classification is a critical task in natural language\nprocessing that aims to identify deceptive or fraudulent content. This study\npresents a comparative analysis of machine learning and transformer-based\napproaches for deceptive text classification. We investigate the effectiveness\nof traditional machine learning algorithms and state-of-the-art transformer\nmodels, such as BERT, XLNET, DistilBERT, and RoBERTa, in detecting deceptive\ntext. A labeled dataset consisting of deceptive and non-deceptive texts is used\nfor training and evaluation purposes. Through extensive experimentation, we\ncompare the performance metrics, including accuracy, precision, recall, and F1\nscore, of the different approaches. The results of this study shed light on the\nstrengths and limitations of machine learning and transformer-based methods for\ndeceptive text classification, enabling researchers and practitioners to make\ninformed decisions when dealing with deceptive content.\n","authors":["Anusuya Krishnan"],"pdf_url":"https://arxiv.org/pdf/2308.05476v2.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.05906v1","updated":"2023-08-11T02:05:08Z","published":"2023-08-11T02:05:08Z","title":"On the equivalence of Occam algorithms","summary":" Blumer et al. (1987, 1989) showed that any concept class that is learnable by\nOccam algorithms is PAC learnable. Board and Pitt (1990) showed a partial\nconverse of this theorem: for concept classes that are closed under exception\nlists, any class that is PAC learnable is learnable by an Occam algorithm.\nHowever, their Occam algorithm outputs a hypothesis whose complexity is\n$\\delta$-dependent, which is an important limitation. In this paper, we show\nthat their partial converse applies to Occam algorithms with\n$\\delta$-independent complexities as well. Thus, we provide a posteriori\njustification of various theoretical results and algorithm design methods which\nuse the partial converse as a basis for their work.\n","authors":["Zaman Keinath-Esmail"],"pdf_url":"https://arxiv.org/pdf/2308.05906v1.pdf","comment":"13 pages, submitted to Information and Computation"},{"id":"http://arxiv.org/abs/2306.09927v2","updated":"2023-08-11T02:04:46Z","published":"2023-06-16T15:50:03Z","title":"Trained Transformers Learn Linear Models In-Context","summary":" Attention-based neural networks such as transformers have demonstrated a\nremarkable ability to exhibit in-context learning (ICL): Given a short prompt\nsequence of tokens from an unseen task, they can formulate relevant per-token\nand next-token predictions without any parameter updates. 
By embedding a\nsequence of labeled training data and unlabeled test data as a prompt, this\nallows for transformers to behave like supervised learning algorithms. Indeed,\nrecent work has shown that when training transformer architectures over random\ninstances of linear regression problems, these models' predictions mimic those\nof ordinary least squares.\n Towards understanding the mechanisms underlying this phenomenon, we\ninvestigate the dynamics of ICL in transformers with a single linear\nself-attention layer trained by gradient flow on linear regression tasks. We\nshow that despite non-convexity, gradient flow with a suitable random\ninitialization finds a global minimum of the objective function. At this global\nminimum, when given a test prompt of labeled examples from a new prediction\ntask, the transformer achieves prediction error competitive with the best\nlinear predictor over the test prompt distribution. We additionally\ncharacterize the robustness of the trained transformer to a variety of\ndistribution shifts and show that although a number of shifts are tolerated,\nshifts in the covariate distribution of the prompts are not. Motivated by this,\nwe consider a generalized ICL setting where the covariate distributions can\nvary across prompts. We show that although gradient flow succeeds at finding a\nglobal minimum in this setting, the trained transformer is still brittle under\nmild covariate shifts. We complement this finding with experiments on large,\nnonlinear transformer architectures which we show are more robust under\ncovariate shifts.\n","authors":["Ruiqi Zhang","Spencer Frei","Peter L. Bartlett"],"pdf_url":"https://arxiv.org/pdf/2306.09927v2.pdf","comment":"50 pages, experiments added, reference added, typo corrected"},{"id":"http://arxiv.org/abs/2308.05903v1","updated":"2023-08-11T01:55:14Z","published":"2023-08-11T01:55:14Z","title":"Comparing the quality of neural network uncertainty estimates for\n classification problems","summary":" Traditional deep learning (DL) models are powerful classifiers, but many\napproaches do not provide uncertainties for their estimates. Uncertainty\nquantification (UQ) methods for DL models have received increased attention in\nthe literature due to their usefulness in decision making, particularly for\nhigh-consequence decisions. However, there has been little research done on how\nto evaluate the quality of such methods. We use statistical methods of\nfrequentist interval coverage and interval width to evaluate the quality of\ncredible intervals, and expected calibration error to evaluate classification\npredicted confidence. These metrics are evaluated on Bayesian neural networks\n(BNN) fit using Markov Chain Monte Carlo (MCMC) and variational inference (VI),\nbootstrapped neural networks (NN), Deep Ensembles (DE), and Monte Carlo (MC)\ndropout. We apply these different UQ for DL methods to a hyperspectral image\ntarget detection problem and show the inconsistency of the different methods'\nresults and the necessity of a UQ quality metric. To reconcile these\ndifferences and choose a UQ method that appropriately quantifies the\nuncertainty, we create a simulated data set with fully parameterized\nprobability distribution for a two-class classification problem. The gold\nstandard MCMC performs the best overall, and the bootstrapped NN is a close\nsecond, requiring the same computational expense as DE. 
Through this\ncomparison, we demonstrate that, for a given data set, different models can\nproduce uncertainty estimates of markedly different quality. This in turn\npoints to a great need for principled assessment methods of UQ quality in DL\napplications.\n","authors":["Daniel Ries","Joshua Michalenko","Tyler Ganter","Rashad Imad-Fayez Baiyasi","Jason Adams"],"pdf_url":"https://arxiv.org/pdf/2308.05903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.11750v3","updated":"2023-08-11T01:45:43Z","published":"2020-10-22T14:14:20Z","title":"Precise High-Dimensional Asymptotics for Quantifying Heterogeneous\n Transfers","summary":" The problem of learning one task with samples from another task has received\nmuch interest recently. In this paper, we ask a fundamental question: when is\ncombining data from two tasks better than learning one task alone? Intuitively,\nthe transfer effect from one task to another task depends on dataset shifts\nsuch as sample sizes and covariance matrices. However, quantifying such a\ntransfer effect is challenging since we need to compare the risks between joint\nlearning and single-task learning, and the comparative advantage of one over\nthe other depends on the exact kind of dataset shift between both tasks. This\npaper uses random matrix theory to tackle this challenge in a linear regression\nsetting with two tasks. We give precise asymptotics about the excess risks of\nsome commonly used estimators in the high-dimensional regime, when the sample\nsizes increase proportionally with the feature dimension at fixed ratios. The\nprecise asymptotics is provided as a function of the sample sizes and\ncovariate/model shifts, which can be used to study transfer effects: In a\nrandom-effects model, we give conditions to determine positive and negative\ntransfers between learning two tasks versus single-task learning; the\nconditions reveal intricate relations between dataset shifts and transfer\neffects. Simulations justify the validity of the asymptotics in finite\ndimensions. Our analysis examines several functions of two different sample\ncovariance matrices, revealing some estimates that generalize classical results\nin the random matrix theory literature, which may be of independent interest.\n","authors":["Fan Yang","Hongyang R. Zhang","Sen Wu","Christopher Ré","Weijie J. Su"],"pdf_url":"https://arxiv.org/pdf/2010.11750v3.pdf","comment":"64 pages, 6 figures; We thoroughly revised the paper by adding new\n results and reorganizing the presentation"},{"id":"http://arxiv.org/abs/2308.05309v2","updated":"2023-08-11T01:09:34Z","published":"2023-08-10T02:53:30Z","title":"Homophily-enhanced Structure Learning for Graph Clustering","summary":" Graph clustering is a fundamental task in graph analysis, and recent advances\nin utilizing graph neural networks (GNNs) have shown impressive results.\nDespite the success of existing GNN-based graph clustering methods, they often\noverlook the quality of graph structure, which is inherent in real-world graphs\ndue to their sparse and multifarious nature, leading to subpar performance.\nGraph structure learning allows refining the input graph by adding missing\nlinks and removing spurious connections. However, previous endeavors in graph\nstructure learning have predominantly centered around supervised settings, and\ncannot be directly applied to our specific clustering tasks due to the absence\nof ground-truth labels. 
To bridge the gap, we propose a novel method called\n\\textbf{ho}mophily-enhanced structure \\textbf{le}arning for graph clustering\n(HoLe). Our motivation stems from the observation that subtly enhancing the\ndegree of homophily within the graph structure can significantly improve GNNs\nand clustering outcomes. To realize this objective, we develop two\nclustering-oriented structure learning modules, i.e., hierarchical correlation\nestimation and cluster-aware sparsification. The former module enables a more\naccurate estimation of pairwise node relationships by leveraging guidance from\nlatent and clustering spaces, while the latter one generates a sparsified\nstructure based on the similarity matrix and clustering assignments.\nAdditionally, we devise a joint optimization approach alternating between\ntraining the homophily-enhanced structure learning and GNN-based clustering,\nthereby enforcing their reciprocal effects. Extensive experiments on seven\nbenchmark datasets of various types and scales, across a range of clustering\nmetrics, demonstrate the superiority of HoLe against state-of-the-art\nbaselines.\n","authors":["Ming Gu","Gaoming Yang","Sheng Zhou","Ning Ma","Jiawei Chen","Qiaoyu Tan","Meihan Liu","Jiajun Bu"],"pdf_url":"https://arxiv.org/pdf/2308.05309v2.pdf","comment":"11 pages with 7 figures. Accepted by CIKM'23"},{"id":"http://arxiv.org/abs/2308.05893v1","updated":"2023-08-11T00:59:29Z","published":"2023-08-11T00:59:29Z","title":"Learning to Team-Based Navigation: A Review of Deep Reinforcement\n Learning Techniques for Multi-Agent Pathfinding","summary":" Multi-agent pathfinding (MAPF) is a critical field in many large-scale\nrobotic applications, often being the fundamental step in multi-agent systems.\nThe increasing complexity of MAPF in complex and crowded environments, however,\ncritically diminishes the effectiveness of existing solutions. In contrast to\nother studies that have either presented a general overview of the recent\nadvancements in MAPF or extensively reviewed Deep Reinforcement Learning (DRL)\nwithin multi-agent system settings independently, our work presented in this\nreview paper focuses on highlighting the integration of DRL-based approaches in\nMAPF. Moreover, we aim to bridge the current gap in evaluating MAPF solutions\nby addressing the lack of unified evaluation metrics and providing\ncomprehensive clarification on these metrics. Finally, our paper discusses the\npotential of model-based DRL as a promising future direction and provides its\nrequired foundational understanding to address current challenges in MAPF. Our\nobjective is to assist readers in gaining insight into the current research\ndirection, providing unified metrics for comparing different MAPF algorithms\nand expanding their knowledge of model-based DRL to address the existing\nchallenges in MAPF.\n","authors":["Jaehoon Chung","Jamil Fayyad","Younes Al Younes","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2308.05893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.17020v2","updated":"2023-08-11T00:47:30Z","published":"2022-10-31T02:25:38Z","title":"A Law of Data Separation in Deep Learning","summary":" While deep learning has enabled significant advances in many areas of\nscience, its black-box nature hinders architecture design for future artificial\nintelligence applications and interpretation for high-stakes decision makings.\nWe addressed this issue by studying the fundamental question of how deep neural\nnetworks process data in the intermediate layers. 
Our finding is a simple and\nquantitative law that governs how deep neural networks separate data according\nto class membership throughout all layers for classification. This law shows\nthat each layer improves data separation at a constant geometric rate, and its\nemergence is observed in a collection of network architectures and datasets\nduring training. This law offers practical guidelines for designing\narchitectures, improving model robustness and out-of-sample performance, as\nwell as interpreting the predictions.\n","authors":["Hangfeng He","Weijie J. Su"],"pdf_url":"https://arxiv.org/pdf/2210.17020v2.pdf","comment":"Accepted at PNAS"},{"id":"http://arxiv.org/abs/2308.05889v1","updated":"2023-08-11T00:44:46Z","published":"2023-08-11T00:44:46Z","title":"DF2: Distribution-Free Decision-Focused Learning","summary":" Decision-focused learning (DFL) has recently emerged as a powerful approach\nfor predict-then-optimize problems by customizing a predictive model to a\ndownstream optimization task. However, existing end-to-end DFL methods are\nhindered by three significant bottlenecks: model mismatch error, sample average\napproximation error, and gradient approximation error. Model mismatch error\nstems from the misalignment between the model's parameterized predictive\ndistribution and the true probability distribution. Sample average\napproximation error arises when using finite samples to approximate the\nexpected optimization objective. Gradient approximation error occurs as DFL\nrelies on the KKT condition for exact gradient computation, while most methods\napproximate the gradient for backpropagation in non-convex objectives. In this\npaper, we present DF2 -- the first \\textit{distribution-free} decision-focused\nlearning method explicitly designed to address these three bottlenecks. Rather\nthan depending on a task-specific forecaster that requires precise model\nassumptions, our method directly learns the expected optimization function\nduring training. To efficiently learn the function in a data-driven manner, we\ndevise an attention-based model architecture inspired by the distribution-based\nparameterization of the expected objective. Our method is, to the best of our\nknowledge, the first to address all three bottlenecks within a single model. We\nevaluate DF2 on a synthetic problem, a wind power bidding problem, and a\nnon-convex vaccine distribution problem, demonstrating the effectiveness of\nDF2.\n","authors":["Lingkai Kong","Wenhao Mu","Jiaming Cui","Yuchen Zhuang","B. Aditya Prakash","Bo Dai","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.05889v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2201.07646v4","updated":"2023-08-11T00:13:54Z","published":"2022-01-19T15:23:46Z","title":"A Survey on Training Challenges in Generative Adversarial Networks for\n Biomedical Image Analysis","summary":" In biomedical image analysis, the applicability of deep learning methods is\ndirectly impacted by the quantity of image data available. This is due to deep\nlearning models requiring large image datasets to provide high-level\nperformance. Generative Adversarial Networks (GANs) have been widely utilized\nto address data limitations through the generation of synthetic biomedical\nimages. GANs consist of two models. The generator, a model that learns how to\nproduce synthetic images based on the feedback it receives. The discriminator,\na model that classifies an image as synthetic or real and provides feedback to\nthe generator. 
Throughout the training process, a GAN can experience several\ntechnical challenges that impede the generation of suitable synthetic imagery.\nFirst, the mode collapse problem whereby the generator either produces an\nidentical image or produces a uniform image from distinct input features.\nSecond, the non-convergence problem whereby the gradient descent optimizer\nfails to reach a Nash equilibrium. Thirdly, the vanishing gradient problem\nwhereby unstable training behavior occurs due to the discriminator achieving\noptimal classification performance resulting in no meaningful feedback being\nprovided to the generator. These problems result in the production of synthetic\nimagery that is blurry, unrealistic, and less diverse. To date, there has been\nno survey article outlining the impact of these technical challenges in the\ncontext of the biomedical imagery domain. This work presents a review and\ntaxonomy based on solutions to the training problems of GANs in the biomedical\nimaging domain. This survey highlights important challenges and outlines future\nresearch directions about the training of GANs in the domain of biomedical\nimagery.\n","authors":["Muhammad Muneeb Saad","Ruairi O'Reilly","Mubashir Husain Rehmani"],"pdf_url":"https://arxiv.org/pdf/2201.07646v4.pdf","comment":"Submitted to the AI Review Journal"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.04522v2","updated":"2023-08-11T15:39:03Z","published":"2023-08-08T18:37:24Z","title":"Deep Learning for Diverse Data Types Steganalysis: A Review","summary":" Steganography and steganalysis are two interrelated aspects of the field of\ninformation security. Steganography seeks to conceal communications, whereas\nsteganalysis is aimed to either find them or even, if possible, recover the\ndata they contain. Steganography and steganalysis have attracted a great deal\nof interest, particularly from law enforcement. Steganography is often used by\ncybercriminals and even terrorists to avoid being captured while in possession\nof incriminating evidence, even encrypted, since cryptography is prohibited or\nrestricted in many countries. Therefore, knowledge of cutting-edge techniques\nto uncover concealed information is crucial in exposing illegal acts. Over the\nlast few years, a number of strong and reliable steganography and steganalysis\ntechniques have been introduced in the literature. This review paper provides a\ncomprehensive overview of deep learning-based steganalysis techniques used to\ndetect hidden information within digital media. The paper covers all types of\ncover in steganalysis, including image, audio, and video, and discusses the\nmost commonly used deep learning techniques. In addition, the paper explores\nthe use of more advanced deep learning techniques, such as deep transfer\nlearning (DTL) and deep reinforcement learning (DRL), to enhance the\nperformance of steganalysis systems. The paper provides a systematic review of\nrecent research in the field, including data sets and evaluation metrics used\nin recent studies. It also presents a detailed analysis of DTL-based\nsteganalysis approaches and their performance on different data sets. 
The\nreview concludes with a discussion on the current state of deep learning-based\nsteganalysis, challenges, and future research directions.\n","authors":["Hamza Kheddar","Mustapha Hemis","Yassine Himeur","David Megías","Abbes Amira"],"pdf_url":"https://arxiv.org/pdf/2308.04522v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06087v1","updated":"2023-08-11T11:57:58Z","published":"2023-08-11T11:57:58Z","title":"Audio-Visual Spatial Integration and Recursive Attention for Robust\n Sound Source Localization","summary":" The objective of the sound source localization task is to enable machines to\ndetect the location of sound-making objects within a visual scene. While the\naudio modality provides spatial cues to locate the sound source, existing\napproaches only use audio as an auxiliary role to compare spatial regions of\nthe visual modality. Humans, on the other hand, utilize both audio and visual\nmodalities as spatial cues to locate sound sources. In this paper, we propose\nan audio-visual spatial integration network that integrates spatial cues from\nboth modalities to mimic human behavior when detecting sound-making objects.\nAdditionally, we introduce a recursive attention network to mimic human\nbehavior of iterative focusing on objects, resulting in more accurate attention\nregions. To effectively encode spatial information from both modalities, we\npropose audio-visual pair matching loss and spatial region alignment loss. By\nutilizing the spatial cues of audio-visual modalities and recursively focusing\nobjects, our method can perform more robust sound source localization.\nComprehensive experimental results on the Flickr SoundNet and VGG-Sound Source\ndatasets demonstrate the superiority of our proposed method over existing\napproaches. Our code is available at: https://github.com/VisualAIKHU/SIRA-SSL\n","authors":["Sung Jin Um","Dongjin Kim","Jung Uk Kim"],"pdf_url":"https://arxiv.org/pdf/2308.06087v1.pdf","comment":"Camera-Ready, ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.06076v1","updated":"2023-08-11T11:29:01Z","published":"2023-08-11T11:29:01Z","title":"Versatile Face Animator: Driving Arbitrary 3D Facial Avatar in RGBD\n Space","summary":" Creating realistic 3D facial animation is crucial for various applications in\nthe movie production and gaming industry, especially with the burgeoning demand\nin the metaverse. However, prevalent methods such as blendshape-based\napproaches and facial rigging techniques are time-consuming, labor-intensive,\nand lack standardized configurations, making facial animation production\nchallenging and costly. In this paper, we propose a novel self-supervised\nframework, Versatile Face Animator, which combines facial motion capture with\nmotion retargeting in an end-to-end manner, eliminating the need for\nblendshapes or rigs. Our method has the following two main characteristics: 1)\nwe propose an RGBD animation module to learn facial motion from raw RGBD videos\nby hierarchical motion dictionaries and animate RGBD images rendered from 3D\nfacial mesh coarse-to-fine, enabling facial animation on arbitrary 3D\ncharacters regardless of their topology, textures, blendshapes, and rigs; and\n2) we introduce a mesh retarget module to utilize RGBD animation to create 3D\nfacial animation by manipulating facial mesh with controller transformations,\nwhich are estimated from dense optical flow fields and blended together with\ngeodesic-distance-based weights. 
Comprehensive experiments demonstrate the\neffectiveness of our proposed framework in generating impressive 3D facial\nanimation results, highlighting its potential as a promising solution for the\ncost-effective and efficient production of facial animation in the metaverse.\n","authors":["Haoyu Wang","Haozhe Wu","Junliang Xing","Jia Jia"],"pdf_url":"https://arxiv.org/pdf/2308.06076v1.pdf","comment":"Accepted by ACM MM2023"},{"id":"http://arxiv.org/abs/2306.02898v3","updated":"2023-08-11T11:13:08Z","published":"2023-06-05T14:06:24Z","title":"Towards Unified Text-based Person Retrieval: A Large-scale\n Multi-Attribute and Language Search Benchmark","summary":" In this paper, we introduce a large Multi-Attribute and Language Search\ndataset for text-based person retrieval, called MALS, and explore the\nfeasibility of performing pre-training on both attribute recognition and\nimage-text matching tasks in one stone. In particular, MALS contains 1,510,330\nimage-text pairs, which is about 37.5 times larger than prevailing CUHK-PEDES,\nand all images are annotated with 27 attributes. Considering the privacy\nconcerns and annotation costs, we leverage the off-the-shelf diffusion models\nto generate the dataset. To verify the feasibility of learning from the\ngenerated data, we develop a new joint Attribute Prompt Learning and Text\nMatching Learning (APTM) framework, considering the shared knowledge between\nattribute and text. As the name implies, APTM contains an attribute prompt\nlearning stream and a text matching learning stream. (1) The attribute prompt\nlearning leverages the attribute prompts for image-attribute alignment, which\nenhances the text matching learning. (2) The text matching learning facilitates\nthe representation learning on fine-grained details, and in turn, boosts the\nattribute prompt learning. Extensive experiments validate the effectiveness of\nthe pre-training on MALS, achieving state-of-the-art retrieval performance via\nAPTM on three challenging real-world benchmarks. In particular, APTM achieves a\nconsistent improvement of +6.96%, +7.68%, and +16.95% Recall@1 accuracy on\nCUHK-PEDES, ICFG-PEDES, and RSTPReid datasets by a clear margin, respectively.\n","authors":["Shuyu Yang","Yinan Zhou","Yaxiong Wang","Yujiao Wu","Li Zhu","Zhedong Zheng"],"pdf_url":"https://arxiv.org/pdf/2306.02898v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06009v1","updated":"2023-08-11T08:30:08Z","published":"2023-08-11T08:30:08Z","title":"ViGT: Proposal-free Video Grounding with Learnable Token in Transformer","summary":" The video grounding (VG) task aims to locate the queried action or event in\nan untrimmed video based on rich linguistic descriptions. Existing\nproposal-free methods are trapped in complex interaction between video and\nquery, overemphasizing cross-modal feature fusion and feature correlation for\nVG. In this paper, we propose a novel boundary regression paradigm that\nperforms regression token learning in a transformer. Particularly, we present a\nsimple but effective proposal-free framework, namely Video Grounding\nTransformer (ViGT), which predicts the temporal boundary using a learnable\nregression token rather than multi-modal or cross-modal features. In ViGT, the\nbenefits of a learnable token are manifested as follows. (1) The token is\nunrelated to the video or the query and avoids data bias toward the original\nvideo and query. (2) The token simultaneously performs global context\naggregation from video and query features. 
First, we employed a sharing feature\nencoder to project both video and query into a joint feature space before\nperforming cross-modal co-attention (i.e., video-to-query attention and\nquery-to-video attention) to highlight discriminative features in each\nmodality. Furthermore, we concatenated a learnable regression token [REG] with\nthe video and query features as the input of a vision-language transformer.\nFinally, we utilized the token [REG] to predict the target moment and visual\nfeatures to constrain the foreground and background probabilities at each\ntimestamp. The proposed ViGT performed well on three public datasets: ANet\nCaptions, TACoS and YouCookII. Extensive ablation studies and qualitative\nanalysis further validated the interpretability of ViGT.\n","authors":["Kun Li","Dan Guo","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.06009v1.pdf","comment":"This paper has been accepted by SCIENCE CHINA Information Sciences"},{"id":"http://arxiv.org/abs/2308.05995v1","updated":"2023-08-11T08:03:28Z","published":"2023-08-11T08:03:28Z","title":"Audio is all in one: speech-driven gesture synthetics using WavLM\n pre-trained model","summary":" The generation of co-speech gestures for digital humans is an emerging area\nin the field of virtual human creation. Prior research has made progress by\nusing acoustic and semantic information as input and adopting classify method\nto identify the person's ID and emotion for driving co-speech gesture\ngeneration. However, this endeavour still faces significant challenges. These\nchallenges go beyond the intricate interplay between co-speech gestures, speech\nacoustic, and semantics; they also encompass the complexities associated with\npersonality, emotion, and other obscure but important factors. This paper\nintroduces \"diffmotion-v2,\" a speech-conditional diffusion-based and\nnon-autoregressive transformer-based generative model with WavLM pre-trained\nmodel. It can produce individual and stylized full-body co-speech gestures only\nusing raw speech audio, eliminating the need for complex multimodal processing\nand manually annotated. Firstly, considering that speech audio not only\ncontains acoustic and semantic features but also conveys personality traits,\nemotions, and more subtle information related to accompanying gestures, we\npioneer the adaptation of WavLM, a large-scale pre-trained model, to extract\nlow-level and high-level audio information. Secondly, we introduce an adaptive\nlayer norm architecture in the transformer-based layer to learn the\nrelationship between speech information and accompanying gestures. Extensive\nsubjective evaluation experiments are conducted on the Trinity, ZEGGS, and BEAT\ndatasets to confirm the WavLM and the model's ability to synthesize natural\nco-speech gestures with various styles.\n","authors":["Fan Zhang","Naye Ji","Fuxing Gao","Siyuan Zhao","Zhaohan Wang","Shunman Li"],"pdf_url":"https://arxiv.org/pdf/2308.05995v1.pdf","comment":"10 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2306.08966v2","updated":"2023-08-11T04:55:40Z","published":"2023-06-15T09:01:33Z","title":"Training Multimedia Event Extraction With Generated Images and Captions","summary":" Contemporary news reporting increasingly features multimedia content,\nmotivating research on multimedia event extraction. However, the task lacks\nannotated multimodal training data and artificially generated training data\nsuffer from distribution shift from real-world data. 
In this paper, we propose\nCross-modality Augmented Multimedia Event Learning (CAMEL), which successfully\nutilizes artificially generated multimodal training data and achieves\nstate-of-the-art performance. We start with two labeled unimodal datasets in\ntext and image respectively, and generate the missing modality using\noff-the-shelf image generators like Stable Diffusion and image captioners like\nBLIP. After that, we train the network on the resultant multimodal datasets. In\norder to learn robust features that are effective across domains, we devise an\niterative and gradual training strategy. Substantial experiments show that\nCAMEL surpasses state-of-the-art (SOTA) baselines on the M2E2 benchmark. On\nmultimedia events in particular, we outperform the prior SOTA by 4.2% F1 on\nevent mention identification and by 9.8% F1 on argument identification, which\nindicates that CAMEL learns synergistic representations from the two\nmodalities. Our work demonstrates a recipe to unleash the power of synthetic\ntraining data in structured prediction.\n","authors":["Zilin Du","Yunxin Li","Xu Guo","Yidan Sun","Boyang Li"],"pdf_url":"https://arxiv.org/pdf/2306.08966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05920v1","updated":"2023-08-11T03:07:31Z","published":"2023-08-11T03:07:31Z","title":"Semantics2Hands: Transferring Hand Motion Semantics between Avatars","summary":" Human hands, the primary means of non-verbal communication, convey intricate\nsemantics in various scenarios. Due to the high sensitivity of individuals to\nhand motions, even minor errors in hand motions can significantly impact the\nuser experience. Real applications often involve multiple avatars with varying\nhand shapes, highlighting the importance of maintaining the intricate semantics\nof hand motions across the avatars. Therefore, this paper aims to transfer the\nhand motion semantics between diverse avatars based on their respective hand\nmodels. To address this problem, we introduce a novel anatomy-based semantic\nmatrix (ASM) that encodes the semantics of hand motions. The ASM quantifies the\npositions of the palm and other joints relative to the local frame of the\ncorresponding joint, enabling precise retargeting of hand motions.\nSubsequently, we obtain a mapping function from the source ASM to the target\nhand joint rotations by employing an anatomy-based semantics reconstruction\nnetwork (ASRN). We train the ASRN using a semi-supervised learning strategy on\nthe Mixamo and InterHand2.6M datasets. We evaluate our method in intra-domain\nand cross-domain hand motion retargeting tasks. The qualitative and\nquantitative results demonstrate the significant superiority of our ASRN over\nthe state-of-the-arts.\n","authors":["Zijie Ye","Jia Jia","Junliang Xing"],"pdf_url":"https://arxiv.org/pdf/2308.05920v1.pdf","comment":"Accepted to MM 2023, 9 pages, 10 figures. Project page:\n https://abcyzj.github.io/S2H/"},{"id":"http://arxiv.org/abs/2303.09695v2","updated":"2023-08-11T20:07:48Z","published":"2023-03-17T00:03:38Z","title":"PersonalTailor: Personalizing 2D Pattern Design from 3D Garment Point\n Clouds","summary":" Garment pattern design aims to convert a 3D garment to the corresponding 2D\npanels and their sewing structure. Existing methods rely either on template\nfitting with heuristics and prior assumptions, or on model learning with\ncomplicated shape parameterization. 
Importantly, both approaches do not allow\nfor personalization of the output garment, which today has increasing demands.\nTo fill this demand, we introduce PersonalTailor: a personalized 2D pattern\ndesign method, where the user can input specific constraints or demands (in\nlanguage or sketch) for personal 2D panel fabrication from 3D point clouds.\nPersonalTailor first learns a multi-modal panel embeddings based on\nunsupervised cross-modal association and attentive fusion. It then predicts a\nbinary panel masks individually using a transformer encoder-decoder framework.\nExtensive experiments show that our PersonalTailor excels on both personalized\nand standard pattern fabrication tasks.\n","authors":["Sauradip Nag","Anran Qi","Xiatian Zhu","Ariel Shamir"],"pdf_url":"https://arxiv.org/pdf/2303.09695v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2301.13617v2","updated":"2023-08-11T18:36:12Z","published":"2023-01-31T13:21:15Z","title":"A Closer Look into Recent Video-based Learning Research: A Comprehensive\n Review of Video Characteristics, Tools, Technologies, and Learning\n Effectiveness","summary":" People increasingly use videos on the Web as a source for learning. To\nsupport this way of learning, researchers and developers are continuously\ndeveloping tools, proposing guidelines, analyzing data, and conducting\nexperiments. However, it is still not clear what characteristics a video should\nhave to be an effective learning medium. In this paper, we present a\ncomprehensive review of 257 articles on video-based learning for the period\nfrom 2016 to 2021. One of the aims of the review is to identify the video\ncharacteristics that have been explored by previous work. Based on our\nanalysis, we suggest a taxonomy which organizes the video characteristics and\ncontextual aspects into eight categories: (1) audio features, (2) visual\nfeatures, (3) textual features, (4) instructor behavior, (5) learners\nactivities, (6) interactive features (quizzes, etc.), (7) production style, and\n(8) instructional design. Also, we identify four representative research\ndirections: (1) proposals of tools to support video-based learning, (2) studies\nwith controlled experiments, (3) data analysis studies, and (4) proposals of\ndesign guidelines for learning videos. We find that the most explored\ncharacteristics are textual features followed by visual features, learner\nactivities, and interactive features. Text of transcripts, video frames, and\nimages (figures and illustrations) are most frequently used by tools that\nsupport learning through videos. The learner activity is heavily explored\nthrough log files in data analysis studies, and interactive features have been\nfrequently scrutinized in controlled experiments. 
We complement our review by\ncontrasting research findings that investigate the impact of video\ncharacteristics on the learning effectiveness, report on tasks and technologies\nused to develop tools that support learning, and summarize trends of design\nguidelines to produce learning videos\n","authors":["Evelyn Navarrete","Andreas Nehring","Sascha Schanze","Ralph Ewerth","Anett Hoppe"],"pdf_url":"https://arxiv.org/pdf/2301.13617v2.pdf","comment":null}]},"2023-08-14T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.07317v1","updated":"2023-08-14T17:59:56Z","published":"2023-08-14T17:59:56Z","title":"Platypus: Quick, Cheap, and Powerful Refinement of LLMs","summary":" We present $\\textbf{Platypus}$, a family of fine-tuned and merged Large\nLanguage Models (LLMs) that achieves the strongest performance and currently\nstands at first place in HuggingFace's Open LLM Leaderboard as of the release\ndate of this work. In this work we describe (1) our curated dataset\n$\\textbf{Open-Platypus}$, that is a subset of other open datasets and which\n$\\textit{we release to the public}$ (2) our process of fine-tuning and merging\nLoRA modules in order to conserve the strong prior of pretrained LLMs, while\nbringing specific domain knowledge to the surface (3) our efforts in checking\nfor test data leaks and contamination in the training data, which can inform\nfuture research. Specifically, the Platypus family achieves strong performance\nin quantitative LLM metrics across model sizes, topping the global Open LLM\nleaderboard while using just a fraction of the fine-tuning data and overall\ncompute that are required for other state-of-the-art fine-tuned LLMs. In\nparticular, a 13B Platypus model can be trained on $\\textit{a single}$ A100 GPU\nusing 25k questions in 5 hours. This is a testament of the quality of our\nOpen-Platypus dataset, and opens opportunities for more improvements in the\nfield. Project page: https://platypus-llm.github.io\n","authors":["Ariel N. Lee","Cole J. Hunter","Nataniel Ruiz"],"pdf_url":"https://arxiv.org/pdf/2308.07317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07308v1","updated":"2023-08-14T17:54:10Z","published":"2023-08-14T17:54:10Z","title":"LLM Self Defense: By Self Examination, LLMs Know They Are Being Tricked","summary":" Large language models (LLMs) have skyrocketed in popularity in recent years\ndue to their ability to generate high-quality text in response to human\nprompting. However, these models have been shown to have the potential to\ngenerate harmful content in response to user prompting (e.g., giving users\ninstructions on how to commit crimes). There has been a focus in the literature\non mitigating these risks, through methods like aligning models with human\nvalues through reinforcement learning. However, it has been shown that even\naligned language models are susceptible to adversarial attacks that bypass\ntheir restrictions on generating harmful text. We propose a simple approach to\ndefending against these attacks by having a large language model filter its own\nresponses. 
Our current results show that even if a model is not fine-tuned to\nbe aligned with human values, it is possible to stop it from presenting harmful\ncontent to users by validating the content using a language model.\n","authors":["Alec Helbling","Mansi Phute","Matthew Hull","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2308.07308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07305v1","updated":"2023-08-14T17:46:52Z","published":"2023-08-14T17:46:52Z","title":"Neural Authorship Attribution: Stylometric Analysis on Large Language\n Models","summary":" Large language models (LLMs) such as GPT-4, PaLM, and Llama have\nsignificantly propelled the generation of AI-crafted text. With rising concerns\nabout their potential misuse, there is a pressing need for AI-generated-text\nforensics. Neural authorship attribution is a forensic effort, seeking to trace\nAI-generated text back to its originating LLM. The LLM landscape can be divided\ninto two primary categories: proprietary and open-source. In this work, we\ndelve into these emerging categories of LLMs, focusing on the nuances of neural\nauthorship attribution. To enrich our understanding, we carry out an empirical\nanalysis of LLM writing signatures, highlighting the contrasts between\nproprietary and open-source models, and scrutinizing variations within each\ngroup. By integrating stylometric features across lexical, syntactic, and\nstructural aspects of language, we explore their potential to yield\ninterpretable results and augment pre-trained language model-based classifiers\nutilized in neural authorship attribution. Our findings, based on a range of\nstate-of-the-art LLMs, provide empirical insights into neural authorship\nattribution, paving the way for future investigations aimed at mitigating the\nthreats posed by AI-generated misinformation.\n","authors":["Tharindu Kumarage","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.07305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07286v1","updated":"2023-08-14T17:17:21Z","published":"2023-08-14T17:17:21Z","title":"The Devil is in the Errors: Leveraging Large Language Models for\n Fine-grained Machine Translation Evaluation","summary":" Automatic evaluation of machine translation (MT) is a critical tool driving\nthe rapid iterative development of MT systems. While considerable progress has\nbeen made on estimating a single scalar quality score, current metrics lack the\ninformativeness of more detailed schemes that annotate individual errors, such\nas Multidimensional Quality Metrics (MQM). In this paper, we help fill this gap\nby proposing AutoMQM, a prompting technique which leverages the reasoning and\nin-context learning capabilities of large language models (LLMs) and asks them\nto identify and categorize errors in translations. We start by evaluating\nrecent LLMs, such as PaLM and PaLM-2, through simple score prediction\nprompting, and we study the impact of labeled data through in-context learning\nand finetuning. We then evaluate AutoMQM with PaLM-2 models, and we find that\nit improves performance compared to just prompting for scores (with\nparticularly large gains for larger models) while providing interpretability\nthrough error spans that align with human annotations.\n","authors":["Patrick Fernandes","Daniel Deutsch","Mara Finkelstein","Parker Riley","André F. T. Martins","Graham Neubig","Ankush Garg","Jonathan H. 
Clark","Markus Freitag","Orhan Firat"],"pdf_url":"https://arxiv.org/pdf/2308.07286v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2308.07282v1","updated":"2023-08-14T17:12:43Z","published":"2023-08-14T17:12:43Z","title":"Comparison between parameter-efficient techniques and full fine-tuning:\n A case study on multilingual news article classification","summary":" Adapters and Low-Rank Adaptation (LoRA) are parameter-efficient fine-tuning\ntechniques designed to make the training of language models more efficient.\nPrevious results demonstrated that these methods can even improve performance\non some classification tasks. This paper complements the existing research by\ninvestigating how these techniques influence the classification performance and\ncomputation costs compared to full fine-tuning when applied to multilingual\ntext classification tasks (genre, framing, and persuasion techniques detection;\nwith different input lengths, number of predicted classes and classification\ndifficulty), some of which have limited training data. In addition, we conduct\nin-depth analyses of their efficacy across different training scenarios\n(training on the original multilingual data; on the translations into English;\nand on a subset of English-only data) and different languages. Our findings\nprovide valuable insights into the applicability of the parameter-efficient\nfine-tuning techniques, particularly to complex multilingual and multilabel\nclassification tasks.\n","authors":["Olesya Razuvayevskaya","Ben Wu","Joao A. Leite","Freddy Heppell","Ivan Srba","Carolina Scarton","Kalina Bontcheva","Xingyi Song"],"pdf_url":"https://arxiv.org/pdf/2308.07282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07272v1","updated":"2023-08-14T16:58:50Z","published":"2023-08-14T16:58:50Z","title":"Dialogue for Prompting: a Policy-Gradient-Based Discrete Prompt\n Optimization for Few-shot Learning","summary":" Prompt-based pre-trained language models (PLMs) paradigm have succeeded\nsubstantially in few-shot natural language processing (NLP) tasks. However,\nprior discrete prompt optimization methods require expert knowledge to design\nthe base prompt set and identify high-quality prompts, which is costly,\ninefficient, and subjective. Meanwhile, existing continuous prompt optimization\nmethods improve the performance by learning the ideal prompts through the\ngradient information of PLMs, whose high computational cost, and low\nreadability and generalizability are often concerning. To address the research\ngap, we propose a Dialogue-comprised Policy-gradient-based Discrete Prompt\nOptimization ($DP_2O$) method. We first design a multi-round dialogue alignment\nstrategy for readability prompt set generation based on GPT-4. Furthermore, we\npropose an efficient prompt screening metric to identify high-quality prompts\nwith linear complexity. Finally, we construct a reinforcement learning (RL)\nframework based on policy gradients to match the prompts to inputs optimally.\nBy training a policy network with only 0.67% of the PLM parameter size on the\ntasks in the few-shot setting, $DP_2O$ outperforms the state-of-the-art (SOTA)\nmethod by 1.52% in accuracy on average on four open-source datasets. 
Moreover,\nsubsequent experiments also demonstrate that $DP_2O$ has good universality,\nrobustness, and generalization ability.\n","authors":["Chengzhengxu Li","Xiaoming Liu","Yichen Wang","Duyi Li","Yu Lan","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.07272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07269v1","updated":"2023-08-14T16:52:42Z","published":"2023-08-14T16:52:42Z","title":"EasyEdit: An Easy-to-use Knowledge Editing Framework for Large Language\n Models","summary":" Large Language Models (LLMs) usually suffer from knowledge cutoff or fallacy\nissues, which means they are unaware of unseen events or generate text with\nincorrect facts owing to the outdated/noisy data. To this end, many knowledge\nediting approaches for LLMs have emerged -- aiming to subtly inject/edit\nupdated knowledge or adjust undesired behavior while minimizing the impact on\nunrelated inputs. Nevertheless, due to significant differences among various\nknowledge editing methods and the variations in task setups, there is no\nstandard implementation framework available for the community, which hinders\npractitioners to apply knowledge editing to applications. To address these\nissues, we propose EasyEdit, an easy-to-use knowledge editing framework for\nLLMs. It supports various cutting-edge knowledge editing approaches and can be\nreadily apply to many well-known LLMs such as T5, GPT-J, LlaMA, etc.\nEmpirically, we report the knowledge editing results on LlaMA-2 with EasyEdit,\ndemonstrating that knowledge editing surpasses traditional fine-tuning in terms\nof reliability and generalization. We have released the source code on GitHub\nat https://github.com/zjunlp/EasyEdit, along with Google Colab tutorials and\ncomprehensive documentation for beginners to get started. Besides, we present\nan online system for real-time knowledge editing, and a demo video at\nhttp://knowlm.zjukg.cn/easyedit.mp4.\n","authors":["Peng Wang","Ningyu Zhang","Xin Xie","Yunzhi Yao","Bozhong Tian","Mengru Wang","Zekun Xi","Siyuan Cheng","Kangwei Liu","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07269v1.pdf","comment":"The project website is https://github.com/zjunlp/EasyEdit"},{"id":"http://arxiv.org/abs/2208.08063v5","updated":"2023-08-14T16:49:07Z","published":"2022-08-17T04:30:58Z","title":"NECE: Narrative Event Chain Extraction Toolkit","summary":" To understand a narrative, it is essential to comprehend the temporal event\nflows, especially those associated with main characters; however, this can be\nchallenging with lengthy and unstructured narrative texts. To address this, we\nintroduce NECE, an open-access, document-level toolkit that automatically\nextracts and aligns narrative events in the temporal order of their occurrence.\nThrough extensive evaluations, we show the high quality of the NECE toolkit and\ndemonstrates its downstream application in analyzing narrative bias regarding\ngender. We also openly discuss the shortcomings of the current approach, and\npotential of leveraging generative models in future works. 
Lastly the NECE\ntoolkit includes both a Python library and a user-friendly web interface, which\noffer equal access to professionals and layman audience alike, to visualize\nevent chain, obtain narrative flows, or study narrative bias.\n","authors":["Guangxuan Xu","Paulina Toro Isaza","Moshi Li","Akintoye Oloko","Bingsheng Yao","Cassia Sanctos","Aminat Adebiyi","Yufang Hou","Nanyun Peng","Dakuo Wang"],"pdf_url":"https://arxiv.org/pdf/2208.08063v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06259v2","updated":"2023-08-14T16:44:01Z","published":"2023-08-11T17:47:54Z","title":"Self-Alignment with Instruction Backtranslation","summary":" We present a scalable method to build a high quality instruction following\nlanguage model by automatically labelling human-written text with corresponding\ninstructions. Our approach, named instruction backtranslation, starts with a\nlanguage model finetuned on a small amount of seed data, and a given web\ncorpus. The seed model is used to construct training examples by generating\ninstruction prompts for web documents (self-augmentation), and then selecting\nhigh quality examples from among these candidates (self-curation). This data is\nthen used to finetune a stronger model. Finetuning LLaMa on two iterations of\nour approach yields a model that outperforms all other LLaMa-based models on\nthe Alpaca leaderboard not relying on distillation data, demonstrating highly\neffective self-alignment.\n","authors":["Xian Li","Ping Yu","Chunting Zhou","Timo Schick","Luke Zettlemoyer","Omer Levy","Jason Weston","Mike Lewis"],"pdf_url":"https://arxiv.org/pdf/2308.06259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16878v2","updated":"2023-08-14T16:40:31Z","published":"2023-07-31T17:41:10Z","title":"Contrastive Learning for API Aspect Analysis","summary":" We present a novel approach - CLAA - for API aspect detection in API reviews\nthat utilizes transformer models trained with a supervised contrastive loss\nobjective function. We evaluate CLAA using performance and impact analysis. For\nperformance analysis, we utilized a benchmark dataset on developer discussions\ncollected from Stack Overflow and compare the results to those obtained using\nstate-of-the-art transformer models. Our experiments show that contrastive\nlearning can significantly improve the performance of transformer models in\ndetecting aspects such as Performance, Security, Usability, and Documentation.\nFor impact analysis, we performed empirical and developer study. On a randomly\nselected and manually labeled 200 online reviews, CLAA achieved 92% accuracy\nwhile the SOTA baseline achieved 81.5%. According to our developer study\ninvolving 10 participants, the use of 'Stack Overflow + CLAA' resulted in\nincreased accuracy and confidence during API selection. Replication package:\nhttps://github.com/disa-lab/Contrastive-Learning-API-Aspect-ASE2023\n","authors":["G. M. Shahariar","Tahmid Hasan","Anindya Iqbal","Gias Uddin"],"pdf_url":"https://arxiv.org/pdf/2307.16878v2.pdf","comment":"Accepted in the 38th IEEE/ACM International Conference on Automated\n Software Engineering (ASE2023)"},{"id":"http://arxiv.org/abs/2308.07201v1","updated":"2023-08-14T15:13:04Z","published":"2023-08-14T15:13:04Z","title":"ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate","summary":" Text evaluation has historically posed significant challenges, often\ndemanding substantial labor and time cost. 
With the emergence of large language\nmodels (LLMs), researchers have explored LLMs' potential as alternatives for\nhuman evaluation. While these single-agent-based approaches show promise,\nexperimental results suggest that further advancements are needed to bridge the\ngap between their current effectiveness and human-level evaluation quality.\nRecognizing that best practices of human evaluation processes often involve\nmultiple human annotators collaborating in the evaluation, we resort to a\nmulti-agent debate framework, moving beyond single-agent prompting strategies.\nThe multi-agent-based approach enables a group of LLMs to synergize with an\narray of intelligent counterparts, harnessing their distinct capabilities and\nexpertise to enhance efficiency and effectiveness in handling intricate tasks.\nIn this paper, we construct a multi-agent referee team called ChatEval to\nautonomously discuss and evaluate the quality of generated responses from\ndifferent models on open-ended questions and traditional natural language\ngeneration (NLG) tasks. Our analysis shows that ChatEval transcends mere\ntextual scoring, offering a human-mimicking evaluation process for reliable\nassessments. Our code is available at https://github.com/chanchimin/ChatEval.\n","authors":["Chi-Min Chan","Weize Chen","Yusheng Su","Jianxuan Yu","Wei Xue","Shanghang Zhang","Jie Fu","Zhiyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.07201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07179v1","updated":"2023-08-14T14:39:02Z","published":"2023-08-14T14:39:02Z","title":"Incorporating Annotator Uncertainty into Representations of Discourse\n Relations","summary":" Annotation of discourse relations is a known difficult task, especially for\nnon-expert annotators. In this paper, we investigate novice annotators'\nuncertainty on the annotation of discourse relations on spoken conversational\ndata. We find that dialogue context (single turn, pair of turns within speaker,\nand pair of turns across speakers) is a significant predictor of confidence\nscores. We compute distributed representations of discourse relations from\nco-occurrence statistics that incorporate information about confidence scores\nand dialogue context. We perform a hierarchical clustering analysis using these\nrepresentations and show that weighting discourse relation representations with\ninformation about confidence and dialogue context coherently models our\nannotators' uncertainty about discourse relation labels.\n","authors":["S. Magalí López Cortez","Cassandra L. Jacobs"],"pdf_url":"https://arxiv.org/pdf/2308.07179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07124v1","updated":"2023-08-14T13:53:54Z","published":"2023-08-14T13:53:54Z","title":"OctoPack: Instruction Tuning Code Large Language Models","summary":" Finetuning large language models (LLMs) on instructions leads to vast\nperformance improvements on natural language tasks. We apply instruction tuning\nusing code, leveraging the natural structure of Git commits, which pair code\nchanges with human instructions. We compile CommitPack: 4 terabytes of Git\ncommits across 350 programming languages. We benchmark CommitPack against other\nnatural and synthetic code instructions (xP3x, Self-Instruct, OASST) on the 16B\nparameter StarCoder model, and achieve state-of-the-art performance among\nmodels not trained on OpenAI outputs, on the HumanEval Python benchmark (46.2%\npass@1). 
We further introduce HumanEvalPack, expanding the HumanEval benchmark\nto a total of 3 coding tasks (Code Repair, Code Explanation, Code Synthesis)\nacross 6 languages (Python, JavaScript, Java, Go, C++, Rust). Our models,\nOctoCoder and OctoGeeX, achieve the best performance across HumanEvalPack among\nall permissive models, demonstrating CommitPack's benefits in generalizing to a\nwider set of languages and natural coding tasks. Code, models and data are\nfreely available at https://github.com/bigcode-project/octopack.\n","authors":["Niklas Muennighoff","Qian Liu","Armel Zebaze","Qinkai Zheng","Binyuan Hui","Terry Yue Zhuo","Swayam Singh","Xiangru Tang","Leandro von Werra","Shayne Longpre"],"pdf_url":"https://arxiv.org/pdf/2308.07124v1.pdf","comment":"57 pages (9 main), 39 figures, 16 tables"},{"id":"http://arxiv.org/abs/2308.07134v1","updated":"2023-08-14T13:41:09Z","published":"2023-08-14T13:41:09Z","title":"Natural Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models, such as ChatGPT,\nhas revolutionized various research fields in artificial intelligence.\nTransformers-based large language models (LLMs) have gradually replaced CNNs\nand RNNs to unify fields of computer vision and natural language processing.\nCompared with the data that exists relatively independently such as images,\nvideos or texts, graph is a type of data that contains rich structural and\nrelational information. Meanwhile, natural language, as one of the most\nexpressive mediums, excels in describing complex structures. However, existing\nwork on incorporating graph learning problems into the generative language\nmodeling framework remains very limited. As the importance of language models\ncontinues to grow, it becomes essential to explore whether LLMs can also\nreplace GNNs as the foundational model for graphs. In this paper, we propose\nInstructGLM (Instruction-finetuned Graph Language Model), systematically design\nhighly scalable prompts based on natural language instructions, and use natural\nlanguage to describe the geometric structure and node features of the graph for\ninstruction tuning an LLMs to perform learning and inference on graphs in a\ngenerative manner. Our method exceeds all competitive GNN baselines on\nogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of\nour method and sheds light on generative language models replacing GNNs as the\nfoundation model for graph machine learning.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v1.pdf","comment":"21 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.07120v1","updated":"2023-08-14T13:00:53Z","published":"2023-08-14T13:00:53Z","title":"Mind your Language (Model): Fact-Checking LLMs and their Role in NLP\n Research and Practice","summary":" Much of the recent discourse within the NLP research community has been\ncentered around Large Language Models (LLMs), their functionality and potential\n-- yet not only do we not have a working definition of LLMs, but much of this\ndiscourse relies on claims and assumptions that are worth re-examining. This\nposition paper contributes a definition of LLMs, explicates some of the\nassumptions made regarding their functionality, and outlines the existing\nevidence for and against them. 
We conclude with suggestions for research\ndirections and their framing in future work.\n","authors":["Alexandra Sasha Luccioni","Anna Rogers"],"pdf_url":"https://arxiv.org/pdf/2308.07120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07107v1","updated":"2023-08-14T12:47:22Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. Additionally, we explore promising directions within\nthis expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07102v1","updated":"2023-08-14T12:30:58Z","published":"2023-08-14T12:30:58Z","title":"Temporal Sentence Grounding in Streaming Videos","summary":" This paper aims to tackle a novel task - Temporal Sentence Grounding in\nStreaming Videos (TSGSV). The goal of TSGSV is to evaluate the relevance\nbetween a video stream and a given sentence query. Unlike regular videos,\nstreaming videos are acquired continuously from a particular source, and are\nalways desired to be processed on-the-fly in many applications such as\nsurveillance and live-stream analysis. Thus, TSGSV is challenging since it\nrequires the model to infer without future frames and process long historical\nframes effectively, which is untouched in the early methods. To specifically\naddress the above challenges, we propose two novel methods: (1) a TwinNet\nstructure that enables the model to learn about upcoming events; and (2) a\nlanguage-guided feature compressor that eliminates redundant visual frames and\nreinforces the frames that are relevant to the query. We conduct extensive\nexperiments using ActivityNet Captions, TACoS, and MAD datasets. The results\ndemonstrate the superiority of our proposed methods. 
A systematic ablation\nstudy also confirms their effectiveness.\n","authors":["Tian Gan","Xiao Wang","Yan Sun","Jianlong Wu","Qingpei Guo","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2308.07102v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2211.08233v3","updated":"2023-08-14T11:50:53Z","published":"2022-11-14T13:35:01Z","title":"Temporal Modeling Matters: A Novel Temporal Emotional Modeling Approach\n for Speech Emotion Recognition","summary":" Speech emotion recognition (SER) plays a vital role in improving the\ninteractions between humans and machines by inferring human emotion and\naffective states from speech signals. Whereas recent works primarily focus on\nmining spatiotemporal information from hand-crafted features, we explore how to\nmodel the temporal patterns of speech emotions from dynamic temporal scales.\nTowards that goal, we introduce a novel temporal emotional modeling approach\nfor SER, termed Temporal-aware bI-direction Multi-scale Network (TIM-Net),\nwhich learns multi-scale contextual affective representations from various time\nscales. Specifically, TIM-Net first employs temporal-aware blocks to learn\ntemporal affective representation, then integrates complementary information\nfrom the past and the future to enrich contextual representations, and finally,\nfuses multiple time scale features for better adaptation to the emotional\nvariation. Extensive experimental results on six benchmark SER datasets\ndemonstrate the superior performance of TIM-Net, gaining 2.34% and 2.61%\nimprovements of the average UAR and WAR over the second-best on each corpus.\nThe source code is available at https://github.com/Jiaxin-Ye/TIM-Net_SER.\n","authors":["Jiaxin Ye","Xin-cheng Wen","Yujie Wei","Yong Xu","Kunhong Liu","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2211.08233v3.pdf","comment":"ICASSP 2023"},{"id":"http://arxiv.org/abs/2308.07081v1","updated":"2023-08-14T11:26:25Z","published":"2023-08-14T11:26:25Z","title":"Aesthetics of Sanskrit Poetry from the Perspective of Computational\n Linguistics: A Case Study Analysis on Siksastaka","summary":" Sanskrit poetry has played a significant role in shaping the literary and\ncultural landscape of the Indian subcontinent for centuries. However, not much\nattention has been devoted to uncovering the hidden beauty of Sanskrit poetry\nin computational linguistics. This article explores the intersection of\nSanskrit poetry and computational linguistics by proposing a roadmap of an\ninterpretable framework to analyze and classify the qualities and\ncharacteristics of fine Sanskrit poetry. We discuss the rich tradition of\nSanskrit poetry and the significance of computational linguistics in\nautomatically identifying the characteristics of fine poetry. The proposed\nframework involves a human-in-the-loop approach that combines deterministic\naspects delegated to machines and deep semantics left to human experts. We\nprovide a deep analysis of Siksastaka, a Sanskrit poem, from the perspective of\n6 prominent kavyashastra schools, to illustrate the proposed framework.\nAdditionally, we provide compound, dependency, anvaya (prose order linearised\nform), meter, rasa (mood), alankar (figure of speech), and riti (writing style)\nannotations for Siksastaka and a web application to illustrate the poem's\nanalysis and annotations. Our key contributions include the proposed framework,\nthe analysis of Siksastaka, the annotations and the web application for future\nresearch. 
Link for interactive analysis:\nhttps://sanskritshala.github.io/shikshastakam/\n","authors":["Jivnesh Sandhan","Amruta Barbadikar","Malay Maity","Pavankumar Satuluri","Tushar Sandhan","Ravi M. Gupta","Pawan Goyal","Laxmidhar Behera"],"pdf_url":"https://arxiv.org/pdf/2308.07081v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2308.07074v1","updated":"2023-08-14T11:16:28Z","published":"2023-08-14T11:16:28Z","title":"#InsTag: Instruction Tagging for Diversity and Complexity Analysis","summary":" Foundation language models obtain the instruction-following ability through\nsupervised fine-tuning (SFT). Diversity and complexity are considered critical\nfactors of a successful SFT dataset, while their definitions remain obscure and\nlack quantitative analyses. In this work, we propose InsTag, an open-set\nfine-grained tagger, to tag samples within SFT datasets based on semantics and\nintentions and define instruction diversity and complexity regarding tags. We\nobtain 6.6K tags to describe comprehensive user queries. Then we analyze\npopular open-sourced SFT datasets and find that the model ability grows with\nmore diverse and complex data. Based on this observation, we propose a data\nselector based on InsTag to select 6K diverse and complex samples from\nopen-source datasets and fine-tune models on InsTag-selected data. The\nresulting models, TagLM, outperform open-source models based on considerably\nlarger SFT data evaluated by MT-Bench, echoing the importance of query\ndiversity and complexity. We open-source InsTag in\nhttps://github.com/OFA-Sys/InsTag.\n","authors":["Keming Lu","Hongyi Yuan","Zheng Yuan","Runji Lin","Junyang Lin","Chuanqi Tan","Chang Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.07074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01776v2","updated":"2023-08-14T09:15:42Z","published":"2023-08-03T14:09:31Z","title":"Does Correction Remain A Problem For Large Language Models?","summary":" As large language models, such as GPT, continue to advance the capabilities\nof natural language processing (NLP), the question arises: does the problem of\ncorrection still persist? This paper investigates the role of correction in the\ncontext of large language models by conducting two experiments. The first\nexperiment focuses on correction as a standalone task, employing few-shot\nlearning techniques with GPT-like models for error correction. The second\nexperiment explores the notion of correction as a preparatory task for other\nNLP tasks, examining whether large language models can tolerate and perform\nadequately on texts containing certain levels of noise or errors. By addressing\nthese experiments, we aim to shed light on the significance of correction in\nthe era of large language models and its implications for various NLP\napplications.\n","authors":["Xiaowu Zhang","Xiaotian Zhang","Cheng Yang","Hang Yan","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.01776v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01861v2","updated":"2023-08-14T09:07:00Z","published":"2023-08-03T16:31:02Z","title":"ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on\n Class-level Code Generation","summary":" In this work, we make the first attempt to evaluate LLMs in a more\nchallenging code generation scenario, i.e. class-level code generation. We\nfirst manually construct the first class-level code generation benchmark\nClassEval of 100 class-level Python code generation tasks with approximately\n500 person-hours. 
Based on it, we then perform the first study of 11\nstate-of-the-art LLMs on class-level code generation. Based on our results, we\nhave the following main findings. First, we find that all existing LLMs show\nmuch worse performance on class-level code generation compared to on standalone\nmethod-level code generation benchmarks like HumanEval; and the method-level\ncoding ability cannot equivalently reflect the class-level coding ability among\nLLMs. Second, we find that GPT-4 and GPT-3.5 still exhibit dominate superior\nthan other LLMs on class-level code generation, and the second-tier models\nincludes Instruct-Starcoder, Instruct-Codegen, and Wizardcoder with very\nsimilar performance. Third, we find that generating the entire class all at\nonce (i.e. holistic generation strategy) is the best generation strategy only\nfor GPT-4 and GPT-3.5, while method-by-method generation (i.e. incremental and\ncompositional) is better strategies for the other models with limited ability\nof understanding long instructions and utilizing the middle information.\nLastly, we find the limited model ability of generating method-dependent code\nand discuss the frequent error types in generated classes. Our benchmark is\navailable at https://github.com/FudanSELab/ClassEval.\n","authors":["Xueying Du","Mingwei Liu","Kaixin Wang","Hanlin Wang","Junwei Liu","Yixuan Chen","Jiayi Feng","Chaofeng Sha","Xin Peng","Yiling Lou"],"pdf_url":"https://arxiv.org/pdf/2308.01861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05221v2","updated":"2023-08-14T08:39:37Z","published":"2023-03-09T12:50:34Z","title":"SEAM: An Integrated Activation-Coupled Model of Sentence Processing and\n Eye Movements in Reading","summary":" Models of eye-movement control during reading, developed largely within\npsychology, usually focus on visual, attentional, lexical, and motor processes\nbut neglect post-lexical language processing; by contrast, models of sentence\ncomprehension processes, developed largely within psycholinguistics, generally\nfocus only on post-lexical language processes. We present a model that combines\nthese two research threads, by integrating eye-movement control and sentence\nprocessing. Developing such an integrated model is extremely challenging and\ncomputationally demanding, but such an integration is an important step toward\ncomplete mathematical models of natural language comprehension in reading. We\ncombine the SWIFT model of eye-movement control (Seelig et al., 2020,\ndoi:10.1016/j.jmp.2019.102313) with key components of the Lewis and Vasishth\nsentence processing model (Lewis & Vasishth, 2005,\ndoi:10.1207/s15516709cog0000_25). This integration becomes possible, for the\nfirst time, due in part to recent advances in successful parameter\nidentification in dynamical models, which allows us to investigate profile\nlog-likelihoods for individual model parameters. We present a fully implemented\nproof-of-concept model demonstrating how such an integrated model can be\nachieved; our approach includes Bayesian model inference with Markov Chain\nMonte Carlo (MCMC) sampling as a key computational tool. The integrated model,\nSEAM, can successfully reproduce eye movement patterns that arise due to\nsimilarity-based interference in reading. To our knowledge, this is the\nfirst-ever integration of a complete process model of eye-movement control with\nlinguistic dependency completion processes in sentence comprehension. 
In future\nwork, this proof of concept model will need to be evaluated using a\ncomprehensive set of benchmark data.\n","authors":["Maximilian M. Rabe","Dario Paape","Daniela Mertzen","Shravan Vasishth","Ralf Engbert"],"pdf_url":"https://arxiv.org/pdf/2303.05221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00304v2","updated":"2023-08-14T08:11:15Z","published":"2023-08-01T05:54:12Z","title":"Skills-in-Context Prompting: Unlocking Compositionality in Large\n Language Models","summary":" We consider the problem of eliciting compositional generalization\ncapabilities in large language models (LLMs) with a novel type of prompting\nstrategy. Compositional generalization empowers the LLMs to solve problems that\nare harder than the ones they have seen (i.e., easy-to-hard generalization),\nwhich is a critical reasoning capability of human-like intelligence. However,\neven the current state-of-the-art LLMs still struggle with this form of\nreasoning. To bridge this gap, we propose skills-in-context (SKiC) prompting,\nwhich instructs LLMs how to compose basic skills to resolve more complex\nproblems. We find that it is crucial to demonstrate both the skills and the\ncompositional examples within the same prompting context. With as few as two\nexamplars, our SKiC prompting initiates strong synergies between skills and\ntheir composition capabilities. Notably, it empowers LLMs to solve unseen\nproblems that require innovative skill compositions, achieving near-perfect\ngeneralization on a broad range of challenging compositionality tasks.\nIntriguingly, SKiC prompting unlocks the latent potential of LLMs, enabling\nthem to leverage pre-existing internal skills acquired during earlier\npre-training stages, even when these skills are not explicitly presented in the\nprompting context. This results in the capability of LLMs to solve unseen\ncomplex problems by activating and composing internal competencies. With such\nprominent features, SKiC prompting is able to achieve state-of-the-art\nperformance on challenging mathematical reasoning benchmarks (e.g., MATH).\n","authors":["Jiaao Chen","Xiaoman Pan","Dian Yu","Kaiqiang Song","Xiaoyang Wang","Dong Yu","Jianshu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.00304v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16680v3","updated":"2023-08-14T07:59:36Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, This paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. 
These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v3.pdf","comment":"draft version"},{"id":"http://arxiv.org/abs/2308.06111v2","updated":"2023-08-14T07:45:17Z","published":"2023-08-11T12:55:09Z","title":"Improving Zero-Shot Text Matching for Financial Auditing with Large\n Language Models","summary":" Auditing financial documents is a very tedious and time-consuming process. As\nof today, it can already be simplified by employing AI-based solutions to\nrecommend relevant text passages from a report for each legal requirement of\nrigorous accounting standards. However, these methods need to be fine-tuned\nregularly, and they require abundant annotated data, which is often lacking in\nindustrial environments. Hence, we present ZeroShotALI, a novel recommender\nsystem that leverages a state-of-the-art large language model (LLM) in\nconjunction with a domain-specifically optimized transformer-based\ntext-matching solution. We find that a two-step approach of first retrieving a\nnumber of best matching document sections per legal requirement with a custom\nBERT-based model and second filtering these selections using an LLM yields\nsignificant performance improvements over existing approaches.\n","authors":["Lars Hillebrand","Armin Berger","Tobias Deußer","Tim Dilmaghani","Mohamed Khaled","Bernd Kliem","Rüdiger Loitz","Maren Pielka","David Leonhard","Christian Bauckhage","Rafet Sifa"],"pdf_url":"https://arxiv.org/pdf/2308.06111v2.pdf","comment":"Accepted at DocEng 2023, 4 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2308.06975v1","updated":"2023-08-14T07:20:49Z","published":"2023-08-14T07:20:49Z","title":"Can Knowledge Graphs Simplify Text?","summary":" Knowledge Graph (KG)-to-Text Generation has seen recent improvements in\ngenerating fluent and informative sentences which describe a given KG. As KGs\nare widespread across multiple domains and contain important entity-relation\ninformation, and as text simplification aims to reduce the complexity of a text\nwhile preserving the meaning of the original text, we propose KGSimple, a novel\napproach to unsupervised text simplification which infuses KG-established\ntechniques in order to construct a simplified KG path and generate a concise\ntext which preserves the original input's meaning. Through an iterative and\nsampling KG-first approach, our model is capable of simplifying text when\nstarting from a KG by learning to keep important information while harnessing\nKG-to-text generation to output fluent and descriptive sentences. We evaluate\nvarious settings of the KGSimple model on currently-available KG-to-text\ndatasets, demonstrating its effectiveness compared to unsupervised text\nsimplification models which start with a given complex text. 
Our code is\navailable on GitHub.\n","authors":["Anthony Colas","Haodi Ma","Xuanli He","Yang Bai","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2308.06975v1.pdf","comment":"Accepted as a Main Conference Long Paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2308.06966v1","updated":"2023-08-14T06:49:53Z","published":"2023-08-14T06:49:53Z","title":"EcomGPT: Instruction-tuning Large Language Model with Chain-of-Task\n Tasks for E-commerce","summary":" Recently, instruction-following Large Language Models (LLMs), represented by\nChatGPT, have exhibited exceptional performance in general Natural Language\nProcessing (NLP) tasks. However, the unique characteristics of E-commerce data\npose significant challenges to general LLMs. An LLM tailored specifically for\nE-commerce scenarios, possessing robust cross-dataset/task generalization\ncapabilities, is a pressing necessity. To solve this issue, in this work, we\npropose the first e-commerce instruction dataset EcomInstruct, with a total of\n2.5 million instruction examples. EcomInstruct scales up the data size and task\ndiversity by constructing atomic tasks with basic E-commerce data types, such\nas product information and user reviews. Atomic tasks are defined as intermediate\ntasks implicitly involved in solving a final task, which we also call\nChain-of-Task tasks. We develop EcomGPT with different parameter scales by\ntraining the backbone model BLOOMZ with EcomInstruct. Benefiting from the\nfundamental semantic understanding capabilities acquired from the Chain-of-Task\ntasks, EcomGPT exhibits excellent zero-shot generalization capabilities.\nExtensive experiments and human evaluations demonstrate that EcomGPT\noutperforms ChatGPT in terms of cross-dataset/task generalization on E-commerce\ntasks.\n","authors":["Yangning Li","Shirong Ma","Xiaobin Wang","Shen Huang","Chengyue Jiang","Hai-Tao Zheng","Pengjun Xie","Fei Huang","Yong Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.06966v1.pdf","comment":"Initial version of EcomGPT"},{"id":"http://arxiv.org/abs/2308.06953v1","updated":"2023-08-14T06:09:51Z","published":"2023-08-14T06:09:51Z","title":"Thresh: A Unified, Customizable and Deployable Platform for Fine-Grained\n Text Evaluation","summary":" Fine-grained, span-level human evaluation has emerged as a reliable and\nrobust method for evaluating text generation tasks such as summarization,\nsimplification, machine translation and news generation, and the derived\nannotations have been useful for training automatic metrics and improving\nlanguage models. However, existing annotation tools implemented for these\nevaluation frameworks lack the adaptability to be extended to different domains\nor languages, or to modify annotation settings according to user needs. Moreover, the\nabsence of a unified annotated data format inhibits research in multi-task\nlearning. In this paper, we introduce Thresh, a unified, customizable and\ndeployable platform for fine-grained evaluation. By simply creating a YAML\nconfiguration file, users can build and test an annotation interface for any\nframework within minutes -- all in one web browser window. To facilitate\ncollaboration and sharing, Thresh provides a community hub that hosts a\ncollection of fine-grained frameworks and corresponding annotations made and\ncollected by the community, covering a wide range of NLP tasks. For deployment,\nThresh offers multiple options for any scale of annotation projects, from small\nmanual inspections to large crowdsourcing ones. 
Additionally, we introduce a\nPython library to streamline the entire process from typology design and\ndeployment to annotation processing. Thresh is publicly accessible at\nhttps://thresh.tools.\n","authors":["David Heineman","Yao Dou","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2308.06953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06942v1","updated":"2023-08-14T05:22:33Z","published":"2023-08-14T05:22:33Z","title":"Approximating Human-Like Few-shot Learning with GPT-based Compression","summary":" In this work, we conceptualize the learning process as information\ncompression. We seek to equip generative pre-trained models with human-like\nlearning capabilities that enable data compression during inference. We present\na novel approach that utilizes the Generative Pre-trained Transformer (GPT) to\napproximate Kolmogorov complexity, with the aim of estimating the optimal\nInformation Distance for few-shot learning. We first propose using GPT as a\nprior for lossless text compression, achieving a noteworthy compression ratio.\nAn experiment with a LLAMA2-7B backbone achieves a compression ratio of 15.5\non enwik9. We justify the pre-training objective of GPT models by demonstrating\nits equivalence to the compression length, and, consequently, its ability to\napproximate the information distance for texts. Leveraging the approximated\ninformation distance, our method allows the direct application of GPT models in\nquantitative text similarity measurements. Experimental results show that our\nmethod overall achieves superior performance compared to embedding and prompt\nbaselines on challenging NLP tasks, including semantic similarity, zero- and\none-shot text classification, and zero-shot text ranking.\n","authors":["Cynthia Huang","Yuqing Xie","Zhiying Jiang","Jimmy Lin","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2308.06942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06912v1","updated":"2023-08-14T03:14:38Z","published":"2023-08-14T03:14:38Z","title":"CausalLM is not optimal for in-context learning","summary":" Recent empirical evidence indicates that transformer-based in-context\nlearning performs better when using a prefix language model (prefixLM), in\nwhich in-context samples can all attend to each other, compared to causal\nlanguage models (causalLM), which use auto-regressive attention that prohibits\nin-context samples from attending to future samples. While this result is intuitive,\nit is not understood from a theoretical perspective. In this paper, we take a\ntheoretical approach and analyze the convergence behavior of prefixLM and\ncausalLM under a certain parameter construction. Our analysis shows that both\nLM types converge to their stationary points at a linear rate, but that while\nprefixLM converges to the optimal solution of linear regression, causalLM\nconvergence dynamics follows that of an online gradient descent algorithm,\nwhich is not guaranteed to be optimal even as the number of samples grows\ninfinitely. We supplement our theoretical claims with empirical experiments\nover synthetic and real tasks and using various types of transformers. 
Our\nexperiments verify that causalLM consistently underperforms prefixLM in all\nsettings.\n","authors":["Nan Ding","Tomer Levinboim","Jialin Wu","Sebastian Goodman","Radu Soricut"],"pdf_url":"https://arxiv.org/pdf/2308.06912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06911v1","updated":"2023-08-14T03:12:29Z","published":"2023-08-14T03:12:29Z","title":"GIT-Mol: A Multi-modal Large Language Model for Molecular Science with\n Graph, Image, and Text","summary":" Large language models have made significant strides in natural language\nprocessing, paving the way for innovative applications including molecular\nrepresentation and generation. However, most existing single-modality\napproaches cannot capture the abundant and complex information in molecular\ndata. Here, we introduce GIT-Mol, a multi-modal large language model that\nintegrates the structure Graph, Image, and Text information, including the\nSimplified Molecular Input Line Entry System (SMILES) and molecular captions.\nTo facilitate the integration of multi-modal molecular data, we propose\nGIT-Former, a novel architecture capable of mapping all modalities into a\nunified latent space. Our study develops an innovative any-to-language\nmolecular translation strategy and achieves a 10%-15% improvement in molecular\ncaptioning, a 5%-10% accuracy increase in property prediction, and a 20% boost\nin molecule generation validity compared to baseline or single-modality models.\n","authors":["Pengfei Liu","Yiming Ren","Zhixiang Ren"],"pdf_url":"https://arxiv.org/pdf/2308.06911v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.03549v2","updated":"2023-08-14T02:59:52Z","published":"2023-08-07T12:56:13Z","title":"Zhongjing: Enhancing the Chinese Medical Capabilities of Large Language\n Model through Expert Feedback and Real-world Multi-turn Dialogue","summary":" Recent advances in Large Language Models (LLMs) have achieved remarkable\nbreakthroughs in understanding and responding to user intents. However, their\nperformance lag behind general use cases in some expertise domains, such as\nChinese medicine. Existing efforts to incorporate Chinese medicine into LLMs\nrely on Supervised Fine-Tuning (SFT) with single-turn and distilled dialogue\ndata. These models lack the ability for doctor-like proactive inquiry and\nmulti-turn comprehension and cannot always align responses with safety and\nprofessionalism experts. In this work, we introduce Zhongjing, the first\nChinese medical LLaMA-based LLM that implements an entire training pipeline\nfrom pre-training to reinforcement learning with human feedback (RLHF).\nAdditionally, we introduce a Chinese multi-turn medical dialogue dataset of\n70,000 authentic doctor-patient dialogues, CMtMedQA, which significantly\nenhances the model's capability for complex dialogue and proactive inquiry\ninitiation. We define a refined annotation rule and evaluation criteria given\nthe biomedical domain's unique characteristics. Results show that our model\noutperforms baselines in various capacities and matches the performance of\nChatGPT in a few abilities, despite having 50x training data with previous best\nmodel and 100x parameters with ChatGPT. 
RLHF further improves the model's\ninstruction-following ability and safety.We also release our code, datasets and\nmodel for further research.\n","authors":["Songhua Yang","Hanjie Zhao","Senbin Zhu","Guangyu Zhou","Hongfei Xu","Yuxiang Jia","Hongying Zan"],"pdf_url":"https://arxiv.org/pdf/2308.03549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06907v1","updated":"2023-08-14T02:59:27Z","published":"2023-08-14T02:59:27Z","title":"Generative Interpretation","summary":" We introduce generative interpretation, a new approach to estimating\ncontractual meaning using large language models. As AI triumphalism is the\norder of the day, we proceed by way of grounded case studies, each illustrating\nthe capabilities of these novel tools in distinct ways. Taking well-known\ncontracts opinions, and sourcing the actual agreements that they adjudicated,\nwe show that AI models can help factfinders ascertain ordinary meaning in\ncontext, quantify ambiguity, and fill gaps in parties' agreements. We also\nillustrate how models can calculate the probative value of individual pieces of\nextrinsic evidence. After offering best practices for the use of these models\ngiven their limitations, we consider their implications for judicial practice\nand contract theory. Using LLMs permits courts to estimate what the parties\nintended cheaply and accurately, and as such generative interpretation\nunsettles the current interpretative stalemate. Their use responds to\nefficiency-minded textualists and justice-oriented contextualists, who argue\nabout whether parties will prefer cost and certainty or accuracy and fairness.\nParties--and courts--would prefer a middle path, in which adjudicators strive\nto predict what the contract really meant, admitting just enough context to\napproximate reality while avoiding unguided and biased assimilation of\nevidence. As generative interpretation offers this possibility, we argue it can\nbecome the new workhorse of contractual interpretation.\n","authors":["Yonathan A. Arbel","David Hoffman"],"pdf_url":"https://arxiv.org/pdf/2308.06907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06873v1","updated":"2023-08-14T01:01:19Z","published":"2023-08-14T01:01:19Z","title":"SpeechX: Neural Codec Language Model as a Versatile Speech Transformer","summary":" Recent advancements in generative speech models based on audio-text prompts\nhave enabled remarkable innovations like high-quality zero-shot text-to-speech.\nHowever, existing models still face limitations in handling diverse audio-text\nspeech generation tasks involving transforming input speech and processing\naudio captured in adverse acoustic conditions. This paper introduces SpeechX, a\nversatile speech generation model capable of zero-shot TTS and various speech\ntransformation tasks, dealing with both clean and noisy signals. SpeechX\ncombines neural codec language modeling with multi-task learning using\ntask-dependent prompting, enabling unified and extensible modeling and\nproviding a consistent way for leveraging textual input in speech enhancement\nand transformation tasks. Experimental results show SpeechX's efficacy in\nvarious tasks, including zero-shot TTS, noise suppression, target speaker\nextraction, speech removal, and speech editing with or without background\nnoise, achieving comparable or superior performance to specialized models\nacross tasks. 
See https://aka.ms/speechx for demo samples.\n","authors":["Xiaofei Wang","Manthan Thakker","Zhuo Chen","Naoyuki Kanda","Sefik Emre Eskimez","Sanyuan Chen","Min Tang","Shujie Liu","Jinyu Li","Takuya Yoshioka"],"pdf_url":"https://arxiv.org/pdf/2308.06873v1.pdf","comment":"See https://aka.ms/speechx for demo samples"},{"id":"http://arxiv.org/abs/2308.07489v1","updated":"2023-08-14T22:47:19Z","published":"2023-08-14T22:47:19Z","title":"SOTASTREAM: A Streaming Approach to Machine Translation Training","summary":" Many machine translation toolkits make use of a data preparation step wherein\nraw data is transformed into a tensor format that can be used directly by the\ntrainer. This preparation step is increasingly at odds with modern research and\ndevelopment practices because this process produces a static, unchangeable\nversion of the training data, making common training-time needs difficult\n(e.g., subword sampling), time-consuming (preprocessing with large data can\ntake days), expensive (e.g., disk space), and cumbersome (managing experiment\ncombinatorics). We propose an alternative approach that separates the\ngeneration of data from the consumption of that data. In this approach, there\nis no separate pre-processing step; data generation produces an infinite stream\nof permutations of the raw training data, which the trainer tensorizes and\nbatches as it is consumed. Additionally, this data stream can be manipulated by\na set of user-definable operators that provide on-the-fly modifications, such\nas data normalization, augmentation or filtering. We release an open-source\ntoolkit, SOTASTREAM, that implements this approach:\nhttps://github.com/marian-nmt/sotastream. We show that it cuts training time,\nadds flexibility, reduces experiment management complexity, and reduces disk\nspace, all without affecting the accuracy of the trained models.\n","authors":["Matt Post","Thamme Gowda","Roman Grundkiewicz","Huda Khayrallah","Rohit Jain","Marcin Junczys-Dowmunt"],"pdf_url":"https://arxiv.org/pdf/2308.07489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07486v1","updated":"2023-08-14T22:36:27Z","published":"2023-08-14T22:36:27Z","title":"O-1: Self-training with Oracle and 1-best Hypothesis","summary":" We introduce O-1, a new self-training objective to reduce training bias and\nunify training and evaluation metrics for speech recognition. O-1 is a faster\nvariant of Expected Minimum Bayes Risk (EMBR), that boosts the oracle\nhypothesis and can accommodate both supervised and unsupervised data. We\ndemonstrate the effectiveness of our approach in terms of recognition on\npublicly available SpeechStew datasets and a large-scale, in-house data set. On\nSpeechstew, the O-1 objective closes the gap between the actual and oracle\nperformance by 80\\% relative compared to EMBR which bridges the gap by 43\\%\nrelative. O-1 achieves 13\\% to 25\\% relative improvement over EMBR on the\nvarious datasets that SpeechStew comprises of, and a 12\\% relative gap\nreduction with respect to the oracle WER over EMBR training on the in-house\ndataset. 
Overall, O-1 results in a 9\\% relative improvement in WER over EMBR,\nthereby speaking to the scalability of the proposed objective for large-scale\ndatasets.\n","authors":["Murali Karthick Baskar","Andrew Rosenberg","Bhuvana Ramabhadran","Kartik Audhkhasi"],"pdf_url":"https://arxiv.org/pdf/2308.07486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07462v1","updated":"2023-08-14T21:19:44Z","published":"2023-08-14T21:19:44Z","title":"Playing with Words: Comparing the Vocabulary and Lexical Richness of\n ChatGPT and Humans","summary":" The introduction of Artificial Intelligence (AI) generative language models\nsuch as GPT (Generative Pre-trained Transformer) and tools such as ChatGPT has\ntriggered a revolution that can transform how text is generated. This has many\nimplications, for example, as AI-generated text becomes a significant fraction\nof the text in many disciplines, would this have an effect on the language\ncapabilities of readers and also on the training of newer AI tools? Would it\naffect the evolution of languages? Focusing on one specific aspect of the\nlanguage: words; will the use of tools such as ChatGPT increase or reduce the\nvocabulary used or the lexical richness (understood as the number of different\nwords used in a written or oral production) when writing a given text? This has\nimplications for words, as those not included in AI-generated content will tend\nto be less and less popular and may eventually be lost. In this work, we\nperform an initial comparison of the vocabulary and lexical richness of ChatGPT\nand humans when performing the same tasks. In more detail, two datasets\ncontaining the answers to different types of questions answered by ChatGPT and\nhumans are used, and the analysis shows that ChatGPT tends to use fewer\ndistinct words and lower lexical richness than humans. These results are very\npreliminary and additional datasets and ChatGPT configurations have to be\nevaluated to extract more general conclusions. Therefore, further research is\nneeded to understand how the use of ChatGPT and more broadly generative AI\ntools will affect the vocabulary and lexical richness in different types of\ntext and languages.\n","authors":["Pedro Reviriego","Javier Conde","Elena Merino-Gómez","Gonzalo Martínez","José Alberto Hernández"],"pdf_url":"https://arxiv.org/pdf/2308.07462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07407v1","updated":"2023-08-14T18:52:03Z","published":"2023-08-14T18:52:03Z","title":"Development and Evaluation of Three Chatbots for Postpartum Mood and\n Anxiety Disorders","summary":" In collaboration with Postpartum Support International (PSI), a non-profit\norganization dedicated to supporting caregivers with postpartum mood and\nanxiety disorders, we developed three chatbots to provide context-specific\nempathetic support to postpartum caregivers, leveraging both rule-based and\ngenerative models. We present and evaluate the performance of our chatbots\nusing both machine-based metrics and human-based questionnaires. Overall, our\nrule-based model achieves the best performance, with outputs that are close to\nground truth reference and contain the highest levels of empathy. Human users\nprefer the rule-based chatbot over the generative chatbot for its\ncontext-specific and human-like replies. Our generative chatbot also produced\nempathetic responses and was described by human users as engaging. However,\nlimitations in the training dataset often result in confusing or nonsensical\nresponses. 
We conclude by discussing practical benefits of rule-based vs.\ngenerative models for supporting individuals with mental health challenges. In\nlight of the recent surge of ChatGPT and BARD, we also discuss the\npossibilities and pitfalls of large language models for digital mental\nhealthcare.\n","authors":["Xuewen Yao","Miriam Mikhelson","S. Craig Watkins","Eunsol Choi","Edison Thomaz","Kaya de Barbaro"],"pdf_url":"https://arxiv.org/pdf/2308.07407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06203v2","updated":"2023-08-14T18:45:55Z","published":"2023-04-13T00:34:32Z","title":"LeafAI: query generator for clinical cohort discovery rivaling a human\n programmer","summary":" Objective: Identifying study-eligible patients within clinical databases is a\ncritical step in clinical research. However, accurate query design typically\nrequires extensive technical and biomedical expertise. We sought to create a\nsystem capable of generating data model-agnostic queries while also providing\nnovel logical reasoning capabilities for complex clinical trial eligibility\ncriteria.\n Materials and Methods: The task of query creation from eligibility criteria\nrequires solving several text-processing problems, including named entity\nrecognition and relation extraction, sequence-to-sequence transformation,\nnormalization, and reasoning. We incorporated hybrid deep learning and\nrule-based modules for these, as well as a knowledge base of the Unified\nMedical Language System (UMLS) and linked ontologies. To enable data-model\nagnostic query creation, we introduce a novel method for tagging database\nschema elements using UMLS concepts. To evaluate our system, called LeafAI, we\ncompared the capability of LeafAI to a human database programmer to identify\npatients who had been enrolled in 8 clinical trials conducted at our\ninstitution. We measured performance by the number of actual enrolled patients\nmatched by generated queries.\n Results: LeafAI matched a mean 43% of enrolled patients with 27,225 eligible\nacross 8 clinical trials, compared to 27% matched and 14,587 eligible in\nqueries by a human database programmer. The human programmer spent 26 total\nhours crafting queries compared to several minutes by LeafAI.\n Conclusions: Our work contributes a state-of-the-art data model-agnostic\nquery generation system capable of conditional reasoning using a knowledge\nbase. We demonstrate that LeafAI can rival an experienced human programmer in\nfinding patients eligible for clinical trials.\n","authors":["Nicholas J Dobbins","Bin Han","Weipeng Zhou","Kristine Lan","H. Nina Kim","Robert Harrington","Ozlem Uzuner","Meliha Yetisgen"],"pdf_url":"https://arxiv.org/pdf/2304.06203v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07395v1","updated":"2023-08-14T18:28:04Z","published":"2023-08-14T18:28:04Z","title":"Text Injection for Capitalization and Turn-Taking Prediction in Speech\n Models","summary":" Text injection for automatic speech recognition (ASR), wherein unpaired\ntext-only data is used to supplement paired audio-text data, has shown\npromising improvements for word error rate. This study examines the use of text\ninjection for auxiliary tasks, which are the non-ASR tasks often performed by\nan E2E model. In this work, we use joint end-to-end and internal language model\ntraining (JEIT) as our text injection algorithm to train an ASR model which\nperforms two auxiliary tasks. The first is capitalization, which is a\nde-normalization task. 
The second is turn-taking prediction, which attempts to\nidentify whether a user has completed their conversation turn in a digital\nassistant interaction. We show results demonstrating that our text injection\nmethod boosts capitalization performance for long-tail data, and improves\nturn-taking detection recall.\n","authors":["Shaan Bijwadia","Shuo-yiin Chang","Weiran Wang","Zhong Meng","Hao Zhang","Tara N. Sainath"],"pdf_url":"https://arxiv.org/pdf/2308.07395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07393v1","updated":"2023-08-14T18:26:27Z","published":"2023-08-14T18:26:27Z","title":"Using Text Injection to Improve Recognition of Personal Identifiers in\n Speech","summary":" Accurate recognition of specific categories, such as persons' names, dates or\nother identifiers is critical in many Automatic Speech Recognition (ASR)\napplications. As these categories represent personal information, ethical use\nof this data including collection, transcription, training and evaluation\ndemands special care. One way of ensuring the security and privacy of\nindividuals is to redact or eliminate Personally Identifiable Information (PII)\nfrom collection altogether. However, this results in ASR models that tend to\nhave lower recognition accuracy of these categories. We use text-injection to\nimprove the recognition of PII categories by including fake textual substitutes\nof PII categories in the training data using a text injection method. We\ndemonstrate substantial improvement to Recall of Names and Dates in medical\nnotes while improving overall WER. For alphanumeric digit sequences we show\nimprovements to Character Error Rate and Sentence Accuracy.\n","authors":["Yochai Blau","Rohan Agrawal","Lior Madmony","Gary Wang","Andrew Rosenberg","Zhehuai Chen","Zorik Gekhman","Genady Beryozkin","Parisa Haghani","Bhuvana Ramabhadran"],"pdf_url":"https://arxiv.org/pdf/2308.07393v1.pdf","comment":"Accepted to Interspeech 2023"},{"id":"http://arxiv.org/abs/2308.07213v1","updated":"2023-08-14T15:31:32Z","published":"2023-08-14T15:31:32Z","title":"Human-centered NLP Fact-checking: Co-Designing with Fact-checkers using\n Matchmaking for AI","summary":" A key challenge in professional fact-checking is its limited scalability in\nrelation to the magnitude of false information. While many Natural Language\nProcessing (NLP) tools have been proposed to enhance fact-checking efficiency\nand scalability, both academic research and fact-checking organizations report\nlimited adoption of such tooling due to insufficient alignment with\nfact-checker practices, values, and needs. To address this gap, we investigate\na co-design method, Matchmaking for AI, which facilitates fact-checkers,\ndesigners, and NLP researchers to collaboratively discover what fact-checker\nneeds should be addressed by technology and how. Our co-design sessions with 22\nprofessional fact-checkers yielded a set of 11 novel design ideas. They assist\nin information searching, processing, and writing tasks for efficient and\npersonalized fact-checking; help fact-checkers proactively prepare for future\nmisinformation; monitor their potential biases; and support internal\norganization collaboration. 
Our work offers implications for human-centered\nfact-checking research and practice and AI co-design research.\n","authors":["Houjiang Liu","Anubrata Das","Alexander Boltz","Didi Zhou","Daisy Pinaroc","Matthew Lease","Min Kyung Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08488v1","updated":"2023-08-14T08:19:24Z","published":"2023-08-14T08:19:24Z","title":"Improving Audio-Visual Speech Recognition by Lip-Subword Correlation\n Based Visual Pre-training and Cross-Modal Fusion Encoder","summary":" In recent research, slight performance improvement is observed from automatic\nspeech recognition systems to audio-visual speech recognition systems in the\nend-to-end framework with low-quality videos. Unmatching convergence rates and\nspecialized input representations between audio and visual modalities are\nconsidered to cause the problem. In this paper, we propose two novel techniques\nto improve audio-visual speech recognition (AVSR) under a pre-training and\nfine-tuning training framework. First, we explore the correlation between lip\nshapes and syllable-level subword units in Mandarin to establish good\nframe-level syllable boundaries from lip shapes. This enables accurate\nalignment of video and audio streams during visual model pre-training and\ncross-modal fusion. Next, we propose an audio-guided cross-modal fusion encoder\n(CMFE) neural network to utilize main training parameters for multiple\ncross-modal attention layers to make full use of modality complementarity.\nExperiments on the MISP2021-AVSR data set show the effectiveness of the two\nproposed techniques. Together, using only a relatively small amount of training\ndata, the final system achieves better performances than state-of-the-art\nsystems with more complex front-ends and back-ends.\n","authors":["Yusheng Dai","Hang Chen","Jun Du","Xiaofei Ding","Ning Ding","Feijun Jiang","Chin-Hui Lee"],"pdf_url":"https://arxiv.org/pdf/2308.08488v1.pdf","comment":"6 pages, 2 figures, published in ICME2023"},{"id":"http://arxiv.org/abs/2308.07937v1","updated":"2023-08-14T03:17:24Z","published":"2023-08-14T03:17:24Z","title":"Automated Testing and Improvement of Named Entity Recognition Systems","summary":" Named entity recognition (NER) systems have seen rapid progress in recent\nyears due to the development of deep neural networks. These systems are widely\nused in various natural language processing applications, such as information\nextraction, question answering, and sentiment analysis. However, the complexity\nand intractability of deep neural networks can make NER systems unreliable in\ncertain circumstances, resulting in incorrect predictions. For example, NER\nsystems may misidentify female names as chemicals or fail to recognize the\nnames of minority groups, leading to user dissatisfaction. To tackle this\nproblem, we introduce TIN, a novel, widely applicable approach for\nautomatically testing and repairing various NER systems. The key idea for\nautomated testing is that the NER predictions of the same named entities under\nsimilar contexts should be identical. The core idea for automated repairing is\nthat similar named entities should have the same NER prediction under the same\ncontext. We use TIN to test two SOTA NER models and two commercial NER APIs,\ni.e., Azure NER and AWS NER. 
We manually verify 784 of the suspicious issues\nreported by TIN and find that 702 are erroneous issues, leading to high\nprecision (85.0%-93.4%) across four categories of NER errors: omission,\nover-labeling, incorrect category, and range error. For automated repairing,\nTIN achieves a high error reduction rate (26.8%-50.6%) over the four systems\nunder test, which successfully repairs 1,056 out of the 1,877 reported NER\nerrors.\n","authors":["Boxi Yu","Yiyan Hu","Qiuyang Mang","Wenhan Hu","Pinjia He"],"pdf_url":"https://arxiv.org/pdf/2308.07937v1.pdf","comment":"Accepted by ESEC/FSE'23"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.07316v1","updated":"2023-08-14T17:59:31Z","published":"2023-08-14T17:59:31Z","title":"Jurassic World Remake: Bringing Ancient Fossils Back to Life via\n Zero-Shot Long Image-to-Image Translation","summary":" With a strong understanding of the target domain from natural language, we\nproduce promising results in translating across large domain gaps and bringing\nskeletons back to life. In this work, we use text-guided latent diffusion\nmodels for zero-shot image-to-image translation (I2I) across large domain gaps\n(longI2I), where large amounts of new visual features and new geometry need to\nbe generated to enter the target domain. Being able to perform translations\nacross large domain gaps has a wide variety of real-world applications in\ncriminology, astrology, environmental conservation, and paleontology. In this\nwork, we introduce a new task Skull2Animal for translating between skulls and\nliving animals. On this task, we find that unguided Generative Adversarial\nNetworks (GANs) are not capable of translating across large domain gaps.\nInstead of these traditional I2I methods, we explore the use of guided\ndiffusion and image editing models and provide a new benchmark model,\nRevive-2I, capable of performing zero-shot I2I via text-prompting latent\ndiffusion models. We find that guidance is necessary for longI2I because, to\nbridge the large domain gap, prior knowledge about the target domain is needed.\nIn addition, we find that prompting provides the best and most scalable\ninformation about the target domain as classifier-guided diffusion models\nrequire retraining for specific use cases and lack stronger constraints on the\ntarget domain because of the wide variety of images they are trained on.\n","authors":["Alexander Martin","Haitian Zheng","Jie An","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2308.07316v1.pdf","comment":"9 pages, 10 figures, ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.07314v1","updated":"2023-08-14T17:58:33Z","published":"2023-08-14T17:58:33Z","title":"Dual Associated Encoder for Face Restoration","summary":" Restoring facial details from low-quality (LQ) images has remained a\nchallenging problem due to its ill-posedness induced by various degradations in\nthe wild. The existing codebook prior mitigates the ill-posedness by leveraging\nan autoencoder and learned codebook of high-quality (HQ) features, achieving\nremarkable quality. However, existing approaches in this paradigm frequently\ndepend on a single encoder pre-trained on HQ data for restoring HQ images,\ndisregarding the domain gap between LQ and HQ images. As a result, the encoding\nof LQ inputs may be insufficient, resulting in suboptimal performance. To\ntackle this problem, we propose a novel dual-branch framework named DAEFR. 
Our\nmethod introduces an auxiliary LQ branch that extracts crucial information from\nthe LQ inputs. Additionally, we incorporate association training to promote\neffective synergy between the two branches, enhancing code prediction and\noutput quality. We evaluate the effectiveness of DAEFR on both synthetic and\nreal-world datasets, demonstrating its superior performance in restoring facial\ndetails.\n","authors":["Yu-Ju Tsai","Yu-Lun Liu","Lu Qi","Kelvin C. K. Chan","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2308.07314v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2308.07313v1","updated":"2023-08-14T17:58:04Z","published":"2023-08-14T17:58:04Z","title":"Group Pose: A Simple Baseline for End-to-End Multi-person Pose\n Estimation","summary":" In this paper, we study the problem of end-to-end multi-person pose\nestimation. State-of-the-art solutions adopt the DETR-like framework, and\nmainly develop the complex decoder, e.g., regarding pose estimation as keypoint\nbox detection and combining with human detection in ED-Pose, hierarchically\npredicting with pose decoder and joint (keypoint) decoder in PETR. We present a\nsimple yet effective transformer approach, named Group Pose. We simply regard\n$K$-keypoint pose estimation as predicting a set of $N\\times K$ keypoint\npositions, each from a keypoint query, as well as representing each pose with\nan instance query for scoring $N$ pose predictions. Motivated by the intuition\nthat the interaction, among across-instance queries of different types, is not\ndirectly helpful, we make a simple modification to decoder self-attention. We\nreplace single self-attention over all the $N\\times(K+1)$ queries with two\nsubsequent group self-attentions: (i) $N$ within-instance self-attention, with\neach over $K$ keypoint queries and one instance query, and (ii) $(K+1)$\nsame-type across-instance self-attention, each over $N$ queries of the same\ntype. The resulting decoder removes the interaction among across-instance\ntype-different queries, easing the optimization and thus improving the\nperformance. Experimental results on MS COCO and CrowdPose show that our\napproach without human box supervision is superior to previous methods with\ncomplex decoders, and even is slightly better than ED-Pose that uses human box\nsupervision. $\\href{https://github.com/Michel-liu/GroupPose-Paddle}{\\rm\nPaddle}$ and $\\href{https://github.com/Michel-liu/GroupPose}{\\rm PyTorch}$ code\nare available.\n","authors":["Huan Liu","Qiang Chen","Zichang Tan","Jiang-Jiang Liu","Jian Wang","Xiangbo Su","Xiaolong Li","Kun Yao","Junyu Han","Errui Ding","Yao Zhao","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07313v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07301v1","updated":"2023-08-14T17:39:44Z","published":"2023-08-14T17:39:44Z","title":"A Unified Masked Autoencoder with Patchified Skeletons for Motion\n Synthesis","summary":" The synthesis of human motion has traditionally been addressed through\ntask-dependent models that focus on specific challenges, such as predicting\nfuture motions or filling in intermediate poses conditioned on known key-poses.\nIn this paper, we present a novel task-independent model called UNIMASK-M,\nwhich can effectively address these challenges using a unified architecture.\nOur model obtains comparable or better performance than the state-of-the-art in\neach field. 
Inspired by Vision Transformers (ViTs), our UNIMASK-M model\ndecomposes a human pose into body parts to leverage the spatio-temporal\nrelationships existing in human motion. Moreover, we reformulate various\npose-conditioned motion synthesis tasks as a reconstruction problem with\ndifferent masking patterns given as input. By explicitly informing our model\nabout the masked joints, our UNIMASK-M becomes more robust to occlusions.\nExperimental results show that our model successfully forecasts human motion on\nthe Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion\ninbetweening on the LaFAN1 dataset, particularly in long transition periods.\nMore information can be found on the project website\nhttps://sites.google.com/view/estevevallsmascaro/publications/unimask-m.\n","authors":["Esteve Valls Mascaro","Hyemin Ahn","Dongheui Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07298v1","updated":"2023-08-14T17:36:39Z","published":"2023-08-14T17:36:39Z","title":"Accurate Eye Tracking from Dense 3D Surface Reconstructions using\n Single-Shot Deflectometry","summary":" Eye-tracking plays a crucial role in the development of virtual reality\ndevices, neuroscience research, and psychology. Despite its significance in\nnumerous applications, achieving an accurate, robust, and fast eye-tracking\nsolution remains a considerable challenge for current state-of-the-art methods.\nWhile existing reflection-based techniques (e.g., \"glint tracking\") are\nconsidered the most accurate, their performance is limited by their reliance on\nsparse 3D surface data acquired solely from the cornea surface. In this paper,\nwe rethink the way how specular reflections can be used for eye tracking: We\npropose a novel method for accurate and fast evaluation of the gaze direction\nthat exploits teachings from single-shot phase-measuring-deflectometry (PMD).\nIn contrast to state-of-the-art reflection-based methods, our method acquires\ndense 3D surface information of both cornea and sclera within only one single\ncamera frame (single-shot). Improvements in acquired reflection surface\npoints(\"glints\") of factors $>3300 \\times$ are easily achievable. We show the\nfeasibility of our approach with experimentally evaluated gaze errors of only\n$\\leq 0.25^\\circ$ demonstrating a significant improvement over the current\nstate-of-the-art.\n","authors":["Jiazhang Wang","Tianfu Wang","Bingjie Xu","Oliver Cossairt And Florian Willomitzer"],"pdf_url":"https://arxiv.org/pdf/2308.07298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07279v1","updated":"2023-08-14T17:11:17Z","published":"2023-08-14T17:11:17Z","title":"A Robust Approach Towards Distinguishing Natural and Computer Generated\n Images using Multi-Colorspace fused and Enriched Vision Transformer","summary":" The works in literature classifying natural and computer generated images are\nmostly designed as binary tasks either considering natural images versus\ncomputer graphics images only or natural images versus GAN generated images\nonly, but not natural images versus both classes of the generated images. 
Also,\neven though this forensic classification task of distinguishing natural and\ncomputer generated images gets the support of the new convolutional neural\nnetworks and transformer based architectures that can give remarkable\nclassification accuracies, they are seen to fail over the images that have\nundergone some post-processing operations usually performed to deceive the\nforensic algorithms, such as JPEG compression, gaussian noise, etc. This work\nproposes a robust approach towards distinguishing natural and computer\ngenerated images including both, computer graphics and GAN generated images\nusing a fusion of two vision transformers where each of the transformer\nnetworks operates in different color spaces, one in RGB and the other in YCbCr\ncolor space. The proposed approach achieves high performance gain when compared\nto a set of baselines, and also achieves higher robustness and generalizability\nthan the baselines. The features of the proposed model when visualized are seen\nto obtain higher separability for the classes than the input image features and\nthe baseline features. This work also studies the attention map visualizations\nof the networks of the fused model and observes that the proposed methodology\ncan capture more image information relevant to the forensic task of classifying\nnatural and generated images.\n","authors":["Manjary P Gangan","Anoop Kadan","Lajish V L"],"pdf_url":"https://arxiv.org/pdf/2308.07279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07269v1","updated":"2023-08-14T16:52:42Z","published":"2023-08-14T16:52:42Z","title":"EasyEdit: An Easy-to-use Knowledge Editing Framework for Large Language\n Models","summary":" Large Language Models (LLMs) usually suffer from knowledge cutoff or fallacy\nissues, which means they are unaware of unseen events or generate text with\nincorrect facts owing to the outdated/noisy data. To this end, many knowledge\nediting approaches for LLMs have emerged -- aiming to subtly inject/edit\nupdated knowledge or adjust undesired behavior while minimizing the impact on\nunrelated inputs. Nevertheless, due to significant differences among various\nknowledge editing methods and the variations in task setups, there is no\nstandard implementation framework available for the community, which hinders\npractitioners to apply knowledge editing to applications. To address these\nissues, we propose EasyEdit, an easy-to-use knowledge editing framework for\nLLMs. It supports various cutting-edge knowledge editing approaches and can be\nreadily apply to many well-known LLMs such as T5, GPT-J, LlaMA, etc.\nEmpirically, we report the knowledge editing results on LlaMA-2 with EasyEdit,\ndemonstrating that knowledge editing surpasses traditional fine-tuning in terms\nof reliability and generalization. We have released the source code on GitHub\nat https://github.com/zjunlp/EasyEdit, along with Google Colab tutorials and\ncomprehensive documentation for beginners to get started. 
Besides, we present\nan online system for real-time knowledge editing, and a demo video at\nhttp://knowlm.zjukg.cn/easyedit.mp4.\n","authors":["Peng Wang","Ningyu Zhang","Xin Xie","Yunzhi Yao","Bozhong Tian","Mengru Wang","Zekun Xi","Siyuan Cheng","Kangwei Liu","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07269v1.pdf","comment":"The project website is https://github.com/zjunlp/EasyEdit"},{"id":"http://arxiv.org/abs/2308.07267v1","updated":"2023-08-14T16:50:27Z","published":"2023-08-14T16:50:27Z","title":"Diving with Penguins: Detecting Penguins and their Prey in Animal-borne\n Underwater Videos via Deep Learning","summary":" African penguins (Spheniscus demersus) are an endangered species. Little is\nknown regarding their underwater hunting strategies and associated predation\nsuccess rates, yet this is essential for guiding conservation. Modern\nbio-logging technology has the potential to provide valuable insights, but\nmanually analysing large amounts of data from animal-borne video recorders\n(AVRs) is time-consuming. In this paper, we publish an animal-borne underwater\nvideo dataset of penguins and introduce a ready-to-deploy deep learning system\ncapable of robustly detecting penguins (mAP50@98.0%) and also instances of fish\n(mAP50@73.3%). We note that the detectors benefit explicitly from air-bubble\nlearning to improve accuracy. Extending this detector towards a dual-stream\nbehaviour recognition network, we also provide the first results for\nidentifying predation behaviour in penguin underwater videos. Whilst results\nare promising, further work is required for useful applicability of predation\nbehaviour detection in field scenarios. In summary, we provide a highly\nreliable underwater penguin detector, a fish detector, and a valuable first\nattempt towards an automated visual detection of complex behaviours in a marine\npredator. We publish the networks, the DivingWithPenguins video dataset,\nannotations, splits, and weights for full reproducibility and immediate\nusability by practitioners.\n","authors":["Kejia Zhang","Mingyu Yang","Stephen D. J. Lang","Alistair M. McInnes","Richard B. Sherley","Tilo Burghardt"],"pdf_url":"https://arxiv.org/pdf/2308.07267v1.pdf","comment":"5 pages, 5 figures, 4 Tables, \"3rd International Workshop on Camera\n traps, AI, and Ecology (CamTrapAI)\""},{"id":"http://arxiv.org/abs/2308.07264v1","updated":"2023-08-14T16:48:57Z","published":"2023-08-14T16:48:57Z","title":"Efficient Real-time Smoke Filtration with 3D LiDAR for Search and Rescue\n with Autonomous Heterogeneous Robotic Systems","summary":" Search and Rescue (SAR) missions in harsh and unstructured Sub-Terranean\n(Sub-T) environments in the presence of aerosol particles have recently become\nthe main focus in the field of robotics. Aerosol particles such as smoke and\ndust directly affect the performance of any mobile robotic platform due to\ntheir reliance on their onboard perception systems for autonomous navigation\nand localization in Global Navigation Satellite System (GNSS)-denied\nenvironments. Although obstacle avoidance and object detection algorithms are\nrobust to the presence of noise to some degree, their performance directly\nrelies on the quality of captured data by onboard sensors such as Light\nDetection And Ranging (LiDAR) and camera. 
Thus, this paper proposes a novel\nmodular agnostic filtration pipeline based on intensity and spatial information\nsuch as local point density for removal of detected smoke particles from Point\nCloud (PCL) prior to its utilization for collision detection. Furthermore, the\nefficacy of the proposed framework in the presence of smoke during multiple\nfrontier exploration missions is investigated while the experimental results\nare presented to facilitate comparison with other methodologies and their\ncomputational impact. This provides valuable insight to the research community\nfor better utilization of filtration schemes based on available computation\nresources while considering the safe autonomous navigation of mobile robots.\n","authors":["Alexander Kyuroson","Anton Koval","George Nikolakopoulos"],"pdf_url":"https://arxiv.org/pdf/2308.07264v1.pdf","comment":"Accepted in the 49th Annual Conference of the IEEE Industrial\n Electronics Society [IECON2023]"},{"id":"http://arxiv.org/abs/2308.07251v1","updated":"2023-08-14T16:38:13Z","published":"2023-08-14T16:38:13Z","title":"Large-kernel Attention for Efficient and Robust Brain Lesion\n Segmentation","summary":" Vision transformers are effective deep learning models for vision tasks,\nincluding medical image segmentation. However, they lack efficiency and\ntranslational invariance, unlike convolutional neural networks (CNNs). To model\nlong-range interactions in 3D brain lesion segmentation, we propose an\nall-convolutional transformer block variant of the U-Net architecture. We\ndemonstrate that our model provides the greatest compromise in three factors:\nperformance competitive with the state-of-the-art; parameter efficiency of a\nCNN; and the favourable inductive biases of a transformer. Our public\nimplementation is available at https://github.com/liamchalcroft/MDUNet .\n","authors":["Liam Chalcroft","Ruben Lourenço Pereira","Mikael Brudfors","Andrew S. Kayser","Mark D'Esposito","Cathy J. Price","Ioannis Pappas","John Ashburner"],"pdf_url":"https://arxiv.org/pdf/2308.07251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03202v2","updated":"2023-08-14T16:33:43Z","published":"2023-08-06T20:19:06Z","title":"Source-free Domain Adaptive Human Pose Estimation","summary":" Human Pose Estimation (HPE) is widely used in various fields, including\nmotion analysis, healthcare, and virtual reality. However, the great expenses\nof labeled real-world datasets present a significant challenge for HPE. To\novercome this, one approach is to train HPE models on synthetic datasets and\nthen perform domain adaptation (DA) on real-world data. Unfortunately, existing\nDA methods for HPE neglect data privacy and security by using both source and\ntarget data in the adaptation process. To this end, we propose a new task,\nnamed source-free domain adaptive HPE, which aims to address the challenges of\ncross-domain learning of HPE without access to source data during the\nadaptation process. We further propose a novel framework that consists of three\nmodels: source model, intermediate model, and target model, which explores the\ntask from both source-protect and target-relevant perspectives. The\nsource-protect module preserves source information more effectively while\nresisting noise, and the target-relevant module reduces the sparsity of spatial\nrepresentations by building a novel spatial probability space, and\npose-specific contrastive learning and information maximization are proposed on\nthe basis of this space. 
Comprehensive experiments on several domain adaptive\nHPE benchmarks show that the proposed method outperforms existing approaches by\na considerable margin.\n","authors":["Qucheng Peng","Ce Zheng","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03202v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07243v1","updated":"2023-08-14T16:24:35Z","published":"2023-08-14T16:24:35Z","title":"AAFACE: Attribute-aware Attentional Network for Face Recognition","summary":" In this paper, we present a new multi-branch neural network that\nsimultaneously performs soft biometric (SB) prediction as an auxiliary modality\nand face recognition (FR) as the main task. Our proposed network named AAFace\nutilizes SB attributes to enhance the discriminative ability of FR\nrepresentation. To achieve this goal, we propose an attribute-aware attentional\nintegration (AAI) module to perform weighted integration of FR with SB feature\nmaps. Our proposed AAI module is not only fully context-aware but also capable\nof learning complex relationships between input features by means of the\nsequential multi-scale channel and spatial sub-modules. Experimental results\nverify the superiority of our proposed network compared with the\nstate-of-the-art (SoTA) SB prediction and FR methods.\n","authors":["Niloufar Alipour Talemi","Hossein Kashiani","Sahar Rahimi Malakshan","Mohammad Saeed Ebrahimi Saadabadi","Nima Najafzadeh","Mohammad Akyash","Nasser M. Nasrabadi"],"pdf_url":"https://arxiv.org/pdf/2308.07243v1.pdf","comment":"Accepted to $30^{th}$ IEEE International Conference on Image\n Processing (ICIP 2023) as an oral presentation"},{"id":"http://arxiv.org/abs/2308.07234v1","updated":"2023-08-14T16:17:13Z","published":"2023-08-14T16:17:13Z","title":"UniWorld: Autonomous Driving Pre-training via World Models","summary":" In this paper, we draw inspiration from Alberto Elfes' pioneering work in\n1989, where he introduced the concept of the occupancy grid as World Models for\nrobots. We imbue the robot with a spatial-temporal world model, termed\nUniWorld, to perceive its surroundings and predict the future behavior of other\nparticipants. UniWorld involves initially predicting 4D geometric occupancy as\nthe World Models for foundational stage and subsequently fine-tuning on\ndownstream tasks. UniWorld can estimate missing information concerning the\nworld state and predict plausible future states of the world. Besides,\nUniWorld's pre-training process is label-free, enabling the utilization of\nmassive amounts of image-LiDAR pairs to build a Foundational Model.The proposed\nunified pre-training framework demonstrates promising results in key tasks such\nas motion prediction, multi-camera 3D object detection, and surrounding\nsemantic scene completion. When compared to monocular pre-training methods on\nthe nuScenes dataset, UniWorld shows a significant improvement of about 1.5% in\nIoU for motion prediction, 2.0% in mAP and 2.0% in NDS for multi-camera 3D\nobject detection, as well as a 3% increase in mIoU for surrounding semantic\nscene completion. By adopting our unified pre-training method, a 25% reduction\nin 3D training annotation costs can be achieved, offering significant practical\nvalue for the implementation of real-world autonomous driving. Codes are\npublicly available at https://github.com/chaytonmin/UniWorld.\n","authors":["Chen Min","Dawei Zhao","Liang Xiao","Yiming Nie","Bin Dai"],"pdf_url":"https://arxiv.org/pdf/2308.07234v1.pdf","comment":"8 pages, 5 figures. 
arXiv admin note: substantial text overlap with\n arXiv:2305.18829"},{"id":"http://arxiv.org/abs/2308.07228v1","updated":"2023-08-14T16:04:53Z","published":"2023-08-14T16:04:53Z","title":"RestoreFormer++: Towards Real-World Blind Face Restoration from\n Undegraded Key-Value Pairs","summary":" Blind face restoration aims at recovering high-quality face images from those\nwith unknown degradations. Current algorithms mainly introduce priors to\ncomplement high-quality details and achieve impressive progress. However, most\nof these algorithms ignore abundant contextual information in the face and its\ninterplay with the priors, leading to sub-optimal performance. Moreover, they\npay less attention to the gap between the synthetic and real-world scenarios,\nlimiting the robustness and generalization to real-world applications. In this\nwork, we propose RestoreFormer++, which on the one hand introduces\nfully-spatial attention mechanisms to model the contextual information and the\ninterplay with the priors, and on the other hand, explores an extending\ndegrading model to help generate more realistic degraded face images to\nalleviate the synthetic-to-real-world gap. Compared with current algorithms,\nRestoreFormer++ has several crucial benefits. First, instead of using a\nmulti-head self-attention mechanism like the traditional visual transformer, we\nintroduce multi-head cross-attention over multi-scale features to fully explore\nspatial interactions between corrupted information and high-quality priors. In\nthis way, it can facilitate RestoreFormer++ to restore face images with higher\nrealness and fidelity. Second, in contrast to the recognition-oriented\ndictionary, we learn a reconstruction-oriented dictionary as priors, which\ncontains more diverse high-quality facial details and better accords with the\nrestoration target. Third, we introduce an extending degrading model that\ncontains more realistic degraded scenarios for training data synthesizing, and\nthus helps to enhance the robustness and generalization of our RestoreFormer++\nmodel. Extensive experiments show that RestoreFormer++ outperforms\nstate-of-the-art algorithms on both synthetic and real-world datasets.\n","authors":["Zhouxia Wang","Jiawei Zhang","Tianshui Chen","Wenping Wang","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2308.07228v1.pdf","comment":"Submitted to TPAMI. An extension of RestoreFormer"},{"id":"http://arxiv.org/abs/2308.07225v1","updated":"2023-08-14T15:57:42Z","published":"2023-08-14T15:57:42Z","title":"DS-Depth: Dynamic and Static Depth Estimation via a Fusion Cost Volume","summary":" Self-supervised monocular depth estimation methods typically rely on the\nreprojection error to capture geometric relationships between successive frames\nin static environments. However, this assumption does not hold in dynamic\nobjects in scenarios, leading to errors during the view synthesis stage, such\nas feature mismatch and occlusion, which can significantly reduce the accuracy\nof the generated depth maps. To address this problem, we propose a novel\ndynamic cost volume that exploits residual optical flow to describe moving\nobjects, improving incorrectly occluded regions in static cost volumes used in\nprevious work. Nevertheless, the dynamic cost volume inevitably generates extra\nocclusions and noise, thus we alleviate this by designing a fusion module that\nmakes static and dynamic cost volumes compensate for each other. 
In other\nwords, occlusion from the static volume is refined by the dynamic volume, and\nincorrect information from the dynamic volume is eliminated by the static\nvolume. Furthermore, we propose a pyramid distillation loss to reduce\nphotometric error inaccuracy at low resolutions and an adaptive photometric\nerror loss to alleviate the flow direction of the large gradient in the\nocclusion regions. We conducted extensive experiments on the KITTI and\nCityscapes datasets, and the results demonstrate that our model outperforms\npreviously published baselines for self-supervised monocular depth estimation.\n","authors":["Xingyu Miao","Yang Bai","Haoran Duan","Yawen Huang","Fan Wan","Xinxing Xu","Yang Long","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.07225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07223v1","updated":"2023-08-14T15:49:19Z","published":"2023-08-14T15:49:19Z","title":"Distance Matters For Improving Performance Estimation Under Covariate\n Shift","summary":" Performance estimation under covariate shift is a crucial component of safe\nAI model deployment, especially for sensitive use-cases. Recently, several\nsolutions were proposed to tackle this problem, most leveraging model\npredictions or softmax confidence to derive accuracy estimates. However, under\ndataset shifts, confidence scores may become ill-calibrated if samples are too\nfar from the training distribution. In this work, we show that taking into\naccount distances of test samples to their expected training distribution can\nsignificantly improve performance estimation under covariate shift. Precisely,\nwe introduce a \"distance-check\" to flag samples that lie too far from the\nexpected distribution, to avoid relying on their untrustworthy model outputs in\nthe accuracy estimation step. We demonstrate the effectiveness of this method\non 13 image classification tasks, across a wide-range of natural and synthetic\ndistribution shifts and hundreds of models, with a median relative MAE\nimprovement of 27% over the best baseline across all tasks, and SOTA\nperformance on 10 out of 13 tasks. Our code is publicly available at\nhttps://github.com/melanibe/distance_matters_performance_estimation.\n","authors":["Mélanie Roschewitz","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2308.07223v1.pdf","comment":"Accepted to ICCV Workshop on Uncertainty Quantification for Computer\n Vision 2023"},{"id":"http://arxiv.org/abs/2308.03900v2","updated":"2023-08-14T15:37:10Z","published":"2023-08-07T20:23:39Z","title":"Developability Approximation for Neural Implicits through Rank\n Minimization","summary":" Developability refers to the process of creating a surface without any\ntearing or shearing from a two-dimensional plane. It finds practical\napplications in the fabrication industry. An essential characteristic of a\ndevelopable 3D surface is its zero Gaussian curvature, which means that either\none or both of the principal curvatures are zero. This paper introduces a\nmethod for reconstructing an approximate developable surface from a neural\nimplicit surface. The central idea of our method involves incorporating a\nregularization term that operates on the second-order derivatives of the neural\nimplicits, effectively promoting zero Gaussian curvature. Implicit surfaces\noffer the advantage of smoother deformation with infinite resolution,\novercoming the high polygonal constraints of state-of-the-art methods using\ndiscrete representations. 
We draw inspiration from the properties of surface\ncurvature and employ rank minimization techniques derived from compressed\nsensing. Experimental results on both developable and non-developable surfaces,\nincluding those affected by noise, validate the generalizability of our method.\n","authors":["Pratheba Selvaraju"],"pdf_url":"https://arxiv.org/pdf/2308.03900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07214v1","updated":"2023-08-14T15:34:22Z","published":"2023-08-14T15:34:22Z","title":"Automated Ensemble-Based Segmentation of Adult Brain Tumors: A Novel\n Approach Using the BraTS AFRICA Challenge Data","summary":" Brain tumors, particularly glioblastoma, continue to challenge medical\ndiagnostics and treatments globally. This paper explores the application of\ndeep learning to multi-modality magnetic resonance imaging (MRI) data for\nenhanced brain tumor segmentation precision in the Sub-Saharan Africa patient\npopulation. We introduce an ensemble method that comprises eleven unique\nvariations based on three core architectures: UNet3D, ONet3D, SphereNet3D and\nmodified loss functions. The study emphasizes the need for both age- and\npopulation-based segmentation models, to fully account for the complexities in\nthe brain. Our findings reveal that the ensemble approach, combining different\narchitectures, outperforms single models, leading to improved evaluation\nmetrics. Specifically, the results exhibit Dice scores of 0.82, 0.82, and 0.87\nfor enhancing tumor, tumor core, and whole tumor labels respectively. These\nresults underline the potential of tailored deep learning techniques in\nprecisely segmenting brain tumors and lay groundwork for future work to\nfine-tune models and assess performance across different brain regions.\n","authors":["Chiranjeewee Prasad Koirala","Sovesh Mohapatra","Advait Gosai","Gottfried Schlaug"],"pdf_url":"https://arxiv.org/pdf/2308.07214v1.pdf","comment":"3 figs and 3 tables"},{"id":"http://arxiv.org/abs/2308.07212v1","updated":"2023-08-14T15:29:32Z","published":"2023-08-14T15:29:32Z","title":"Automated Ensemble-Based Segmentation of Pediatric Brain Tumors: A Novel\n Approach Using the CBTN-CONNECT-ASNR-MICCAI BraTS-PEDs 2023 Challenge Data","summary":" Brain tumors remain a critical global health challenge, necessitating\nadvancements in diagnostic techniques and treatment methodologies. In response\nto the growing need for age-specific segmentation models, particularly for\npediatric patients, this study explores the deployment of deep learning\ntechniques using magnetic resonance imaging (MRI) modalities. By introducing a\nnovel ensemble approach using ONet and modified versions of UNet, coupled with\ninnovative loss functions, this study achieves a precise segmentation model for\nthe BraTS-PEDs 2023 Challenge. Data augmentation, including both single and\ncomposite transformations, ensures model robustness and accuracy across\ndifferent scanning protocols. The ensemble strategy, integrating the ONet and\nUNet models, shows greater effectiveness in capturing specific features and\nmodeling diverse aspects of the MRI images which result in lesion_wise dice\nscores of 0.52, 0.72 and 0.78 for enhancing tumor, tumor core and whole tumor\nlabels respectively. Visual comparisons further confirm the superiority of the\nensemble method in accurate tumor region coverage. 
The results indicate that\nthis advanced ensemble approach, building upon the unique strengths of\nindividual models, offers promising prospects for enhanced diagnostic accuracy\nand effective treatment planning for brain tumors in pediatric brains.\n","authors":["Shashidhar Reddy Javaji","Sovesh Mohapatra","Advait Gosai","Gottfried Schlaug"],"pdf_url":"https://arxiv.org/pdf/2308.07212v1.pdf","comment":"3 Figs, 3 Tables"},{"id":"http://arxiv.org/abs/2308.07209v1","updated":"2023-08-14T15:25:07Z","published":"2023-08-14T15:25:07Z","title":"Unified Data-Free Compression: Pruning and Quantization without\n Fine-Tuning","summary":" Structured pruning and quantization are promising approaches for reducing the\ninference time and memory footprint of neural networks. However, most existing\nmethods require the original training dataset to fine-tune the model. This not\nonly brings heavy resource consumption but also is not possible for\napplications with sensitive or proprietary data due to privacy and security\nconcerns. Therefore, a few data-free methods are proposed to address this\nproblem, but they perform data-free pruning and quantization separately, which\ndoes not explore the complementarity of pruning and quantization. In this\npaper, we propose a novel framework named Unified Data-Free Compression(UDFC),\nwhich performs pruning and quantization simultaneously without any data and\nfine-tuning process. Specifically, UDFC starts with the assumption that the\npartial information of a damaged(e.g., pruned or quantized) channel can be\npreserved by a linear combination of other channels, and then derives the\nreconstruction form from the assumption to restore the information loss due to\ncompression. Finally, we formulate the reconstruction error between the\noriginal network and its compressed network, and theoretically deduce the\nclosed-form solution. We evaluate the UDFC on the large-scale image\nclassification task and obtain significant improvements over various network\narchitectures and compression methods. 
For example, we achieve a 20.54%\naccuracy improvement on the ImageNet dataset compared to the SOTA method with 30%\npruning ratio and 6-bit quantization on ResNet-34.\n","authors":["Shipeng Bai","Jun Chen","Xintian Shen","Yixuan Qian","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.07209v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.07207v1","updated":"2023-08-14T15:24:44Z","published":"2023-08-14T15:24:44Z","title":"FOLT: Fast Multiple Object Tracking from UAV-captured Videos Based on\n Optical Flow","summary":" Multiple object tracking (MOT) has been successfully investigated in computer\nvision.\n However, MOT for the videos captured by unmanned aerial vehicles (UAV) is\nstill challenging due to small object size, blurred object appearance, and very\nlarge and/or irregular motion in both ground objects and UAV platforms.\n In this paper, we propose FOLT to mitigate these problems and reach fast and\naccurate MOT in UAV view.\n Aiming at the speed-accuracy trade-off, FOLT adopts a modern detector and\nlight-weight optical flow extractor to extract object detection features and\nmotion features at a minimum cost.\n Given the extracted flow, the flow-guided feature augmentation is designed to\naugment the object detection feature based on its optical flow, which improves\nthe detection of small objects.\n Then the flow-guided motion prediction is also proposed to predict the\nobject's position in the next frame, which improves the tracking performance of\nobjects with very large displacements between adjacent frames.\n Finally, the tracker matches the detected objects and predicted objects using\na spatial matching scheme to generate tracks for every object.\n Experiments on Visdrone and UAVDT datasets show that our proposed model can\nsuccessfully track small objects with large and irregular motion and outperform\nexisting state-of-the-art methods in UAV-MOT tasks.\n","authors":["Mufeng Yao","Jiaqi Wang","Jinlong Peng","Mingmin Chi","Chao Liu"],"pdf_url":"https://arxiv.org/pdf/2308.07207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09571v2","updated":"2023-08-14T15:15:46Z","published":"2023-04-19T11:19:10Z","title":"SLIC: Large Receptive Field Learning with Self-Conditioned Adaptability\n for Learned Image Compression","summary":" Recently, transformers are trending as replacements for CNNs in vision tasks,\nincluding compression. This trend compels us to question the inherent\nlimitations of CNNs compared to transformers and to explore if CNNs can be\nenhanced to achieve the same or even better performance than transformers. We\nwant to design a pure CNN-based model for compression, as most devices are\nwell optimized for CNNs. In our analysis, we find that the key strengths of\ntransformers lie in their dynamic weights and large receptive fields. To enable\nCNNs with such properties, we propose a novel transform module with large\nreceptive field learning and self-conditioned adaptability for learned image\ncompression, named SLIC. Specifically, we enlarge the receptive field of\ndepth-wise convolution with suitable complexity and generate the weights\naccording to given conditions. In addition, we also investigate the\nself-conditioned factor for channels. To prove the effectiveness of our\nproposed transform module, we equip it with existing entropy models ChARM,\nSCCTX, and SWAtten and we obtain models SLIC-ChARM, SLIC-SCCTX, and\nSLIC-SWAtten. 
Extensive experiments demonstrate our SLIC-ChARM, SLIC-SCCTX, and\nSLIC-SWAtten have significant improvements over corresponding baselines and\nachieve SOTA performances with suitable complexity on 5 test datasets (Kodak,\nTecnick, CLIC 20, CLIC 21, JPEGAI). Code will be available at\nhttps://github.com/JiangWeibeta/SLIC.\n","authors":["Wei Jiang","Peirong Ning","Jiayu Yang","Yongqi Zhai","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2304.09571v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2308.07202v1","updated":"2023-08-14T15:14:37Z","published":"2023-08-14T15:14:37Z","title":"Towards Robust Real-Time Scene Text Detection: From Semantic to Instance\n Representation Learning","summary":" Due to the flexible representation of arbitrary-shaped scene text and simple\npipeline, bottom-up segmentation-based methods begin to be mainstream in\nreal-time scene text detection. Despite great progress, these methods show\ndeficiencies in robustness and still suffer from false positives and instance\nadhesion. Different from existing methods which integrate multiple-granularity\nfeatures or multiple outputs, we resort to the perspective of representation\nlearning in which auxiliary tasks are utilized to enable the encoder to jointly\nlearn robust features with the main task of per-pixel classification during\noptimization. For semantic representation learning, we propose global-dense\nsemantic contrast (GDSC), in which a vector is extracted for global semantic\nrepresentation, then used to perform element-wise contrast with the dense grid\nfeatures. To learn instance-aware representation, we propose to combine\ntop-down modeling (TDM) with the bottom-up framework to provide implicit\ninstance-level clues for the encoder. With the proposed GDSC and TDM, the\nencoder network learns stronger representation without introducing any\nparameters and computations during inference. Equipped with a very light\ndecoder, the detector can achieve more robust real-time scene text detection.\nExperimental results on four public datasets show that the proposed method can\noutperform or be comparable to the state-of-the-art on both accuracy and speed.\nSpecifically, the proposed method achieves 87.2% F-measure with 48.2 FPS on\nTotal-Text and 89.6% F-measure with 36.9 FPS on MSRA-TD500 on a single GeForce\nRTX 2080 Ti GPU.\n","authors":["Xugong Qin","Pengyuan Lyu","Chengquan Zhang","Yu Zhou","Kun Yao","Peng Zhang","Hailun Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07202v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2306.10003v2","updated":"2023-08-14T15:09:45Z","published":"2023-06-16T17:56:16Z","title":"C2F2NeUS: Cascade Cost Frustum Fusion for High Fidelity and\n Generalizable Neural Surface Reconstruction","summary":" There is an emerging effort to combine the two popular 3D frameworks using\nMulti-View Stereo (MVS) and Neural Implicit Surfaces (NIS) with a specific\nfocus on the few-shot / sparse view setting. In this paper, we introduce a\nnovel integration scheme that combines the multi-view stereo with neural signed\ndistance function representations, which potentially overcomes the limitations\nof both methods. 
MVS uses per-view depth estimation and cross-view fusion to\ngenerate accurate surfaces, while NIS relies on a common coordinate volume.\nBased on this strategy, we propose to construct per-view cost frustums for finer\ngeometry estimation, and then fuse cross-view frustums and estimate the\nimplicit signed distance functions to tackle artifacts that are due to noise\nand holes in the produced surface reconstruction. We further apply a cascade\nfrustum fusion strategy to effectively capture global-local information and\nstructural consistency. Finally, we apply cascade sampling and a\npseudo-geometric loss to foster stronger integration between the two\narchitectures. Extensive experiments demonstrate that our method reconstructs\nrobust surfaces and outperforms existing state-of-the-art methods.\n","authors":["Luoyuan Xu","Tao Guan","Yuesong Wang","Wenkai Liu","Zhaojie Zeng","Junle Wang","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2306.10003v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2302.07184v2","updated":"2023-08-14T14:49:27Z","published":"2023-02-14T16:52:26Z","title":"Point Cloud Registration for LiDAR and Photogrammetric Data: a Critical\n Synthesis and Performance Analysis on Classic and Deep Learning Algorithms","summary":" Recent advances in computer vision and deep learning have shown promising\nperformance in estimating rigid/similarity transformation between unregistered\npoint clouds of complex objects and scenes. However, their performance is\nmostly evaluated using a limited number of datasets from a single sensor (e.g.\nKinect or RealSense cameras), lacking a comprehensive overview of their\napplicability in photogrammetric 3D mapping scenarios. In this work, we provide\na comprehensive review of the state-of-the-art (SOTA) point cloud registration\nmethods, where we analyze and evaluate these methods using a diverse set of\npoint cloud data from indoor to satellite sources. The quantitative analysis\nallows for exploring the strengths, applicability, challenges, and future\ntrends of these methods. In contrast to existing analysis works that introduce\npoint cloud registration as a holistic process, our experimental analysis is\nbased on its inherent two-step process to better comprehend these approaches,\nincluding feature/keypoint-based initial coarse registration and dense fine\nregistration through cloud-to-cloud (C2C) optimization. More than ten methods,\nincluding classic hand-crafted, deep-learning-based feature correspondence, and\nrobust C2C methods, were tested. We observed that the success rate of most of\nthe algorithms is below 40% over the datasets we tested and that there is\nstill a large margin for improvement upon existing algorithms concerning 3D\nsparse correspondence search, and the ability to register point clouds with\ncomplex geometry and occlusions. 
With the evaluated statistics on three\ndatasets, we conclude the best-performing methods for each step and provide our\nrecommendations, and outlook future efforts.\n","authors":["Ningli Xu","Rongjun Qin","Shuang Song"],"pdf_url":"https://arxiv.org/pdf/2302.07184v2.pdf","comment":"7 figures"},{"id":"http://arxiv.org/abs/2308.07180v1","updated":"2023-08-14T14:39:06Z","published":"2023-08-14T14:39:06Z","title":"SEMI-CenterNet: A Machine Learning Facilitated Approach for\n Semiconductor Defect Inspection","summary":" Continual shrinking of pattern dimensions in the semiconductor domain is\nmaking it increasingly difficult to inspect defects due to factors such as the\npresence of stochastic noise and the dynamic behavior of defect patterns and\ntypes. Conventional rule-based methods and non-parametric supervised machine\nlearning algorithms like KNN mostly fail at the requirements of semiconductor\ndefect inspection at these advanced nodes. Deep Learning (DL)-based methods\nhave gained popularity in the semiconductor defect inspection domain because\nthey have been proven robust towards these challenging scenarios. In this\nresearch work, we have presented an automated DL-based approach for efficient\nlocalization and classification of defects in SEM images. We have proposed\nSEMI-CenterNet (SEMI-CN), a customized CN architecture trained on SEM images of\nsemiconductor wafer defects. The use of the proposed CN approach allows\nimproved computational efficiency compared to previously studied DL models.\nSEMI-CN gets trained to output the center, class, size, and offset of a defect\ninstance. This is different from the approach of most object detection models\nthat use anchors for bounding box prediction. Previous methods predict\nredundant bounding boxes, most of which are discarded in postprocessing. CN\nmitigates this by only predicting boxes for likely defect center points. We\ntrain SEMI-CN on two datasets and benchmark two ResNet backbones for the\nframework. Initially, ResNet models pretrained on the COCO dataset undergo\ntraining using two datasets separately. Primarily, SEMI-CN shows significant\nimprovement in inference time against previous research works. Finally,\ntransfer learning (using weights of custom SEM dataset) is applied from ADI\ndataset to AEI dataset and vice-versa, which reduces the required training time\nfor both backbones to reach the best mAP against conventional training method.\n","authors":["Vic De Ridder","Bappaditya Dey","Enrique Dehaerne","Sandip Halder","Stefan De Gendt","Bartel Van Waeyenberge"],"pdf_url":"https://arxiv.org/pdf/2308.07180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13334v3","updated":"2023-08-14T14:35:02Z","published":"2023-02-26T15:34:05Z","title":"Knowledge Restore and Transfer for Multi-label Class-Incremental\n Learning","summary":" Current class-incremental learning research mainly focuses on single-label\nclassification tasks while multi-label class-incremental learning (MLCIL) with\nmore practical application scenarios is rarely studied. Although there have\nbeen many anti-forgetting methods to solve the problem of catastrophic\nforgetting in class-incremental learning, these methods have difficulty in\nsolving the MLCIL problem due to label absence and information dilution. 
In\nthis paper, we propose a knowledge restore and transfer (KRT) framework for\nMLCIL, which includes a dynamic pseudo-label (DPL) module to restore the old\nclass knowledge and an incremental cross-attention (ICA) module to save\nsession-specific knowledge and transfer old class knowledge to the new model\nsufficiently. Besides, we propose a token loss to jointly optimize the\nincremental cross-attention module. Experimental results on MS-COCO and PASCAL\nVOC datasets demonstrate the effectiveness of our method for improving\nrecognition performance and mitigating forgetting on multi-label\nclass-incremental learning tasks.\n","authors":["Songlin Dong","Haoyu Luo","Yuhang He","Xing Wei","Yihong Gong"],"pdf_url":"https://arxiv.org/pdf/2302.13334v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.04668v2","updated":"2023-08-14T14:24:28Z","published":"2022-03-09T12:15:55Z","title":"Inadequately Pre-trained Models are Better Feature Extractors","summary":" Pre-training has been a popular learning paradigm in the deep learning era,\nespecially in annotation-insufficient scenarios. Better ImageNet pre-trained\nmodels have been demonstrated, from the perspective of architecture, by\nprevious research to have better transferability to downstream tasks. However,\nin this paper, we found that during the same pre-training process, models at\nmiddle epochs, which are inadequately pre-trained, can outperform fully trained\nmodels when used as feature extractors (FE), while the fine-tuning (FT)\nperformance still grows with the source performance. This reveals that there is\nnot a solid positive correlation between top-1 accuracy on ImageNet and the\ntransferring result on target data. Based on the contradictory phenomenon\nbetween FE and FT that a better feature extractor fails to be fine-tuned better\naccordingly, we conduct comprehensive analyses on features before the softmax layer\nto provide insightful explanations. Our discoveries suggest that, during\npre-training, models tend to first learn spectral components corresponding to\nlarge singular values and that the residual components contribute more when\nfine-tuning.\n","authors":["Andong Deng","Xingjian Li","Di Hu","Tianyang Wang","Haoyi Xiong","Chengzhong Xu"],"pdf_url":"https://arxiv.org/pdf/2203.04668v2.pdf","comment":"Accepted by ICCV'2023"},{"id":"http://arxiv.org/abs/2308.07163v1","updated":"2023-08-14T14:18:11Z","published":"2023-08-14T14:18:11Z","title":"HyperSparse Neural Networks: Shifting Exploration to Exploitation\n through Adaptive Regularization","summary":" Sparse neural networks are a key factor in developing resource-efficient\nmachine learning applications. We propose the novel and powerful sparse\nlearning method Adaptive Regularized Training (ART) to compress dense into\nsparse networks. Instead of the commonly used binary mask during training to\nreduce the number of model weights, we inherently shrink weights close to zero\nin an iterative manner with increasing weight regularization. Our method\ncompresses the pre-trained model knowledge into the weights of highest\nmagnitude. Therefore, we introduce a novel regularization loss named\nHyperSparse that exploits the highest weights while conserving the ability of\nweight exploration. Extensive experiments on CIFAR and TinyImageNet show that\nour method leads to notable performance gains compared to other sparsification\nmethods, especially in extremely high sparsity regimes up to 99.8 percent model\nsparsity. 
Additional investigations provide new insights into the patterns that\nare encoded in weights with high magnitudes.\n","authors":["Patrick Glandorf","Timo Kaiser","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2308.07163v1.pdf","comment":"ICCV'23 Workshops"},{"id":"http://arxiv.org/abs/2308.07156v1","updated":"2023-08-14T14:09:41Z","published":"2023-08-14T14:09:41Z","title":"SAM Meets Robotic Surgery: An Empirical Study on Generalization,\n Robustness and Adaptation","summary":" The Segment Anything Model (SAM) serves as a fundamental model for semantic\nsegmentation and demonstrates remarkable generalization capabilities across a\nwide range of downstream scenarios. In this empirical study, we examine SAM's\nrobustness and zero-shot generalizability in the field of robotic surgery. We\ncomprehensively explore different scenarios, including prompted and unprompted\nsituations, bounding box and points-based prompt approaches, as well as the\nability to generalize under corruptions and perturbations at five severity\nlevels. Additionally, we compare the performance of SAM with state-of-the-art\nsupervised models. We conduct all the experiments with two well-known robotic\ninstrument segmentation datasets from MICCAI EndoVis 2017 and 2018 challenges.\nOur extensive evaluation results reveal that although SAM shows remarkable\nzero-shot generalization ability with bounding box prompts, it struggles to\nsegment the whole instrument with point-based prompts and unprompted settings.\nFurthermore, our qualitative figures demonstrate that the model either failed\nto predict certain parts of the instrument mask (e.g., jaws, wrist) or\npredicted parts of the instrument as wrong classes in the scenario of\noverlapping instruments within the same bounding box or with the point-based\nprompt. In fact, SAM struggles to identify instruments in complex surgical\nscenarios characterized by the presence of blood, reflection, blur, and shade.\nAdditionally, SAM is insufficiently robust to maintain high performance when\nsubjected to various forms of data corruption. We also attempt to fine-tune SAM\nusing Low-rank Adaptation (LoRA) and propose SurgicalSAM, which shows the\ncapability in class-wise mask prediction without prompt. Therefore, we can\nargue that, without further domain-specific fine-tuning, SAM is not ready for\ndownstream surgical tasks.\n","authors":["An Wang","Mobarakol Islam","Mengya Xu","Yang Zhang","Hongliang Ren"],"pdf_url":"https://arxiv.org/pdf/2308.07156v1.pdf","comment":"Accepted as Oral Presentation at MedAGI Workshop - MICCAI 2023 1st\n International Workshop on Foundation Models for General Medical AI. arXiv\n admin note: substantial text overlap with arXiv:2304.14674"},{"id":"http://arxiv.org/abs/2308.07153v1","updated":"2023-08-14T14:06:21Z","published":"2023-08-14T14:06:21Z","title":"DELO: Deep Evidential LiDAR Odometry using Partial Optimal Transport","summary":" Accurate, robust, and real-time LiDAR-based odometry (LO) is imperative for\nmany applications like robot navigation, globally consistent 3D scene map\nreconstruction, or safe motion-planning. Though LiDAR sensor is known for its\nprecise range measurement, the non-uniform and uncertain point sampling density\ninduce structural inconsistencies. Hence, existing supervised and unsupervised\npoint set registration methods fail to establish one-to-one matching\ncorrespondences between LiDAR frames. We introduce a novel deep learning-based\nreal-time (approx. 
35-40ms per frame) LO method that jointly learns accurate\nframe-to-frame correspondences and model's predictive uncertainty (PU) as\nevidence to safe-guard LO predictions. In this work, we propose (i) partial\noptimal transportation of LiDAR feature descriptor for robust LO estimation,\n(ii) joint learning of predictive uncertainty while learning odometry over\ndriving sequences, and (iii) demonstrate how PU can serve as evidence for\nnecessary pose-graph optimization when LO network is either under or over\nconfident. We evaluate our method on KITTI dataset and show competitive\nperformance, even superior generalization ability over recent state-of-the-art\napproaches. Source codes are available.\n","authors":["Sk Aziz Ali","Djamila Aouada","Gerd Reis","Didier Stricker"],"pdf_url":"https://arxiv.org/pdf/2308.07153v1.pdf","comment":"Accepted in ICCV 2023 Workshop"},{"id":"http://arxiv.org/abs/2304.01716v2","updated":"2023-08-14T14:02:15Z","published":"2023-04-04T11:25:44Z","title":"Decoupling Dynamic Monocular Videos for Dynamic View Synthesis","summary":" The challenge of dynamic view synthesis from dynamic monocular videos, i.e.,\nsynthesizing novel views for free viewpoints given a monocular video of a\ndynamic scene captured by a moving camera, mainly lies in accurately modeling\nthe dynamic objects of a scene using limited 2D frames, each with a varying\ntimestamp and viewpoint. Existing methods usually require pre-processed 2D\noptical flow and depth maps by off-the-shelf methods to supervise the network,\nmaking them suffer from the inaccuracy of the pre-processed supervision and the\nambiguity when lifting the 2D information to 3D. In this paper, we tackle this\nchallenge in an unsupervised fashion. Specifically, we decouple the motion of\nthe dynamic objects into object motion and camera motion, respectively\nregularized by proposed unsupervised surface consistency and patch-based\nmulti-view constraints. The former enforces the 3D geometric surfaces of moving\nobjects to be consistent over time, while the latter regularizes their\nappearances to be consistent across different viewpoints. Such a fine-grained\nmotion formulation can alleviate the learning difficulty for the network, thus\nenabling it to produce not only novel views with higher quality but also more\naccurate scene flows and depth than existing methods requiring extra\nsupervision.\n","authors":["Meng You","Junhui Hou"],"pdf_url":"https://arxiv.org/pdf/2304.01716v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07151v1","updated":"2023-08-14T13:59:04Z","published":"2023-08-14T13:59:04Z","title":"Diffusion Based Augmentation for Captioning and Retrieval in Cultural\n Heritage","summary":" Cultural heritage applications and advanced machine learning models are\ncreating a fruitful synergy to provide effective and accessible ways of\ninteracting with artworks. Smart audio-guides, personalized art-related content\nand gamification approaches are just a few examples of how technology can be\nexploited to provide additional value to artists or exhibitions. Nonetheless,\nfrom a machine learning point of view, the amount of available artistic data is\noften not enough to train effective models. Off-the-shelf computer vision\nmodules can still be exploited to some extent, yet a severe domain shift is\npresent between art images and standard natural image datasets used to train\nsuch models. As a result, this can lead to degraded performance. 
This paper\nintroduces a novel approach to address the challenges of limited annotated data\nand domain shifts in the cultural heritage domain. By leveraging generative\nvision-language models, we augment art datasets by generating diverse\nvariations of artworks conditioned on their captions. This augmentation\nstrategy enhances dataset diversity, bridging the gap between natural images\nand artworks, and improving the alignment of visual cues with knowledge from\ngeneral-purpose datasets. The generated variations assist in training vision\nand language models with a deeper understanding of artistic characteristics and\nthat are able to generate better captions with appropriate jargon.\n","authors":["Dario Cioni","Lorenzo Berlincioni","Federico Becattini","Alberto del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2308.07151v1.pdf","comment":"Accepted at ICCV 2023 4th Workshop on e-Heritage"},{"id":"http://arxiv.org/abs/2308.07146v1","updated":"2023-08-14T13:53:18Z","published":"2023-08-14T13:53:18Z","title":"CTP: Towards Vision-Language Continual Pretraining via Compatible\n Momentum Contrast and Topology Preservation","summary":" Vision-Language Pretraining (VLP) has shown impressive results on diverse\ndownstream tasks by offline training on large-scale datasets. Regarding the\ngrowing nature of real-world data, such an offline training paradigm on\never-expanding data is unsustainable, because models lack the continual\nlearning ability to accumulate knowledge constantly. However, most continual\nlearning studies are limited to uni-modal classification and existing\nmulti-modal datasets cannot simulate continual non-stationary data stream\nscenarios. To support the study of Vision-Language Continual Pretraining\n(VLCP), we first contribute a comprehensive and unified benchmark dataset P9D\nwhich contains over one million product image-text pairs from 9 industries. The\ndata from each industry as an independent task supports continual learning and\nconforms to the real-world long-tail nature to simulate pretraining on web\ndata. We comprehensively study the characteristics and challenges of VLCP, and\npropose a new algorithm: Compatible momentum contrast with Topology\nPreservation, dubbed CTP. The compatible momentum model absorbs the knowledge\nof the current and previous-task models to flexibly update the modal feature.\nMoreover, Topology Preservation transfers the knowledge of embedding across\ntasks while preserving the flexibility of feature adjustment. The experimental\nresults demonstrate our method not only achieves superior performance compared\nwith other baselines but also does not bring an expensive training burden.\nDataset and codes are available at https://github.com/KevinLight831/CTP.\n","authors":["Hongguang Zhu","Yunchao Wei","Xiaodan Liang","Chunjie Zhang","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.07146v1.pdf","comment":"Accepted by ICCV 2023. Code: https://github.com/KevinLight831/CTP"},{"id":"http://arxiv.org/abs/2301.00135v3","updated":"2023-08-14T13:41:49Z","published":"2022-12-31T06:32:36Z","title":"TeViS:Translating Text Synopses to Video Storyboards","summary":" A video storyboard is a roadmap for video creation which consists of\nshot-by-shot images to visualize key plots in a text synopsis. Creating video\nstoryboards, however, remains challenging which not only requires cross-modal\nassociation between high-level texts and images but also demands long-term\nreasoning to make transitions smooth across shots. 
In this paper, we propose a\nnew task called Text synopsis to Video Storyboard (TeViS) which aims to\nretrieve an ordered sequence of images as the video storyboard to visualize the\ntext synopsis. We construct a MovieNet-TeViS dataset based on the public\nMovieNet dataset. It contains 10K text synopses each paired with keyframes\nmanually selected from corresponding movies by considering both relevance and\ncinematic coherence. To benchmark the task, we present strong CLIP-based\nbaselines and a novel VQ-Trans. VQ-Trans first encodes text synopsis and images\ninto a joint embedding space and uses vector quantization (VQ) to improve the\nvisual representation. Then, it auto-regressively generates a sequence of\nvisual features for retrieval and ordering. Experimental results demonstrate\nthat VQ-Trans significantly outperforms prior methods and the CLIP-based\nbaselines. Nevertheless, there is still a large gap compared to human\nperformance suggesting room for promising future work. The code and data are\navailable at: \\url{https://ruc-aimind.github.io/projects/TeViS/}\n","authors":["Xu Gu","Yuchong Sun","Feiyue Ni","Shizhe Chen","Xihua Wang","Ruihua Song","Boyuan Li","Xiang Cao"],"pdf_url":"https://arxiv.org/pdf/2301.00135v3.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2306.17723v4","updated":"2023-08-14T13:41:48Z","published":"2023-06-30T15:11:00Z","title":"FlipNeRF: Flipped Reflection Rays for Few-shot Novel View Synthesis","summary":" Neural Radiance Field (NeRF) has been a mainstream in novel view synthesis\nwith its remarkable quality of rendered images and simple architecture.\nAlthough NeRF has been developed in various directions improving continuously\nits performance, the necessity of a dense set of multi-view images still exists\nas a stumbling block to progress for practical application. In this work, we\npropose FlipNeRF, a novel regularization method for few-shot novel view\nsynthesis by utilizing our proposed flipped reflection rays. The flipped\nreflection rays are explicitly derived from the input ray directions and\nestimated normal vectors, and play a role of effective additional training rays\nwhile enabling to estimate more accurate surface normals and learn the 3D\ngeometry effectively. Since the surface normal and the scene depth are both\nderived from the estimated densities along a ray, the accurate surface normal\nleads to more exact depth estimation, which is a key factor for few-shot novel\nview synthesis. Furthermore, with our proposed Uncertainty-aware Emptiness Loss\nand Bottleneck Feature Consistency Loss, FlipNeRF is able to estimate more\nreliable outputs with reducing floating artifacts effectively across the\ndifferent scene structures, and enhance the feature-level consistency between\nthe pair of the rays cast toward the photo-consistent pixels without any\nadditional feature extractor, respectively. Our FlipNeRF achieves the SOTA\nperformance on the multiple benchmarks across all the scenarios.\n","authors":["Seunghyeon Seo","Yeonjin Chang","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2306.17723v4.pdf","comment":"ICCV 2023. Project Page: https://shawn615.github.io/flipnerf/"},{"id":"http://arxiv.org/abs/2302.13084v2","updated":"2023-08-14T13:18:34Z","published":"2023-02-25T14:09:52Z","title":"RemoteNet: Remote Sensing Image Segmentation Network based on\n Global-Local Information","summary":" Remotely captured images possess an immense scale and object appearance\nvariability due to the complex scene. 
It becomes challenging to capture the\nunderlying attributes in the global and local context for their segmentation.\nExisting networks struggle to capture the inherent features due to the\ncluttered background. To address these issues, we propose a remote sensing\nimage segmentation network, RemoteNet, for semantic segmentation of remote\nsensing images. We capture the global and local features by leveraging the\nbenefits of the transformer and convolution mechanisms. RemoteNet is an\nencoder-decoder design that uses multi-scale features. We construct an\nattention map module to generate channel-wise attention scores for fusing these\nfeatures. We construct a global-local transformer block (GLTB) in the decoder\nnetwork to support learning robust representations during a decoding phase.\nFurther, we designed a feature refinement module to refine the fused output of\nthe shallow stage encoder feature and the deepest GLTB feature of the decoder.\nExperimental findings on the two public datasets show the effectiveness of the\nproposed RemoteNet.\n","authors":["Satyawant Kumar","Abhishek Kumar","Dong-Gyu Lee"],"pdf_url":"https://arxiv.org/pdf/2302.13084v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07126v1","updated":"2023-08-14T13:13:50Z","published":"2023-08-14T13:13:50Z","title":"A Time-aware tensor decomposition for tracking evolving patterns","summary":" Time-evolving data sets can often be arranged as a higher-order tensor with\none of the modes being the time mode. While tensor factorizations have been\nsuccessfully used to capture the underlying patterns in such higher-order data\nsets, the temporal aspect is often ignored, allowing for the reordering of time\npoints. In recent studies, temporal regularizers are incorporated in the time\nmode to tackle this issue. Nevertheless, existing approaches still do not allow\nunderlying patterns to change in time (e.g., spatial changes in the brain,\ncontextual changes in topics). In this paper, we propose temporal PARAFAC2\n(tPARAFAC2): a PARAFAC2-based tensor factorization method with temporal\nregularization to extract gradually evolving patterns from temporal data.\nThrough extensive experiments on synthetic data, we demonstrate that tPARAFAC2\ncan capture the underlying evolving patterns accurately performing better than\nPARAFAC2 and coupled matrix factorization with temporal smoothness\nregularization.\n","authors":["Christos Chatzis","Max Pfeffer","Pedro Lind","Evrim Acar"],"pdf_url":"https://arxiv.org/pdf/2308.07126v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.07123v1","updated":"2023-08-14T13:10:48Z","published":"2023-08-14T13:10:48Z","title":"An Outlook into the Future of Egocentric Vision","summary":" What will the future be? We wonder! In this survey, we explore the gap\nbetween current research in egocentric vision and the ever-anticipated future,\nwhere wearable computing, with outward facing cameras and digital overlays, is\nexpected to be integrated in our every day lives. To understand this gap, the\narticle starts by envisaging the future through character-based stories,\nshowcasing through examples the limitations of current technology. We then\nprovide a mapping between this future and previously defined research tasks.\nFor each task, we survey its seminal works, current state-of-the-art\nmethodologies and available datasets, then reflect on shortcomings that limit\nits applicability to future research. 
Note that this survey focuses on software\nmodels for egocentric vision, independent of any specific hardware. The paper\nconcludes with recommendations for areas of immediate explorations so as to\nunlock our path to the future always-on, personalised and life-enhancing\negocentric vision.\n","authors":["Chiara Plizzari","Gabriele Goletto","Antonino Furnari","Siddhant Bansal","Francesco Ragusa","Giovanni Maria Farinella","Dima Damen","Tatiana Tommasi"],"pdf_url":"https://arxiv.org/pdf/2308.07123v1.pdf","comment":"We invite comments, suggestions and corrections here:\n https://openreview.net/forum?id=V3974SUk1w"},{"id":"http://arxiv.org/abs/2308.07119v1","updated":"2023-08-14T12:58:02Z","published":"2023-08-14T12:58:02Z","title":"On the Importance of Spatial Relations for Few-shot Action Recognition","summary":" Deep learning has achieved great success in video recognition, yet still\nstruggles to recognize novel actions when faced with only a few examples. To\ntackle this challenge, few-shot action recognition methods have been proposed\nto transfer knowledge from a source dataset to a novel target dataset with only\none or a few labeled videos. However, existing methods mainly focus on modeling\nthe temporal relations between the query and support videos while ignoring the\nspatial relations. In this paper, we find that the spatial misalignment between\nobjects also occurs in videos, notably more common than the temporal\ninconsistency. We are thus motivated to investigate the importance of spatial\nrelations and propose a more accurate few-shot action recognition method that\nleverages both spatial and temporal information. Particularly, a novel Spatial\nAlignment Cross Transformer (SA-CT) which learns to re-adjust the spatial\nrelations and incorporates the temporal information is contributed. Experiments\nreveal that, even without using any temporal information, the performance of\nSA-CT is comparable to temporal based methods on 3/4 benchmarks. To further\nincorporate the temporal information, we propose a simple yet effective\nTemporal Mixer module. The Temporal Mixer enhances the video representation and\nimproves the performance of the full SA-CT model, achieving very competitive\nresults. In this work, we also exploit large-scale pretrained models for\nfew-shot action recognition, providing useful insights for this research\ndirection.\n","authors":["Yilun Zhang","Yuqian Fu","Xingjun Ma","Lizhe Qi","Jingjing Chen","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.07119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07110v1","updated":"2023-08-14T12:49:39Z","published":"2023-08-14T12:49:39Z","title":"SCSC: Spatial Cross-scale Convolution Module to Strengthen both CNNs and\n Transformers","summary":" This paper presents a module, Spatial Cross-scale Convolution (SCSC), which\nis verified to be effective in improving both CNNs and Transformers. Nowadays,\nCNNs and Transformers have been successful in a variety of tasks. Especially\nfor Transformers, increasing works achieve state-of-the-art performance in the\ncomputer vision community. Therefore, researchers start to explore the\nmechanism of those architectures. Large receptive fields, sparse connections,\nweight sharing, and dynamic weight have been considered keys to designing\neffective base models. However, there are still some issues to be addressed:\nlarge dense kernels and self-attention are inefficient, and large receptive\nfields make it hard to capture local features. 
Inspired by the above analyses\nand to solve the mentioned problems, in this paper, we design a general module\ntaking in these design keys to enhance both CNNs and Transformers. SCSC\nintroduces an efficient spatial cross-scale encoder and spatial embed module to\ncapture assorted features in one layer. On the face recognition task,\nFaceResNet with SCSC can improve 2.7% with 68% fewer FLOPs and 79% fewer\nparameters. On the ImageNet classification task, Swin Transformer with SCSC can\nachieve even better performance with 22% fewer FLOPs, and ResNet with CSCS can\nimprove 5.3% with similar complexity. Furthermore, a traditional network (e.g.,\nResNet) embedded with SCSC can match Swin Transformer's performance.\n","authors":["Xijun Wang","Xiaojie Chu","Chunrui Han","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07110v1.pdf","comment":"ICCV2023 Workshop (New Ideas in Vision Transformers)"},{"id":"http://arxiv.org/abs/2308.05667v2","updated":"2023-08-14T12:49:28Z","published":"2023-08-10T16:10:54Z","title":"2D3D-MATR: 2D-3D Matching Transformer for Detection-free Registration\n between Images and Point Clouds","summary":" The commonly adopted detect-then-match approach to registration finds\ndifficulties in the cross-modality cases due to the incompatible keypoint\ndetection and inconsistent feature description. We propose, 2D3D-MATR, a\ndetection-free method for accurate and robust registration between images and\npoint clouds. Our method adopts a coarse-to-fine pipeline where it first\ncomputes coarse correspondences between downsampled patches of the input image\nand the point cloud and then extends them to form dense correspondences between\npixels and points within the patch region. The coarse-level patch matching is\nbased on transformer which jointly learns global contextual constraints with\nself-attention and cross-modality correlations with cross-attention. To resolve\nthe scale ambiguity in patch matching, we construct a multi-scale pyramid for\neach image patch and learn to find for each point patch the best matching image\npatch at a proper resolution level. Extensive experiments on two public\nbenchmarks demonstrate that 2D3D-MATR outperforms the previous state-of-the-art\nP2-Net by around $20$ percentage points on inlier ratio and over $10$ points on\nregistration recall. Our code and models are available at\nhttps://github.com/minhaolee/2D3DMATR.\n","authors":["Minhao Li","Zheng Qin","Zhirui Gao","Renjiao Yi","Chenyang Zhu","Yulan Guo","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2308.05667v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03061v2","updated":"2023-08-14T12:44:57Z","published":"2023-08-06T09:09:17Z","title":"InterTracker: Discovering and Tracking General Objects Interacting with\n Hands in the Wild","summary":" Understanding human interaction with objects is an important research topic\nfor embodied Artificial Intelligence and identifying the objects that humans\nare interacting with is a primary problem for interaction understanding.\nExisting methods rely on frame-based detectors to locate interacting objects.\nHowever, this approach is subjected to heavy occlusions, background clutter,\nand distracting objects. To address the limitations, in this paper, we propose\nto leverage spatio-temporal information of hand-object interaction to track\ninteractive objects under these challenging cases. 
Without prior knowledge of\nthe general objects to be tracked like object tracking problems, we first\nutilize the spatial relation between hands and objects to adaptively discover\nthe interacting objects from the scene. Second, the consistency and continuity\nof the appearance of objects between successive frames are exploited to track\nthe objects. With this tracking formulation, our method also benefits from\ntraining on large-scale general object-tracking datasets. We further curate a\nvideo-level hand-object interaction dataset for testing and evaluation from\n100DOH. The quantitative results demonstrate that our proposed method\noutperforms the state-of-the-art methods. Specifically, in scenes with\ncontinuous interaction with different objects, we achieve an impressive\nimprovement of about 10% as evaluated using the Average Precision (AP) metric.\nOur qualitative findings also illustrate that our method can produce more\ncontinuous trajectories for interacting objects.\n","authors":["Yanyan Shao","Qi Ye","Wenhan Luo","Kaihao Zhang","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03061v2.pdf","comment":"IROS 2023"},{"id":"http://arxiv.org/abs/2308.07106v1","updated":"2023-08-14T12:38:43Z","published":"2023-08-14T12:38:43Z","title":"Checklist to Transparently Define Test Oracles for TP, FP, and FN\n Objects in Automated Driving","summary":" Popular test oracles for the perception subsystem of driving automation\nsystems identify true-positive (TP), false-positive (FP), and false-negative\n(FN) objects. Oracle transparency is needed for comparing test results and for\nsafety cases. To date, there exists a common notion of TPs, FPs, and FNs in the\nfield, but apparently no published way to comprehensively define their oracles.\nTherefore, this paper provides a checklist of functional aspects and\nimplementation details that affect the oracle behavior. Besides labeling\npolicies of the test set, we cover fields of view, occlusion handling,\nsafety-relevant areas, matching criteria, temporal and probabilistic issues,\nand further aspects. Even though our checklist can hardly be formalized, it can\nhelp practitioners maximize the transparency of their oracles, which, in turn,\nmakes statements on object perception more reliable and comparable.\n","authors":["Michael Hoss"],"pdf_url":"https://arxiv.org/pdf/2308.07106v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2305.09241v3","updated":"2023-08-14T12:35:57Z","published":"2023-05-16T07:40:05Z","title":"Unlearnable Examples Give a False Sense of Security: Piercing through\n Unexploitable Data with Learnable Examples","summary":" Safeguarding data from unauthorized exploitation is vital for privacy and\nsecurity, especially in recent rampant research in security breach such as\nadversarial/membership attacks. To this end, \\textit{unlearnable examples}\n(UEs) have been recently proposed as a compelling protection, by adding\nimperceptible perturbation to data so that models trained on them cannot\nclassify them accurately on original clean distribution. Unfortunately, we find\nUEs provide a false sense of security, because they cannot stop unauthorized\nusers from utilizing other unprotected data to remove the protection, by\nturning unlearnable data into learnable again. Motivated by this observation,\nwe formally define a new threat by introducing \\textit{learnable unauthorized\nexamples} (LEs) which are UEs with their protection removed. 
The core of this\napproach is a novel purification process that projects UEs onto the manifold of\nLEs. This is realized by a new joint-conditional diffusion model which denoises\nUEs conditioned on the pixel and perceptual similarity between UEs and LEs.\nExtensive experiments demonstrate that LE delivers state-of-the-art countering\nperformance against both supervised UEs and unsupervised UEs in various\nscenarios, which is the first generalizable countermeasure to UEs across\nsupervised learning and unsupervised learning. Our code is available at\n\\url{https://github.com/jiangw-0/LE_JCDP}.\n","authors":["Wan Jiang","Yunfeng Diao","He Wang","Jianxin Sun","Meng Wang","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2305.09241v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07104v1","updated":"2023-08-14T12:35:39Z","published":"2023-08-14T12:35:39Z","title":"FocusFlow: Boosting Key-Points Optical Flow Estimation for Autonomous\n Driving","summary":" Key-point-based scene understanding is fundamental for autonomous driving\napplications. At the same time, optical flow plays an important role in many\nvision tasks. However, due to the implicit bias of equal attention on all\npoints, classic data-driven optical flow estimation methods yield less\nsatisfactory performance on key points, limiting their implementations in\nkey-point-critical safety-relevant scenarios. To address these issues, we\nintroduce a points-based modeling method that requires the model to learn\nkey-point-related priors explicitly. Based on the modeling method, we present\nFocusFlow, a framework consisting of 1) a mix loss function combined with a\nclassic photometric loss function and our proposed Conditional Point Control\nLoss (CPCL) function for diverse point-wise supervision; 2) a conditioned\ncontrolling model which substitutes the conventional feature encoder by our\nproposed Condition Control Encoder (CCE). CCE incorporates a Frame Feature\nEncoder (FFE) that extracts features from frames, a Condition Feature Encoder\n(CFE) that learns to control the feature extraction behavior of FFE from input\nmasks containing information of key points, and fusion modules that transfer\nthe controlling information between FFE and CFE. Our FocusFlow framework shows\noutstanding performance with up to +44.5% precision improvement on various key\npoints such as ORB, SIFT, and even learning-based SiLK, along with exceptional\nscalability for most existing data-driven optical flow methods like PWC-Net,\nRAFT, and FlowFormer. Notably, FocusFlow yields competitive or superior\nperformances rivaling the original models on the whole frame. The source code\nwill be available at https://github.com/ZhonghuaYi/FocusFlow_official.\n","authors":["Zhonghua Yi","Hao Shi","Kailun Yang","Qi Jiang","Yaozu Ye","Ze Wang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07104v1.pdf","comment":"The source code of FocusFlow will be available at\n https://github.com/ZhonghuaYi/FocusFlow_official"},{"id":"http://arxiv.org/abs/2302.03665v4","updated":"2023-08-14T12:31:19Z","published":"2023-02-07T18:34:59Z","title":"HumanMAC: Masked Motion Completion for Human Motion Prediction","summary":" Human motion prediction is a classical problem in computer vision and\ncomputer graphics, which has a wide range of practical applications. Previous\neffects achieve great empirical performance based on an encoding-decoding\nstyle. 
The methods of this style work by first encoding previous motions to\nlatent representations and then decoding the latent representations into\npredicted motions. However, in practice, they are still unsatisfactory due to\nseveral issues, including complicated loss constraints, cumbersome training\nprocesses, and scarce switch of different categories of motions in prediction.\nIn this paper, to address the above issues, we jump out of the foregoing style\nand propose a novel framework from a new perspective. Specifically, our\nframework works in a masked completion fashion. In the training stage, we learn\na motion diffusion model that generates motions from random noise. In the\ninference stage, with a denoising procedure, we make motion prediction\nconditioning on observed motions to output more continuous and controllable\npredictions. The proposed framework enjoys promising algorithmic properties,\nwhich only needs one loss in optimization and is trained in an end-to-end\nmanner. Additionally, it accomplishes the switch of different categories of\nmotions effectively, which is significant in realistic tasks, e.g., the\nanimation task. Comprehensive experiments on benchmarks confirm the superiority\nof the proposed framework. The project page is available at\nhttps://lhchen.top/Human-MAC.\n","authors":["Ling-Hao Chen","Jiawei Zhang","Yewen Li","Yiren Pang","Xiaobo Xia","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2302.03665v4.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07102v1","updated":"2023-08-14T12:30:58Z","published":"2023-08-14T12:30:58Z","title":"Temporal Sentence Grounding in Streaming Videos","summary":" This paper aims to tackle a novel task - Temporal Sentence Grounding in\nStreaming Videos (TSGSV). The goal of TSGSV is to evaluate the relevance\nbetween a video stream and a given sentence query. Unlike regular videos,\nstreaming videos are acquired continuously from a particular source, and are\nalways desired to be processed on-the-fly in many applications such as\nsurveillance and live-stream analysis. Thus, TSGSV is challenging since it\nrequires the model to infer without future frames and process long historical\nframes effectively, which is untouched in the early methods. To specifically\naddress the above challenges, we propose two novel methods: (1) a TwinNet\nstructure that enables the model to learn about upcoming events; and (2) a\nlanguage-guided feature compressor that eliminates redundant visual frames and\nreinforces the frames that are relevant to the query. We conduct extensive\nexperiments using ActivityNet Captions, TACoS, and MAD datasets. The results\ndemonstrate the superiority of our proposed methods. A systematic ablation\nstudy also confirms their effectiveness.\n","authors":["Tian Gan","Xiao Wang","Yan Sun","Jianlong Wu","Qingpei Guo","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2308.07102v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2306.09011v2","updated":"2023-08-14T12:16:53Z","published":"2023-06-15T10:12:02Z","title":"CAD-Estate: Large-scale CAD Model Annotation in RGB Videos","summary":" We propose a method for annotating videos of complex multi-object scenes with\na globally-consistent 3D representation of the objects. We annotate each object\nwith a CAD model from a database, and place it in the 3D coordinate frame of\nthe scene with a 9-DoF pose transformation. Our method is semi-automatic and\nworks on commonly-available RGB videos, without requiring a depth sensor. 
Many\nsteps are performed automatically, and the tasks performed by humans are\nsimple, well-specified, and require only limited reasoning in 3D. This makes\nthem feasible for crowd-sourcing and has allowed us to construct a large-scale\ndataset by annotating real-estate videos from YouTube. Our dataset CAD-Estate\noffers 101k instances of 12k unique CAD models placed in the 3D representations\nof 20k videos. In comparison to Scan2CAD, the largest existing dataset with CAD\nmodel annotations on real scenes, CAD-Estate has 7x more instances and 4x more\nunique CAD models. We showcase the benefits of pre-training a Mask2CAD model on\nCAD-Estate for the task of automatic 3D object reconstruction and pose\nestimation, demonstrating that it leads to performance improvements on the\npopular Scan2CAD benchmark. The dataset is available at\nhttps://github.com/google-research/cad-estate.\n","authors":["Kevis-Kokitsi Maninis","Stefan Popov","Matthias Nießner","Vittorio Ferrari"],"pdf_url":"https://arxiv.org/pdf/2306.09011v2.pdf","comment":"Project page: https://github.com/google-research/cad-estate"},{"id":"http://arxiv.org/abs/2210.06551v5","updated":"2023-08-14T12:11:35Z","published":"2022-10-12T19:46:25Z","title":"MotionBERT: A Unified Perspective on Learning Human Motion\n Representations","summary":" We present a unified perspective on tackling various human-centric video\ntasks by learning human motion representations from large-scale and\nheterogeneous data resources. Specifically, we propose a pretraining stage in\nwhich a motion encoder is trained to recover the underlying 3D motion from\nnoisy partial 2D observations. The motion representations acquired in this way\nincorporate geometric, kinematic, and physical knowledge about human motion,\nwhich can be easily transferred to multiple downstream tasks. We implement the\nmotion encoder with a Dual-stream Spatio-temporal Transformer (DSTformer)\nneural network. It could capture long-range spatio-temporal relationships among\nthe skeletal joints comprehensively and adaptively, exemplified by the lowest\n3D pose estimation error so far when trained from scratch. Furthermore, our\nproposed framework achieves state-of-the-art performance on all three\ndownstream tasks by simply finetuning the pretrained motion encoder with a\nsimple regression head (1-2 layers), which demonstrates the versatility of the\nlearned motion representations. Code and models are available at\nhttps://motionbert.github.io/\n","authors":["Wentao Zhu","Xiaoxuan Ma","Zhaoyang Liu","Libin Liu","Wayne Wu","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2210.06551v5.pdf","comment":"ICCV 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2308.07092v1","updated":"2023-08-14T11:56:39Z","published":"2023-08-14T11:56:39Z","title":"Masked Motion Predictors are Strong 3D Action Representation Learners","summary":" In 3D human action recognition, limited supervised data makes it challenging\nto fully tap into the modeling potential of powerful networks such as\ntransformers. As a result, researchers have been actively investigating\neffective self-supervised pre-training strategies. In this work, we show that\ninstead of following the prevalent pretext task to perform masked\nself-component reconstruction in human joints, explicit contextual motion\nmodeling is key to the success of learning effective feature representation for\n3D action recognition. Formally, we propose the Masked Motion Prediction (MAMP)\nframework. 
To be specific, the proposed MAMP takes as input the masked\nspatio-temporal skeleton sequence and predicts the corresponding temporal\nmotion of the masked human joints. Considering the high temporal redundancy of\nthe skeleton sequence, in our MAMP, the motion information also acts as an\nempirical semantic richness prior that guide the masking process, promoting\nbetter attention to semantically rich temporal regions. Extensive experiments\non NTU-60, NTU-120, and PKU-MMD datasets show that the proposed MAMP\npre-training substantially improves the performance of the adopted vanilla\ntransformer, achieving state-of-the-art results without bells and whistles. The\nsource code of our MAMP is available at https://github.com/maoyunyao/MAMP.\n","authors":["Yunyao Mao","Jiajun Deng","Wengang Zhou","Yao Fang","Wanli Ouyang","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2308.07092v1.pdf","comment":"To appear in ICCV 2023"},{"id":"http://arxiv.org/abs/2303.12398v3","updated":"2023-08-14T11:47:41Z","published":"2023-03-22T09:06:07Z","title":"Multiscale Attention via Wavelet Neural Operators for Vision\n Transformers","summary":" Transformers have achieved widespread success in computer vision. At their\nheart, there is a Self-Attention (SA) mechanism, an inductive bias that\nassociates each token in the input with every other token through a weighted\nbasis. The standard SA mechanism has quadratic complexity with the sequence\nlength, which impedes its utility to long sequences appearing in high\nresolution vision. Recently, inspired by operator learning for PDEs, Adaptive\nFourier Neural Operators (AFNO) were introduced for high resolution attention\nbased on global convolution that is efficiently implemented via FFT. However,\nthe AFNO global filtering cannot well represent small and moderate scale\nstructures that commonly appear in natural images. To leverage the\ncoarse-to-fine scale structures we introduce a Multiscale Wavelet Attention\n(MWA) by leveraging wavelet neural operators which incurs linear complexity in\nthe sequence size. We replace the attention in ViT with MWA and our experiments\nwith CIFAR and Tiny-ImageNet classification demonstrate significant improvement\nover alternative Fourier-based attentions such as AFNO and Global Filter\nNetwork (GFN).\n","authors":["Anahita Nekoozadeh","Mohammad Reza Ahmadzadeh","Zahra Mardani"],"pdf_url":"https://arxiv.org/pdf/2303.12398v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07078v1","updated":"2023-08-14T11:21:47Z","published":"2023-08-14T11:21:47Z","title":"ICPC: Instance-Conditioned Prompting with Contrastive Learning for\n Semantic Segmentation","summary":" Modern supervised semantic segmentation methods are usually finetuned based\non the supervised or self-supervised models pre-trained on ImageNet. Recent\nwork shows that transferring the knowledge from CLIP to semantic segmentation\nvia prompt learning can achieve promising performance. The performance boost\ncomes from the feature enhancement with multimodal alignment, i.e., the dot\nproduct between vision and text embeddings. However, how to improve the\nmultimodal alignment for better transfer performance in dense tasks remains\nunderexplored. 
In this work, we focus on improving the quality of vision-text\nalignment from two aspects of prompting design and loss function, and present\nan instance-conditioned prompting with contrastive learning (ICPC) framework.\nFirst, compared with the static prompt designs, we reveal that dynamic\nprompting conditioned on image content can more efficiently utilize the text\nencoder for complex dense tasks. Second, we propose an align-guided contrastive\nloss to refine the alignment of vision and text embeddings. We further propose\nlightweight multi-scale alignment for better performance. Extensive experiments\non three large-scale datasets (ADE20K, COCO-Stuff10k, and ADE20K-Full)\ndemonstrate that ICPC brings consistent improvements across diverse backbones.\nTaking ResNet-50 as an example, ICPC outperforms the state-of-the-art\ncounterpart by 1.71%, 1.05%, and 1.41% mIoU on the three datasets,\nrespectively.\n","authors":["Chaohui Yu","Qiang Zhou","Zhibin Wang","Fan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12280v2","updated":"2023-08-14T11:16:44Z","published":"2023-07-23T10:16:47Z","title":"Downstream-agnostic Adversarial Examples","summary":" Self-supervised learning usually uses a large amount of unlabeled data to\npre-train an encoder which can be used as a general-purpose feature extractor,\nsuch that downstream users only need to perform fine-tuning operations to enjoy\nthe benefit of \"large model\". Despite this promising prospect, the security of\npre-trained encoder has not been thoroughly investigated yet, especially when\nthe pre-trained encoder is publicly available for commercial use.\n In this paper, we propose AdvEncoder, the first framework for generating\ndownstream-agnostic universal adversarial examples based on the pre-trained\nencoder. AdvEncoder aims to construct a universal adversarial perturbation or\npatch for a set of natural images that can fool all the downstream tasks\ninheriting the victim pre-trained encoder. Unlike traditional adversarial\nexample works, the pre-trained encoder only outputs feature vectors rather than\nclassification labels. Therefore, we first exploit the high frequency component\ninformation of the image to guide the generation of adversarial examples. Then\nwe design a generative attack framework to construct adversarial\nperturbations/patches by learning the distribution of the attack surrogate\ndataset to improve their attack success rates and transferability. Our results\nshow that an attacker can successfully attack downstream tasks without knowing\neither the pre-training dataset or the downstream dataset. We also tailor four\ndefenses for pre-trained encoders, the results of which further prove the\nattack ability of AdvEncoder.\n","authors":["Ziqi Zhou","Shengshan Hu","Ruizhi Zhao","Qian Wang","Leo Yu Zhang","Junhui Hou","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.12280v2.pdf","comment":"This paper has been accepted by the International Conference on\n Computer Vision (ICCV '23, October 2--6, 2023, Paris, France)"},{"id":"http://arxiv.org/abs/2308.07072v1","updated":"2023-08-14T11:06:28Z","published":"2023-08-14T11:06:28Z","title":"Teeth And Root Canals Segmentation Using ZXYFormer With Uncertainty\n Guidance And Weight Transfer","summary":" This study attempts to segment teeth and root-canals simultaneously from CBCT\nimages, but there are very challenging problems in this process. 
First, the\nclinical CBCT image data is very large (e.g., 672 *688 * 688), and the use of\ndownsampling operation will lose useful information about teeth and root\ncanals. Second, teeth and root canals are very different in morphology, and it\nis difficult for a simple network to identify them precisely. In addition,\nthere are weak edges at the tooth, between tooth and root canal, which makes it\nvery difficult to segment such weak edges. To this end, we propose a\ncoarse-to-fine segmentation method based on inverse feature fusion transformer\nand uncertainty estimation to address above challenging problems. First, we use\nthe downscaled volume data (e.g., 128 * 128 * 128) to conduct coarse\nsegmentation and map it to the original volume to obtain the area of teeth and\nroot canals. Then, we design a transformer with reverse feature fusion, which\ncan bring better segmentation effect of different morphological objects by\ntransferring deeper features to shallow features. Finally, we design an\nauxiliary branch to calculate and refine the difficult areas in order to\nimprove the weak edge segmentation performance of teeth and root canals.\nThrough the combined tooth and root canal segmentation experiment of 157\nclinical high-resolution CBCT data, it is verified that the proposed method is\nsuperior to the existing tooth or root canal segmentation methods.\n","authors":["Shangxuan Li","Yu Du","Li Ye","Chichi Li","Yanshu Fang","Cheng Wang","Wu Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.07072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07070v1","updated":"2023-08-14T11:05:37Z","published":"2023-08-14T11:05:37Z","title":"A Local Iterative Approach for the Extraction of 2D Manifolds from\n Strongly Curved and Folded Thin-Layer Structures","summary":" Ridge surfaces represent important features for the analysis of 3-dimensional\n(3D) datasets in diverse applications and are often derived from varying\nunderlying data including flow fields, geological fault data, and point data,\nbut they can also be present in the original scalar images acquired using a\nplethora of imaging techniques. Our work is motivated by the analysis of image\ndata acquired using micro-computed tomography (Micro-CT) of ancient, rolled and\nfolded thin-layer structures such as papyrus, parchment, and paper as well as\nsilver and lead sheets. From these documents we know that they are\n2-dimensional (2D) in nature. Hence, we are particularly interested in\nreconstructing 2D manifolds that approximate the document's structure. The\nimage data from which we want to reconstruct the 2D manifolds are often very\nnoisy and represent folded, densely-layered structures with many artifacts,\nsuch as ruptures or layer splitting and merging. Previous ridge-surface\nextraction methods fail to extract the desired 2D manifold for such challenging\ndata. We have therefore developed a novel method to extract 2D manifolds. The\nproposed method uses a local fast marching scheme in combination with a\nseparation of the region covered by fast marching into two sub-regions. The 2D\nmanifold of interest is then extracted as the surface separating the two\nsub-regions. The local scheme can be applied for both automatic propagation as\nwell as interactive analysis. 
We demonstrate the applicability and robustness\nof our method on both artificial data as well as real-world data including\nfolded silver and papyrus sheets.\n","authors":["Nicolas Klenert","Verena Lepper","Daniel Baum"],"pdf_url":"https://arxiv.org/pdf/2308.07070v1.pdf","comment":"16 pages, 21 figures, to be published in IEEE Transactions on\n Visualization and Computer Graphics"},{"id":"http://arxiv.org/abs/2203.10496v3","updated":"2023-08-14T10:45:48Z","published":"2022-03-20T09:02:13Z","title":"NeuralReshaper: Single-image Human-body Retouching with Deep Neural\n Networks","summary":" In this paper, we present NeuralReshaper, a novel method for semantic\nreshaping of human bodies in single images using deep generative networks. To\nachieve globally coherent reshaping effects, our approach follows a\nfit-then-reshape pipeline, which first fits a parametric 3D human model to a\nsource human image and then reshapes the fitted 3D model with respect to\nuser-specified semantic attributes. Previous methods rely on image warping to\ntransfer 3D reshaping effects to the entire image domain and thus often cause\ndistortions in both foreground and background. In contrast, we resort to\ngenerative adversarial nets conditioned on the source image and a 2D warping\nfield induced by the reshaped 3D model, to achieve more realistic reshaping\nresults. Specifically, we separately encode the foreground and background\ninformation in the source image using a two-headed UNet-like generator, and\nguide the information flow from the foreground branch to the background branch\nvia feature space warping. Furthermore, to deal with the lack-of-data problem\nthat no paired data exist (i.e., the same human bodies in varying shapes), we\nintroduce a novel self-supervised strategy to train our network. Unlike\nprevious methods that often require manual efforts to correct undesirable\nartifacts caused by incorrect body-to-image fitting, our method is fully\nautomatic. Extensive experiments on both indoor and outdoor datasets\ndemonstrate the superiority of our method over previous approaches.\n","authors":["Beijia Chen","Yuefan Shen","Hongbo Fu","Xiang Chen","Kun Zhou","Youyi Zheng"],"pdf_url":"https://arxiv.org/pdf/2203.10496v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07052v1","updated":"2023-08-14T10:23:25Z","published":"2023-08-14T10:23:25Z","title":"Diagnosis of Scalp Disorders using Machine Learning and Deep Learning\n Approach -- A Review","summary":" The morbidity of scalp diseases is minuscule compared to other diseases, but\nthe impact on the patient's life is enormous. It is common for people to\nexperience scalp problems that include Dandruff, Psoriasis, Tinea-Capitis,\nAlopecia and Atopic-Dermatitis. In accordance with WHO research, approximately\n70% of adults have problems with their scalp. It has been demonstrated in\ndescriptive research that hair quality is impaired by impaired scalp, but these\nimpacts are reversible with early diagnosis and treatment. Deep Learning\nadvances have demonstrated the effectiveness of CNN paired with FCN in\ndiagnosing scalp and skin disorders. In one proposed Deep-Learning-based scalp\ninspection and diagnosis system, an imaging microscope and a trained model are\ncombined with an app that classifies scalp disorders accurately with an average\nprecision of 97.41%- 99.09%. Another research dealt with classifying the\nPsoriasis using the CNN with an accuracy of 82.9%. As part of another study, an\nML based algorithm was also employed. 
It accurately classified the healthy\nscalp and alopecia areata with 91.4% and 88.9% accuracy with SVM and KNN\nalgorithms. Using deep learning models to diagnose scalp related diseases has\nimproved due to advancements i computation capabilities and computer vision,\nbut there remains a wide horizon for further improvements.\n","authors":["Hrishabh Tiwari","Jatin Moolchandani","Shamla Mantri"],"pdf_url":"https://arxiv.org/pdf/2308.07052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07050v1","updated":"2023-08-14T10:21:06Z","published":"2023-08-14T10:21:06Z","title":"Survey on video anomaly detection in dynamic scenes with moving cameras","summary":" The increasing popularity of compact and inexpensive cameras, e.g.~dash\ncameras, body cameras, and cameras equipped on robots, has sparked a growing\ninterest in detecting anomalies within dynamic scenes recorded by moving\ncameras. However, existing reviews primarily concentrate on Video Anomaly\nDetection (VAD) methods assuming static cameras. The VAD literature with moving\ncameras remains fragmented, lacking comprehensive reviews to date. To address\nthis gap, we endeavor to present the first comprehensive survey on Moving\nCamera Video Anomaly Detection (MC-VAD). We delve into the research papers\nrelated to MC-VAD, critically assessing their limitations and highlighting\nassociated challenges. Our exploration encompasses three application domains:\nsecurity, urban transportation, and marine environments, which in turn cover\nsix specific tasks. We compile an extensive list of 25 publicly-available\ndatasets spanning four distinct environments: underwater, water surface,\nground, and aerial. We summarize the types of anomalies these datasets\ncorrespond to or contain, and present five main categories of approaches for\ndetecting such anomalies. Lastly, we identify future research directions and\ndiscuss novel contributions that could advance the field of MC-VAD. With this\nsurvey, we aim to offer a valuable reference for researchers and practitioners\nstriving to develop and advance state-of-the-art MC-VAD methods.\n","authors":["Runyu Jiao","Yi Wan","Fabio Poiesi","Yiming Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07050v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.07039v1","updated":"2023-08-14T10:02:30Z","published":"2023-08-14T10:02:30Z","title":"The minimal computational substrate of fluid intelligence","summary":" The quantification of cognitive powers rests on identifying a behavioural\ntask that depends on them. Such dependence cannot be assured, for the powers a\ntask invokes cannot be experimentally controlled or constrained a priori,\nresulting in unknown vulnerability to failure of specificity and\ngeneralisability. Evaluating a compact version of Raven's Advanced Progressive\nMatrices (RAPM), a widely used clinical test of fluid intelligence, we show\nthat LaMa, a self-supervised artificial neural network trained solely on the\ncompletion of partially masked images of natural environmental scenes, achieves\nhuman-level test scores a prima vista, without any task-specific inductive bias\nor training. Compared with cohorts of healthy and focally lesioned\nparticipants, LaMa exhibits human-like variation with item difficulty, and\nproduces errors characteristic of right frontal lobe damage under degradation\nof its ability to integrate global spatial patterns. 
LaMa's narrow training and\nlimited capacity -- comparable to the nervous system of the fruit fly --\nsuggest RAPM may be open to computationally simple solutions that need not\nnecessarily invoke abstract reasoning.\n","authors":["Amy PK Nelson","Joe Mole","Guilherme Pombo","Robert J Gray","James K Ruffle","Edgar Chan","Geraint E Rees","Lisa Cipolotti","Parashkev Nachev"],"pdf_url":"https://arxiv.org/pdf/2308.07039v1.pdf","comment":"26 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.07034v1","updated":"2023-08-14T09:53:27Z","published":"2023-08-14T09:53:27Z","title":"An Inherent Trade-Off in Noisy Neural Communication with Rank-Order\n Coding","summary":" Rank-order coding, a form of temporal coding, has emerged as a promising\nscheme to explain the rapid ability of the mammalian brain. Owing to its speed\nas well as efficiency, rank-order coding is increasingly gaining interest in\ndiverse research areas beyond neuroscience. However, much uncertainty still\nexists about the performance of rank-order coding under noise. Herein we show\nwhat information rates are fundamentally possible and what trade-offs are at\nstake. An unexpected finding in this paper is the emergence of a special class\nof errors that, in a regime, increase with less noise.\n","authors":["Ibrahim Alsolami","Tomoki Fukai"],"pdf_url":"https://arxiv.org/pdf/2308.07034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07032v1","updated":"2023-08-14T09:45:28Z","published":"2023-08-14T09:45:28Z","title":"S3IM: Stochastic Structural SIMilarity and Its Unreasonable\n Effectiveness for Neural Fields","summary":" Recently, Neural Radiance Field (NeRF) has shown great success in rendering\nnovel-view images of a given scene by learning an implicit representation with\nonly posed RGB images. NeRF and relevant neural field methods (e.g., neural\nsurface representation) typically optimize a point-wise loss and make\npoint-wise predictions, where one data point corresponds to one pixel.\nUnfortunately, this line of research failed to use the collective supervision\nof distant pixels, although it is known that pixels in an image or scene can\nprovide rich structural information. To the best of our knowledge, we are the\nfirst to design a nonlocal multiplex training paradigm for NeRF and relevant\nneural field methods via a novel Stochastic Structural SIMilarity (S3IM) loss\nthat processes multiple data points as a whole set instead of process multiple\ninputs independently. Our extensive experiments demonstrate the unreasonable\neffectiveness of S3IM in improving NeRF and neural surface representation for\nnearly free. The improvements of quality metrics can be particularly\nsignificant for those relatively difficult tasks: e.g., the test MSE loss\nunexpectedly drops by more than 90% for TensoRF and DVGO over eight novel view\nsynthesis tasks; a 198% F-score gain and a 64% Chamfer $L_{1}$ distance\nreduction for NeuS over eight surface reconstruction tasks. Moreover, S3IM is\nconsistently robust even with sparse inputs, corrupted images, and dynamic\nscenes.\n","authors":["Zeke Xie","Xindi Yang","Yujie Yang","Qi Sun","Yixiang Jiang","Haoran Wang","Yunfeng Cai","Mingming Sun"],"pdf_url":"https://arxiv.org/pdf/2308.07032v1.pdf","comment":"ICCV 2023 main conference. Code: https://github.com/Madaoer/S3IM. 
14\n pages, 5 figures, 17 tables"},{"id":"http://arxiv.org/abs/2308.07026v1","updated":"2023-08-14T09:29:22Z","published":"2023-08-14T09:29:22Z","title":"AdvCLIP: Downstream-agnostic Adversarial Examples in Multimodal\n Contrastive Learning","summary":" Multimodal contrastive learning aims to train a general-purpose feature\nextractor, such as CLIP, on vast amounts of raw, unlabeled paired image-text\ndata. This can greatly benefit various complex downstream tasks, including\ncross-modal image-text retrieval and image classification. Despite its\npromising prospect, the security issue of cross-modal pre-trained encoder has\nnot been fully explored yet, especially when the pre-trained encoder is\npublicly available for commercial use.\n In this work, we propose AdvCLIP, the first attack framework for generating\ndownstream-agnostic adversarial examples based on cross-modal pre-trained\nencoders. AdvCLIP aims to construct a universal adversarial patch for a set of\nnatural images that can fool all the downstream tasks inheriting the victim\ncross-modal pre-trained encoder. To address the challenges of heterogeneity\nbetween different modalities and unknown downstream tasks, we first build a\ntopological graph structure to capture the relevant positions between target\nsamples and their neighbors. Then, we design a topology-deviation based\ngenerative adversarial network to generate a universal adversarial patch. By\nadding the patch to images, we minimize their embeddings similarity to\ndifferent modality and perturb the sample distribution in the feature space,\nachieving unviersal non-targeted attacks. Our results demonstrate the excellent\nattack performance of AdvCLIP on two types of downstream tasks across eight\ndatasets. We also tailor three popular defenses to mitigate AdvCLIP,\nhighlighting the need for new defense mechanisms to defend cross-modal\npre-trained encoders.\n","authors":["Ziqi Zhou","Shengshan Hu","Minghui Li","Hangtao Zhang","Yechao Zhang","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2308.07026v1.pdf","comment":"This paper has been accepted by the ACM International Conference on\n Multimedia (ACM MM '23, October 29-November 3, 2023, Ottawa, ON, Canada)"},{"id":"http://arxiv.org/abs/2308.07024v1","updated":"2023-08-14T09:19:26Z","published":"2023-08-14T09:19:26Z","title":"PGT-Net: Progressive Guided Multi-task Neural Network for Small-area Wet\n Fingerprint Denoising and Recognition","summary":" Fingerprint recognition on mobile devices is an important method for identity\nverification. However, real fingerprints usually contain sweat and moisture\nwhich leads to poor recognition performance. In addition, for rolling out\nslimmer and thinner phones, technology companies reduce the size of recognition\nsensors by embedding them with the power button. Therefore, the limited size of\nfingerprint data also increases the difficulty of recognition. Denoising the\nsmall-area wet fingerprint images to clean ones becomes crucial to improve\nrecognition performance. In this paper, we propose an end-to-end trainable\nprogressive guided multi-task neural network (PGT-Net). The PGT-Net includes a\nshared stage and specific multi-task stages, enabling the network to train\nbinary and non-binary fingerprints sequentially. The binary information is\nregarded as guidance for output enhancement which is enriched with the ridge\nand valley details. Moreover, a novel residual scaling mechanism is introduced\nto stabilize the training process. 
Experiment results on the FW9395 and\nFT-lightnoised dataset provided by FocalTech shows that PGT-Net has promising\nperformance on the wet-fingerprint denoising and significantly improves the\nfingerprint recognition rate (FRR). On the FT-lightnoised dataset, the FRR of\nfingerprint recognition can be declined from 17.75% to 4.47%. On the FW9395\ndataset, the FRR of fingerprint recognition can be declined from 9.45% to\n1.09%.\n","authors":["Yu-Ting Li","Ching-Te Chiu","An-Ting Hsieh","Mao-Hsiu Hsu","Long Wenyong","Jui-Min Hsu"],"pdf_url":"https://arxiv.org/pdf/2308.07024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07017v1","updated":"2023-08-14T09:06:21Z","published":"2023-08-14T09:06:21Z","title":"Contrastive Bi-Projector for Unsupervised Domain Adaption","summary":" This paper proposes a novel unsupervised domain adaption (UDA) method based\non contrastive bi-projector (CBP), which can improve the existing UDA methods.\nIt is called CBPUDA here, which effectively promotes the feature extractors\n(FEs) to reduce the generation of ambiguous features for classification and\ndomain adaption. The CBP differs from traditional bi-classifier-based methods\nat that these two classifiers are replaced with two projectors of performing a\nmapping from the input feature to two distinct features. These two projectors\nand the FEs in the CBPUDA can be trained adversarially to obtain more refined\ndecision boundaries so that it can possess powerful classification performance.\nTwo properties of the proposed loss function are analyzed here. The first\nproperty is to derive an upper bound of joint prediction entropy, which is used\nto form the proposed loss function, contrastive discrepancy (CD) loss. The CD\nloss takes the advantages of the contrastive learning and the bi-classifier.\nThe second property is to analyze the gradient of the CD loss and then overcome\nthe drawback of the CD loss. The result of the second property is utilized in\nthe development of the gradient scaling (GS) scheme in this paper. The GS\nscheme can be exploited to tackle the unstable problem of the CD loss because\ntraining the CBPUDA requires using contrastive learning and adversarial\nlearning at the same time. Therefore, using the CD loss with the GS scheme\novercomes the problem mentioned above to make features more compact for\nintra-class and distinguishable for inter-class. Experimental results express\nthat the CBPUDA is superior to conventional UDA methods under consideration in\nthis paper for UDA and fine-grained UDA tasks.\n","authors":["Lin-Chieh Huang","Hung-Hsu Tsai"],"pdf_url":"https://arxiv.org/pdf/2308.07017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07016v1","updated":"2023-08-14T09:04:06Z","published":"2023-08-14T09:04:06Z","title":"HPFormer: Hyperspectral image prompt object tracking","summary":" Hyperspectral imagery contains abundant spectral information beyond the\nvisible RGB bands, providing rich discriminative details about objects in a\nscene. Leveraging such data has the potential to enhance visual tracking\nperformance. While prior hyperspectral trackers employ CNN or hybrid\nCNN-Transformer architectures, we propose a novel approach HPFormer on\nTransformers to capitalize on their powerful representation learning\ncapabilities. The core of HPFormer is a Hyperspectral Hybrid Attention (HHA)\nmodule which unifies feature extraction and fusion within one component through\ntoken interactions. 
Additionally, a Transform Band Module (TBM) is introduced\nto selectively aggregate spatial details and spectral signatures from the full\nhyperspectral input for injecting informative target representations. Extensive\nexperiments demonstrate state-of-the-art performance of HPFormer on benchmark\nNIR and VIS tracking datasets. Our work provides new insights into harnessing\nthe strengths of transformers and hyperspectral fusion to advance robust object\ntracking.\n","authors":["Yuedong Tan"],"pdf_url":"https://arxiv.org/pdf/2308.07016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.01146v4","updated":"2023-08-14T08:54:43Z","published":"2023-01-03T15:11:41Z","title":"Rethinking Mobile Block for Efficient Attention-based Models","summary":" This paper focuses on developing modern, efficient, lightweight models for\ndense predictions while trading off parameters, FLOPs, and performance.\nInverted Residual Block (IRB) serves as the infrastructure for lightweight\nCNNs, but no counterpart has been recognized by attention-based studies. This\nwork rethinks lightweight infrastructure from efficient IRB and effective\ncomponents of Transformer from a unified perspective, extending CNN-based IRB\nto attention-based models and abstracting a one-residual Meta Mobile Block\n(MMB) for lightweight model design. Following simple but effective design\ncriterion, we deduce a modern Inverted Residual Mobile Block (iRMB) and build a\nResNet-like Efficient MOdel (EMO) with only iRMB for down-stream tasks.\nExtensive experiments on ImageNet-1K, COCO2017, and ADE20K benchmarks\ndemonstrate the superiority of our EMO over state-of-the-art methods, e.g.,\nEMO-1M/2M/5M achieve 71.5, 75.1, and 78.4 Top-1 that surpass equal-order\nCNN-/Attention-based models, while trading-off the parameter, efficiency, and\naccuracy well: running 2.8-4.0x faster than EdgeNeXt on iPhone14.\n","authors":["Jiangning Zhang","Xiangtai Li","Jian Li","Liang Liu","Zhucun Xue","Boshen Zhang","Zhengkai Jiang","Tianxin Huang","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2301.01146v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07009v1","updated":"2023-08-14T08:52:41Z","published":"2023-08-14T08:52:41Z","title":"ACTIVE: Towards Highly Transferable 3D Physical Camouflage for Universal\n and Robust Vehicle Evasion","summary":" Adversarial camouflage has garnered attention for its ability to attack\nobject detectors from any viewpoint by covering the entire object's surface.\nHowever, universality and robustness in existing methods often fall short as\nthe transferability aspect is often overlooked, thus restricting their\napplication only to a specific target with limited performance. To address\nthese challenges, we present Adversarial Camouflage for Transferable and\nIntensive Vehicle Evasion (ACTIVE), a state-of-the-art physical camouflage\nattack framework designed to generate universal and robust adversarial\ncamouflage capable of concealing any 3D vehicle from detectors. Our framework\nincorporates innovative techniques to enhance universality and robustness: a\nrefined texture rendering that enables common texture application to different\nvehicles without being constrained to a specific texture map, a novel stealth\nloss that renders the vehicle undetectable, and a smooth and camouflage loss to\nenhance the naturalness of the adversarial camouflage. 
Our extensive\nexperiments on 15 different models show that ACTIVE consistently outperforms\nexisting works on various public detectors, including the latest YOLOv7.\nNotably, our universality evaluations reveal promising transferability to other\nvehicle classes, tasks (segmentation models), and the real world, not just\nother vehicles.\n","authors":["Naufal Suryanto","Yongsu Kim","Harashta Tatimma Larasati","Hyoeun Kang","Thi-Thu-Huong Le","Yoonyoung Hong","Hunmin Yang","Se-Yoon Oh","Howon Kim"],"pdf_url":"https://arxiv.org/pdf/2308.07009v1.pdf","comment":"Accepted for ICCV 2023. Main Paper with Supplementary Material.\n Project Page: https://islab-ai.github.io/active-iccv2023/"},{"id":"http://arxiv.org/abs/2307.06505v2","updated":"2023-08-14T08:52:02Z","published":"2023-07-13T01:05:12Z","title":"WaterScenes: A Multi-Task 4D Radar-Camera Fusion Dataset and Benchmark\n for Autonomous Driving on Water Surfaces","summary":" Autonomous driving on water surfaces plays an essential role in executing\nhazardous and time-consuming missions, such as maritime surveillance, survivors\nrescue, environmental monitoring, hydrography mapping and waste cleaning. This\nwork presents WaterScenes, the first multi-task 4D radar-camera fusion dataset\nfor autonomous driving on water surfaces. Equipped with a 4D radar and a\nmonocular camera, our Unmanned Surface Vehicle (USV) proffers all-weather\nsolutions for discerning object-related information, including color, shape,\ntexture, range, velocity, azimuth, and elevation. Focusing on typical static\nand dynamic objects on water surfaces, we label the camera images and radar\npoint clouds at pixel-level and point-level, respectively. In addition to basic\nperception tasks, such as object detection, instance segmentation and semantic\nsegmentation, we also provide annotations for free-space segmentation and\nwaterline segmentation. Leveraging the multi-task and multi-modal data, we\nconduct benchmark experiments on the uni-modality of radar and camera, as well\nas the fused modalities. Experimental results demonstrate that 4D radar-camera\nfusion can considerably improve the accuracy and robustness of perception on\nwater surfaces, especially in adverse lighting and weather conditions.\nWaterScenes dataset is public on https://waterscenes.github.io.\n","authors":["Shanliang Yao","Runwei Guan","Zhaodong Wu","Yi Ni","Zile Huang","Zixian Zhang","Yong Yue","Weiping Ding","Eng Gee Lim","Hyungjoon Seo","Ka Lok Man","Xiaohui Zhu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2307.06505v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07003v1","updated":"2023-08-14T08:39:09Z","published":"2023-08-14T08:39:09Z","title":"Deepbet: Fast brain extraction of T1-weighted MRI using Convolutional\n Neural Networks","summary":" Brain extraction in magnetic resonance imaging (MRI) data is an important\nsegmentation step in many neuroimaging preprocessing pipelines. Image\nsegmentation is one of the research fields in which deep learning had the\nbiggest impact in recent years enabling high precision segmentation with\nminimal compute. Consequently, traditional brain extraction methods are now\nbeing replaced by deep learning-based methods. Here, we used a unique dataset\ncomprising 568 T1-weighted (T1w) MR images from 191 different studies in\ncombination with cutting edge deep learning methods to build a fast,\nhigh-precision brain extraction tool called deepbet. deepbet uses LinkNet, a\nmodern UNet architecture, in a two stage prediction process. 
This increases its\nsegmentation performance, setting a novel state-of-the-art performance during\ncross-validation with a median Dice score (DSC) of 99.0% on unseen datasets,\noutperforming current state of the art models (DSC = 97.8% and DSC = 97.9%).\nWhile current methods are more sensitive to outliers, resulting in Dice scores\nas low as 76.5%, deepbet manages to achieve a Dice score of > 96.9% for all\nsamples. Finally, our model accelerates brain extraction by a factor of ~10\ncompared to current methods, enabling the processing of one image in ~2 seconds\non low level hardware.\n","authors":["Lukas Fisch","Stefan Zumdick","Carlotta Barkhau","Daniel Emden","Jan Ernsting","Ramona Leenings","Kelvin Sarink","Nils R. Winter","Benjamin Risse","Udo Dannlowski","Tim Hahn"],"pdf_url":"https://arxiv.org/pdf/2308.07003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01006v4","updated":"2023-08-14T08:28:32Z","published":"2023-08-02T08:29:44Z","title":"FusionAD: Multi-modality Fusion for Prediction and Planning Tasks of\n Autonomous Driving","summary":" Building a multi-modality multi-task neural network toward accurate and\nrobust performance is a de-facto standard in perception task of autonomous\ndriving. However, leveraging such data from multiple sensors to jointly\noptimize the prediction and planning tasks remains largely unexplored. In this\npaper, we present FusionAD, to the best of our knowledge, the first unified\nframework that fuse the information from two most critical sensors, camera and\nLiDAR, goes beyond perception task. Concretely, we first build a transformer\nbased multi-modality fusion network to effectively produce fusion based\nfeatures. In constrast to camera-based end-to-end method UniAD, we then\nestablish a fusion aided modality-aware prediction and status-aware planning\nmodules, dubbed FMSPnP that take advantages of multi-modality features. We\nconduct extensive experiments on commonly used benchmark nuScenes dataset, our\nFusionAD achieves state-of-the-art performance and surpassing baselines on\naverage 15% on perception tasks like detection and tracking, 10% on occupancy\nprediction accuracy, reducing prediction error from 0.708 to 0.389 in ADE score\nand reduces the collision rate from 0.31% to only 0.12%.\n","authors":["Tengju Ye","Wei Jing","Chunyong Hu","Shikun Huang","Lingping Gao","Fangzhen Li","Jingke Wang","Ke Guo","Wencong Xiao","Weibo Mao","Hang Zheng","Kun Li","Junbo Chen","Kaicheng Yu"],"pdf_url":"https://arxiv.org/pdf/2308.01006v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06998v1","updated":"2023-08-14T08:23:58Z","published":"2023-08-14T08:23:58Z","title":"Mutual Information-driven Triple Interaction Network for Efficient Image\n Dehazing","summary":" Multi-stage architectures have exhibited efficacy in image dehazing, which\nusually decomposes a challenging task into multiple more tractable sub-tasks\nand progressively estimates latent hazy-free images. Despite the remarkable\nprogress, existing methods still suffer from the following shortcomings: (1)\nlimited exploration of frequency domain information; (2) insufficient\ninformation interaction; (3) severe feature redundancy. To remedy these issues,\nwe propose a novel Mutual Information-driven Triple interaction Network\n(MITNet) based on spatial-frequency dual domain information and two-stage\narchitecture. To be specific, the first stage, named amplitude-guided haze\nremoval, aims to recover the amplitude spectrum of the hazy images for haze\nremoval. 
And the second stage, named phase-guided structure refined, devotes to\nlearning the transformation and refinement of the phase spectrum. To facilitate\nthe information exchange between two stages, an Adaptive Triple Interaction\nModule (ATIM) is developed to simultaneously aggregate cross-domain,\ncross-scale, and cross-stage features, where the fused features are further\nused to generate content-adaptive dynamic filters so that applying them to\nenhance global context representation. In addition, we impose the mutual\ninformation minimization constraint on paired scale encoder and decoder\nfeatures from both stages. Such an operation can effectively reduce information\nredundancy and enhance cross-stage feature complementarity. Extensive\nexperiments on multiple public datasets exhibit that our MITNet performs\nsuperior performance with lower model complexity.The code and models are\navailable at https://github.com/it-hao/MITNet.\n","authors":["Hao Shen","Zhong-Qiu Zhao","Yulun Zhang","Zhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.06998v1.pdf","comment":"Accepted in ACM MM 2023"},{"id":"http://arxiv.org/abs/2303.04980v2","updated":"2023-08-14T08:08:50Z","published":"2023-03-09T01:42:43Z","title":"Decision-BADGE: Decision-based Adversarial Batch Attack with Directional\n Gradient Estimation","summary":" The susceptibility of deep neural networks (DNNs) to adversarial examples has\nprompted an increase in the deployment of adversarial attacks. Image-agnostic\nuniversal adversarial perturbations (UAPs) are much more threatening, but many\nlimitations exist to implementing UAPs in real-world scenarios where only\nbinary decisions are returned. In this research, we propose Decision-BADGE, a\nnovel method to craft universal adversarial perturbations for executing\ndecision-based black-box attacks. To optimize perturbation with decisions, we\naddressed two challenges, namely the magnitude and the direction of the\ngradient. First, we use batch loss, differences from distributions of ground\ntruth, and accumulating decisions in batches to determine the magnitude of the\ngradient. This magnitude is applied in the direction of the revised\nsimultaneous perturbation stochastic approximation (SPSA) to update the\nperturbation. This simple yet efficient method can be easily extended to\nscore-based attacks as well as targeted attacks. Experimental validation across\nmultiple victim models demonstrates that the Decision-BADGE outperforms\nexisting attack methods, even image-specific and score-based attacks. In\nparticular, our proposed method shows a superior success rate with less\ntraining time. The research also shows that Decision-BADGE can successfully\ndeceive unseen victim models and accurately target specific classes.\n","authors":["Geunhyeok Yu","Minwoo Jeon","Hyoseok Hwang"],"pdf_url":"https://arxiv.org/pdf/2303.04980v2.pdf","comment":"9 pages (7 pages except for references), 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.16680v3","updated":"2023-08-14T07:59:36Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. 
Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, This paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v3.pdf","comment":"draft version"},{"id":"http://arxiv.org/abs/2303.05050v2","updated":"2023-08-14T07:50:52Z","published":"2023-03-09T05:54:42Z","title":"Lifelong-MonoDepth: Lifelong Learning for Multi-Domain Monocular Metric\n Depth Estimation","summary":" With the rapid advancements in autonomous driving and robot navigation, there\nis a growing demand for lifelong learning models capable of estimating metric\n(absolute) depth. Lifelong learning approaches potentially offer significant\ncost savings in terms of model training, data storage, and collection. However,\nthe quality of RGB images and depth maps is sensor-dependent, and depth maps in\nthe real world exhibit domain-specific characteristics, leading to variations\nin depth ranges. These challenges limit existing methods to lifelong learning\nscenarios with small domain gaps and relative depth map estimation. To\nfacilitate lifelong metric depth learning, we identify three crucial technical\nchallenges that require attention: i) developing a model capable of addressing\nthe depth scale variation through scale-aware depth learning, ii) devising an\neffective learning strategy to handle significant domain gaps, and iii)\ncreating an automated solution for domain-aware depth inference in practical\napplications. Based on the aforementioned considerations, in this paper, we\npresent i) a lightweight multi-head framework that effectively tackles the\ndepth scale imbalance, ii) an uncertainty-aware lifelong learning solution that\nadeptly handles significant domain gaps, and iii) an online domain-specific\npredictor selection method for real-time inference. Through extensive numerical\nstudies, we show that the proposed method can achieve good efficiency,\nstability, and plasticity, leading the benchmarks by 8% to 15%.\n","authors":["Junjie Hu","Chenyou Fan","Liguang Zhou","Qing Gao","Honghai Liu","Tin Lun Lam"],"pdf_url":"https://arxiv.org/pdf/2303.05050v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06985v1","updated":"2023-08-14T07:45:54Z","published":"2023-08-14T07:45:54Z","title":"PatchContrast: Self-Supervised Pre-training for 3D Object Detection","summary":" Accurately detecting objects in the environment is a key challenge for\nautonomous vehicles. However, obtaining annotated data for detection is\nexpensive and time-consuming. We introduce PatchContrast, a novel\nself-supervised point cloud pre-training framework for 3D object detection. We\npropose to utilize two levels of abstraction to learn discriminative\nrepresentation from unlabeled data: proposal-level and patch-level. 
The\nproposal-level aims at localizing objects in relation to their surroundings,\nwhereas the patch-level adds information about the internal connections between\nthe object's components, hence distinguishing between different objects based\non their individual components. We demonstrate how these levels can be\nintegrated into self-supervised pre-training for various backbones to enhance\nthe downstream 3D detection task. We show that our method outperforms existing\nstate-of-the-art models on three commonly-used 3D detection datasets.\n","authors":["Oren Shrout","Ori Nitzan","Yizhak Ben-Shabat","Ayellet Tal"],"pdf_url":"https://arxiv.org/pdf/2308.06985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02849v2","updated":"2023-08-14T07:38:32Z","published":"2023-04-06T03:45:07Z","title":"Logistic-Normal Likelihoods for Heteroscedastic Label Noise","summary":" A natural way of estimating heteroscedastic label noise in regression is to\nmodel the observed (potentially noisy) target as a sample from a normal\ndistribution, whose parameters can be learned by minimizing the negative\nlog-likelihood. This formulation has desirable loss attenuation properties, as\nit reduces the contribution of high-error examples. Intuitively, this behavior\ncan improve robustness against label noise by reducing overfitting. We propose\nan extension of this simple and probabilistic approach to classification that\nhas the same desirable loss attenuation properties. Furthermore, we discuss and\naddress some practical challenges of this extension. We evaluate the\neffectiveness of the method by measuring its robustness against label noise in\nclassification. We perform enlightening experiments exploring the inner\nworkings of the method, including sensitivity to hyperparameters, ablation\nstudies, and other insightful analyses.\n","authors":["Erik Englesson","Amir Mehrpanah","Hossein Azizpour"],"pdf_url":"https://arxiv.org/pdf/2304.02849v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02898v4","updated":"2023-08-14T07:37:27Z","published":"2023-06-05T14:06:24Z","title":"Towards Unified Text-based Person Retrieval: A Large-scale\n Multi-Attribute and Language Search Benchmark","summary":" In this paper, we introduce a large Multi-Attribute and Language Search\ndataset for text-based person retrieval, called MALS, and explore the\nfeasibility of performing pre-training on both attribute recognition and\nimage-text matching tasks in one stone. In particular, MALS contains 1,510,330\nimage-text pairs, which is about 37.5 times larger than prevailing CUHK-PEDES,\nand all images are annotated with 27 attributes. Considering the privacy\nconcerns and annotation costs, we leverage the off-the-shelf diffusion models\nto generate the dataset. To verify the feasibility of learning from the\ngenerated data, we develop a new joint Attribute Prompt Learning and Text\nMatching Learning (APTM) framework, considering the shared knowledge between\nattribute and text. As the name implies, APTM contains an attribute prompt\nlearning stream and a text matching learning stream. (1) The attribute prompt\nlearning leverages the attribute prompts for image-attribute alignment, which\nenhances the text matching learning. (2) The text matching learning facilitates\nthe representation learning on fine-grained details, and in turn, boosts the\nattribute prompt learning. 
Extensive experiments validate the effectiveness of\nthe pre-training on MALS, achieving state-of-the-art retrieval performance via\nAPTM on three challenging real-world benchmarks. In particular, APTM achieves a\nconsistent improvement of +6.96%, +7.68%, and +16.95% Recall@1 accuracy on\nCUHK-PEDES, ICFG-PEDES, and RSTPReid datasets by a clear margin, respectively.\n","authors":["Shuyu Yang","Yinan Zhou","Yaxiong Wang","Yujiao Wu","Li Zhu","Zhedong Zheng"],"pdf_url":"https://arxiv.org/pdf/2306.02898v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06983v1","updated":"2023-08-14T07:35:43Z","published":"2023-08-14T07:35:43Z","title":"pNNCLR: Stochastic Pseudo Neighborhoods for Contrastive Learning based\n Unsupervised Representation Learning Problems","summary":" Nearest neighbor (NN) sampling provides more semantic variations than\npre-defined transformations for self-supervised learning (SSL) based image\nrecognition problems. However, its performance is restricted by the quality of\nthe support set, which holds positive samples for the contrastive loss. In this\nwork, we show that the quality of the support set plays a crucial role in any\nnearest neighbor based method for SSL. We then provide a refined baseline\n(pNNCLR) to the nearest neighbor based SSL approach (NNCLR). To this end, we\nintroduce pseudo nearest neighbors (pNN) to control the quality of the support\nset, wherein, rather than sampling the nearest neighbors, we sample in the\nvicinity of hard nearest neighbors by varying the magnitude of the resultant\nvector and employing a stochastic sampling strategy to improve the performance.\nAdditionally, to stabilize the effects of uncertainty in NN-based learning, we\nemploy a smooth-weight-update approach for training the proposed network.\nEvaluation of the proposed method on multiple public image recognition and\nmedical image recognition datasets shows that it performs up to 8 percent\nbetter than the baseline nearest neighbor method, and is comparable to other\npreviously proposed SSL methods.\n","authors":["Momojit Biswas","Himanshu Buckchash","Dilip K. Prasad"],"pdf_url":"https://arxiv.org/pdf/2308.06983v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2301.08413v3","updated":"2023-08-14T07:25:27Z","published":"2023-01-20T03:39:35Z","title":"Chaos to Order: A Label Propagation Perspective on Source-Free Domain\n Adaptation","summary":" Source-free domain adaptation (SFDA), where only a pre-trained source model\nis used to adapt to the target distribution, is a more general approach to\nachieving domain adaptation in the real world. However, it can be challenging\nto capture the inherent structure of the target features accurately due to the\nlack of supervised information on the target domain. By analyzing the\nclustering performance of the target features, we show that they still contain\ncore features related to discriminative attributes but lack the collation of\nsemantic information. Inspired by this insight, we present Chaos to Order\n(CtO), a novel approach for SFDA that strives to constrain semantic credibility\nand propagate label information among target subpopulations. CtO divides the\ntarget data into inner and outlier samples based on the adaptive threshold of\nthe learning state, customizing the learning strategy to fit the data\nproperties best. Specifically, inner samples are utilized for learning\nintra-class structure thanks to their relatively well-clustered properties. 
The\nlow-density outlier samples are regularized by input consistency to achieve\nhigh accuracy with respect to the ground truth labels. In CtO, by employing\ndifferent learning strategies to propagate the labels from the inner local to\noutlier instances, it clusters the global samples from chaos to order. We\nfurther adaptively regulate the neighborhood affinity of the inner samples to\nconstrain the local semantic credibility. In theoretical and empirical\nanalyses, we demonstrate that our algorithm not only propagates from inner to\noutlier but also prevents local clustering from forming spurious clusters.\nEmpirical evidence demonstrates that CtO outperforms the state of the arts on\nthree public benchmarks: Office-31, Office-Home, and VisDA.\n","authors":["Chunwei Wu","Guitao Cao","Yan Li","Xidong Xi","Wenming Cao","Hong Wang"],"pdf_url":"https://arxiv.org/pdf/2301.08413v3.pdf","comment":"Accepted by ACM MM2023"},{"id":"http://arxiv.org/abs/2308.06974v1","updated":"2023-08-14T07:12:31Z","published":"2023-08-14T07:12:31Z","title":"A One Stop 3D Target Reconstruction and multilevel Segmentation Method","summary":" 3D object reconstruction and multilevel segmentation are fundamental to\ncomputer vision research. Existing algorithms usually perform 3D scene\nreconstruction and target objects segmentation independently, and the\nperformance is not fully guaranteed due to the challenge of the 3D\nsegmentation. Here we propose an open-source one stop 3D target reconstruction\nand multilevel segmentation framework (OSTRA), which performs segmentation on\n2D images, tracks multiple instances with segmentation labels in the image\nsequence, and then reconstructs labelled 3D objects or multiple parts with\nMulti-View Stereo (MVS) or RGBD-based 3D reconstruction methods. We extend\nobject tracking and 3D reconstruction algorithms to support continuous\nsegmentation labels to leverage the advances in the 2D image segmentation,\nespecially the Segment-Anything Model (SAM) which uses the pretrained neural\nnetwork without additional training for new scenes, for 3D object segmentation.\nOSTRA supports most popular 3D object models including point cloud, mesh and\nvoxel, and achieves high performance for semantic segmentation, instance\nsegmentation and part segmentation on several 3D datasets. It even surpasses\nthe manual segmentation in scenes with complex structures and occlusions. Our\nmethod opens up a new avenue for reconstructing 3D targets embedded with rich\nmulti-scale segmentation information in complex scenes. OSTRA is available from\nhttps://github.com/ganlab/OSTRA.\n","authors":["Jiexiong Xu","Weikun Zhao","Zhiyan Tang","Xiangchao Gan"],"pdf_url":"https://arxiv.org/pdf/2308.06974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14786v2","updated":"2023-08-14T07:06:03Z","published":"2023-07-27T11:28:33Z","title":"Towards Deeply Unified Depth-aware Panoptic Segmentation with\n Bi-directional Guidance Learning","summary":" Depth-aware panoptic segmentation is an emerging topic in computer vision\nwhich combines semantic and geometric understanding for more robust scene\ninterpretation. Recent works pursue unified frameworks to tackle this challenge\nbut mostly still treat it as two individual learning tasks, which limits their\npotential for exploring cross-domain information. We propose a deeply unified\nframework for depth-aware panoptic segmentation, which performs joint\nsegmentation and depth estimation both in a per-segment manner with identical\nobject queries. 
To narrow the gap between the two tasks, we further design a\ngeometric query enhancement method, which is able to integrate scene geometry\ninto object queries using latent representations. In addition, we propose a\nbi-directional guidance learning approach to facilitate cross-task feature\nlearning by taking advantage of their mutual relations. Our method sets the new\nstate of the art for depth-aware panoptic segmentation on both Cityscapes-DVPS\nand SemKITTI-DVPS datasets. Moreover, our guidance learning approach is shown\nto deliver performance improvement even under incomplete supervision labels.\n","authors":["Junwen He","Yifan Wang","Lijun Wang","Huchuan Lu","Jun-Yan He","Jin-Peng Lan","Bin Luo","Yifeng Geng","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2307.14786v2.pdf","comment":"to be published in ICCV 2023"},{"id":"http://arxiv.org/abs/2308.06964v1","updated":"2023-08-14T06:40:20Z","published":"2023-08-14T06:40:20Z","title":"How inter-rater variability relates to aleatoric and epistemic\n uncertainty: a case study with deep learning-based paraspinal muscle\n segmentation","summary":" Recent developments in deep learning (DL) techniques have led to great\nperformance improvement in medical image segmentation tasks, especially with\nthe latest Transformer model and its variants. While labels from fusing\nmulti-rater manual segmentations are often employed as ideal ground truths in\nDL model training, inter-rater variability due to factors such as training\nbias, image noise, and extreme anatomical variability can still affect the\nperformance and uncertainty of the resulting algorithms. Knowledge regarding\nhow inter-rater variability affects the reliability of the resulting DL\nalgorithms, a key element in clinical deployment, can help inform better\ntraining data construction and DL models, but has not been explored\nextensively. In this paper, we measure aleatoric and epistemic uncertainties\nusing test-time augmentation (TTA), test-time dropout (TTD), and deep ensemble\nto explore their relationship with inter-rater variability. Furthermore, we\ncompare UNet and TransUNet to study the impacts of Transformers on model\nuncertainty with two label fusion strategies. We conduct a case study using\nmulti-class paraspinal muscle segmentation from T2w MRIs. Our study reveals the\ninterplay between inter-rater variability and uncertainties, affected by\nchoices of label fusion strategies and DL models.\n","authors":["Parinaz Roshanzamir","Hassan Rivaz","Joshua Ahn","Hamza Mirza","Neda Naghdi","Meagan Anstruther","Michele C. Battié","Maryse Fortin","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.06964v1.pdf","comment":"Accepted in UNSURE MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.06962v1","updated":"2023-08-14T06:32:54Z","published":"2023-08-14T06:32:54Z","title":"Color-NeuS: Reconstructing Neural Implicit Surfaces with Color","summary":" The reconstruction of object surfaces from multi-view images or monocular\nvideo is a fundamental issue in computer vision. However, much of the recent\nresearch concentrates on reconstructing geometry through implicit or explicit\nmethods. In this paper, we shift our focus towards reconstructing mesh in\nconjunction with color. We remove the view-dependent color from neural volume\nrendering while retaining volume rendering performance through a relighting\nnetwork. Mesh is extracted from the signed distance function (SDF) network for\nthe surface, and color for each surface vertex is drawn from the global color\nnetwork. 
To evaluate our approach, we conceived an in-hand object scanning task\nfeaturing numerous occlusions and dramatic shifts in lighting conditions. We've\ngathered several videos for this task, and the results surpass those of any\nexisting methods capable of reconstructing mesh alongside color. Additionally,\nour method's performance was assessed using public datasets, including DTU,\nBlendedMVS, and OmniObject3D. The results indicated that our method performs\nwell across all these datasets. Project page:\nhttps://colmar-zlicheng.github.io/color_neus.\n","authors":["Licheng Zhong","Lixin Yang","Kailin Li","Haoyu Zhen","Mei Han","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2308.06962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06957v1","updated":"2023-08-14T06:22:49Z","published":"2023-08-14T06:22:49Z","title":"CEmb-SAM: Segment Anything Model with Condition Embedding for Joint\n Learning from Heterogeneous Datasets","summary":" Automated segmentation of ultrasound images can assist medical experts with\ndiagnostic and therapeutic procedures. Although using the common modality of\nultrasound, one typically needs separate datasets in order to segment, for\nexample, different anatomical structures or lesions with different levels of\nmalignancy. In this paper, we consider the problem of jointly learning from\nheterogeneous datasets so that the model can improve generalization abilities\nby leveraging the inherent variability among datasets. We merge the\nheterogeneous datasets into one dataset and refer to each component dataset as\na subgroup. We propose to train a single segmentation model so that the model\ncan adapt to each sub-group. For robust segmentation, we leverage the recently\nproposed Segment Anything model (SAM) in order to incorporate sub-group\ninformation into the model. We propose SAM with Condition Embedding block\n(CEmb-SAM) which encodes sub-group conditions and combines them with image\nembeddings from SAM. The conditional embedding block effectively adapts SAM to\neach image sub-group by incorporating dataset properties through learnable\nparameters for normalization. Experiments show that CEmb-SAM outperforms the\nbaseline methods on ultrasound image segmentation for peripheral nerves and\nbreast cancer. The experiments highlight the effectiveness of CEmb-SAM in\nlearning from heterogeneous datasets in medical image segmentation tasks.\n","authors":["Dongik Shin","Beomsuk Kim","Seungjun Baek"],"pdf_url":"https://arxiv.org/pdf/2308.06957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06954v1","updated":"2023-08-14T06:13:27Z","published":"2023-08-14T06:13:27Z","title":"Global Features are All You Need for Image Retrieval and Reranking","summary":" Utilizing a two-stage paradigm comprising coarse image retrieval and\nprecise reranking, a well-established image retrieval system is formed. It has\nbeen widely accepted for a long time that local features are imperative to the\nsubsequent stage - reranking, but this requires sizeable storage and computing\ncapacities. We, for the first time, propose an image retrieval paradigm\nleveraging global feature only to enable accurate and lightweight image\nretrieval for both coarse retrieval and reranking, thus the name - SuperGlobal.\nIt consists of several plug-in modules that can be easily integrated into an\nalready trained model, for both the coarse retrieval and reranking stages. This\nseries of approaches is inspired by the investigation into Generalized Mean\n(GeM) Pooling. 
Possessing these tools, we strive to defy the notion that local\nfeature is essential for a high-performance image retrieval paradigm. Extensive\nexperiments demonstrate substantial improvements compared to the state of the\nart in standard benchmarks. Notably, on the Revisited Oxford (ROxford)+1M Hard\ndataset, our single-stage results improve by 8.2% absolute, while our two-stage\nversion gain reaches 3.7% with a strong 7568X speedup. Furthermore, when the\nfull SuperGlobal is compared with the current single-stage state-of-the-art\nmethod, we achieve roughly 17% improvement with a minimal 0.005% time overhead.\nCode: https://github.com/ShihaoShao-GH/SuperGlobal.\n","authors":["Shihao Shao","Kaifeng Chen","Arjun Karpur","Qinghua Cui","Andre Araujo","Bingyi Cao"],"pdf_url":"https://arxiv.org/pdf/2308.06954v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.06952v1","updated":"2023-08-14T06:04:50Z","published":"2023-08-14T06:04:50Z","title":"Channel-Wise Contrastive Learning for Learning with Noisy Labels","summary":" In real-world datasets, noisy labels are pervasive. The challenge of learning\nwith noisy labels (LNL) is to train a classifier that discerns the actual\nclasses from given instances. For this, the model must identify features\nindicative of the authentic labels. While research indicates that genuine label\ninformation is embedded in the learned features of even inaccurately labeled\ndata, it's often intertwined with noise, complicating its direct application.\nAddressing this, we introduce channel-wise contrastive learning (CWCL). This\nmethod distinguishes authentic label information from noise by undertaking\ncontrastive learning across diverse channels. Unlike conventional instance-wise\ncontrastive learning (IWCL), CWCL tends to yield more nuanced and resilient\nfeatures aligned with the authentic labels. Our strategy is twofold: firstly,\nusing CWCL to extract pertinent features to identify cleanly labeled samples,\nand secondly, progressively fine-tuning using these samples. Evaluations on\nseveral benchmark datasets validate our method's superiority over existing\napproaches.\n","authors":["Hui Kang","Sheng Liu","Huaxi Huang","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.06952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06948v1","updated":"2023-08-14T05:55:38Z","published":"2023-08-14T05:55:38Z","title":"MixBCT: Towards Self-Adapting Backward-Compatible Training","summary":" The exponential growth of data, alongside advancements in model structures\nand loss functions, has necessitated the enhancement of image retrieval systems\nthrough the utilization of new models with superior feature embeddings.\nHowever, the expensive process of updating the old retrieval database by\nreplacing embeddings poses a challenge. As a solution, backward-compatible\ntraining can be employed to avoid the necessity of updating old retrieval\ndatasets. While previous methods achieved backward compatibility by aligning\nprototypes of the old model, they often overlooked the distribution of the old\nfeatures, thus limiting their effectiveness when the old model's low quality\nleads to a weakly discriminative feature distribution. On the other hand,\ninstance-based methods like L2 regression take into account the distribution of\nold features but impose strong constraints on the performance of the new model\nitself. 
In this paper, we propose MixBCT, a simple yet highly effective\nbackward-compatible training method that serves as a unified framework for old\nmodels of varying qualities. Specifically, we summarize four constraints that\nare essential for ensuring backward compatibility in an ideal scenario, and we\nconstruct a single loss function to facilitate backward-compatible training.\nOur approach adaptively adjusts the constraint domain for new features based on\nthe distribution of the old embeddings. We conducted extensive experiments on\nthe large-scale face recognition datasets MS1Mv3 and IJB-C to verify the\neffectiveness of our method. The experimental results clearly demonstrate its\nsuperiority over previous methods. Code is available at\nhttps://github.com/yuleung/MixBCT\n","authors":["Yu Liang","Shiliang Zhang","Yaowei Wang","Sheng Xiao","Kenli Li","Xiaoyu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.06948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06947v1","updated":"2023-08-14T05:54:32Z","published":"2023-08-14T05:54:32Z","title":"Knowing Where to Focus: Event-aware Transformer for Video Grounding","summary":" Recent DETR-based video grounding models have made the model directly predict\nmoment timestamps without any hand-crafted components, such as a pre-defined\nproposal or non-maximum suppression, by learning moment queries. However, their\ninput-agnostic moment queries inevitably overlook an intrinsic temporal\nstructure of a video, providing limited positional information. In this paper,\nwe formulate an event-aware dynamic moment query to enable the model to take\nthe input-specific content and positional information of the video into\naccount. To this end, we present two levels of reasoning: 1) Event reasoning\nthat captures distinctive event units constituting a given video using a slot\nattention mechanism; and 2) moment reasoning that fuses the moment queries with\na given sentence through a gated fusion transformer layer and learns\ninteractions between the moment queries and video-sentence representations to\npredict moment timestamps. Extensive experiments demonstrate the effectiveness\nand efficiency of the event-aware dynamic moment queries, outperforming\nstate-of-the-art approaches on several video grounding benchmarks.\n","authors":["Jinhyun Jang","Jungin Park","Jin Kim","Hyeongjun Kwon","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2308.06947v1.pdf","comment":"ICCV 2023. Code is available at https://github.com/jinhyunj/EaTR"},{"id":"http://arxiv.org/abs/2308.06945v1","updated":"2023-08-14T05:37:07Z","published":"2023-08-14T05:37:07Z","title":"Semantic-aware Network for Aerial-to-Ground Image Synthesis","summary":" Aerial-to-ground image synthesis is an emerging and challenging problem that\naims to synthesize a ground image from an aerial image. Due to the highly\ndifferent layout and object representation between the aerial and ground\nimages, existing approaches usually fail to transfer the components of the\naerial scene into the ground scene. In this paper, we propose a novel framework\nto explore the challenges by imposing enhanced structural alignment and\nsemantic awareness. We introduce a novel semantic-attentive feature\ntransformation module that allows to reconstruct the complex geographic\nstructures by aligning the aerial feature to the ground layout. Furthermore, we\npropose semantic-aware loss functions by leveraging a pre-trained segmentation\nnetwork. 
The network is enforced to synthesize realistic objects across various\nclasses by separately calculating losses for different classes and balancing\nthem. Extensive experiments including comparisons with previous methods and\nablation studies show the effectiveness of the proposed framework both\nqualitatively and quantitatively.\n","authors":["Jinhyun Jang","Taeyong Song","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2308.06945v1.pdf","comment":"ICIP 2021. Code is available at https://github.com/jinhyunj/SANet"},{"id":"http://arxiv.org/abs/2308.06944v1","updated":"2023-08-14T05:34:36Z","published":"2023-08-14T05:34:36Z","title":"One-shot lip-based biometric authentication: extending behavioral\n features with authentication phrase information","summary":" Lip-based biometric authentication (LBBA) is an authentication method based\non a person's lip movements during speech in the form of video data captured by\na camera sensor. LBBA can utilize both physical and behavioral characteristics\nof lip movements without requiring any additional sensory equipment apart from\nan RGB camera. State-of-the-art (SOTA) approaches use one-shot learning to\ntrain deep siamese neural networks which produce an embedding vector out of\nthese features. Embeddings are further used to compute the similarity between\nan enrolled user and a user being authenticated. A flaw of these approaches is\nthat they model behavioral features as style-of-speech without relation to what\nis being said. This makes the system vulnerable to video replay attacks of the\nclient speaking any phrase. To solve this problem we propose a one-shot\napproach which models behavioral features to discriminate against what is being\nsaid in addition to style-of-speech. We achieve this by customizing the GRID\ndataset to obtain required triplets and training a siamese neural network based\non 3D convolutions and recurrent neural network layers. A custom triplet loss\nfor batch-wise hard-negative mining is proposed. Obtained results using an\nopen-set protocol are 3.2% FAR and 3.8% FRR on the test set of the customized\nGRID dataset. Additional analysis of the results was done to quantify the\ninfluence and discriminatory power of behavioral and physical features for\nLBBA.\n","authors":["Brando Koch","Ratko Grbić"],"pdf_url":"https://arxiv.org/pdf/2308.06944v1.pdf","comment":"28 pages, 10 figures, 7 tables"},{"id":"http://arxiv.org/abs/2210.13869v4","updated":"2023-08-14T05:33:48Z","published":"2022-10-25T09:45:49Z","title":"A jet tagging algorithm of graph network with HaarPooling message\n passing","summary":" Recently methods of graph neural networks (GNNs) have been applied to solving\nthe problems in high energy physics (HEP) and have shown its great potential\nfor quark-gluon tagging with graph representation of jet events. In this paper,\nwe introduce an approach of GNNs combined with a HaarPooling operation to\nanalyze the events, called HaarPooling Message Passing neural network (HMPNet).\nIn HMPNet, HaarPooling not only extracts the features of graph, but embeds\nadditional information obtained by clustering of k-means of different particle\nfeatures. We construct Haarpooling from five different features: absolute\nenergy $\\log E$, transverse momentum $\\log p_T$, relative coordinates\n$(\\Delta\\eta,\\Delta\\phi)$, the mixed ones $(\\log E, \\log p_T)$ and $(\\log E,\n\\log p_T, \\Delta\\eta,\\Delta\\phi)$. 
The results show that an appropriate\nselection of information for HaarPooling enhances the accuracy of quark-gluon\ntagging, as adding extra information of $\\log p_T$ to the HMPNet outperforms\nall the others, whereas adding relative coordinates information\n$(\\Delta\\eta,\\Delta\\phi)$ is not very effective. This implies that adding\neffective particle features via HaarPooling can achieve much better results\nthan a pure message passing neural network (MPNN) alone can, which\ndemonstrates a significant improvement in feature extraction via the pooling\nprocess. Finally, we compare the HMPNet study, ordering by $p_T$, with other\nstudies and prove that the HMPNet is also a good choice of GNN algorithms for\njet tagging.\n","authors":["Fei Ma","Feiyi Liu","Wei Li"],"pdf_url":"https://arxiv.org/pdf/2210.13869v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.00847v2","updated":"2023-08-14T05:22:41Z","published":"2022-08-01T13:34:33Z","title":"MAFW: A Large-scale, Multi-modal, Compound Affective Database for\n Dynamic Facial Expression Recognition in the Wild","summary":" Dynamic facial expression recognition (FER) databases provide important data\nsupport for affective computing and applications. However, most FER databases\nare annotated with several basic mutually exclusive emotional categories and\ncontain only one modality, e.g., videos. The monotonous labels and modality\ncannot accurately imitate human emotions and fulfill applications in the real\nworld. In this paper, we propose MAFW, a large-scale multi-modal compound\naffective database with 10,045 video-audio clips in the wild. Each clip is\nannotated with a compound emotional category and a couple of sentences that\ndescribe the subjects' affective behaviors in the clip. For the compound\nemotion annotation, each clip is categorized into one or more of the 11\nwidely-used emotions, i.e., anger, disgust, fear, happiness, neutral, sadness,\nsurprise, contempt, anxiety, helplessness, and disappointment. To ensure high\nquality of the labels, we filter out the unreliable annotations by an\nExpectation Maximization (EM) algorithm, and then obtain 11 single-label\nemotion categories and 32 multi-label emotion categories. To the best of our\nknowledge, MAFW is the first in-the-wild multi-modal database annotated with\ncompound emotion annotations and emotion-related captions. Additionally, we\nalso propose a novel Transformer-based expression snippet feature learning\nmethod to recognize the compound emotions leveraging the expression-change\nrelations among different emotions and modalities. Extensive experiments on\nMAFW database show the advantages of the proposed method over other\nstate-of-the-art methods for both uni- and multi-modal FER. Our MAFW database\nis publicly available from https://mafw-database.github.io/MAFW.\n","authors":["Yuanyuan Liu","Wei Dai","Chuanxu Feng","Wenbin Wang","Guanghao Yin","Jiabei Zeng","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2208.00847v2.pdf","comment":"This paper has been accepted by ACM MM'22"},{"id":"http://arxiv.org/abs/2303.08597v5","updated":"2023-08-14T04:44:50Z","published":"2023-03-15T13:07:21Z","title":"Aerial-Ground Person Re-ID","summary":" Person re-ID matches persons across multiple non-overlapping cameras. Despite\nthe increasing deployment of airborne platforms in surveillance, existing\nperson re-ID benchmarks focus on ground-ground matching, with very\nlimited efforts on aerial-aerial matching. 
We propose a new benchmark dataset -\nAG-ReID, which performs person re-ID matching in a new setting: across aerial\nand ground cameras. Our dataset contains 21,983 images of 388 identities and 15\nsoft attributes for each identity. The data was collected by a UAV flying at\naltitudes between 15 to 45 meters and a ground-based CCTV camera on a\nuniversity campus. Our dataset presents a novel elevated-viewpoint challenge\nfor person re-ID due to the significant difference in person appearance across\nthese cameras. We propose an explainable algorithm to guide the person re-ID\nmodel's training with soft attributes to address this challenge. Experiments\ndemonstrate the efficacy of our method on the aerial-ground person re-ID task.\nThe dataset will be published and the baseline codes will be open-sourced at\nhttps://github.com/huynguyen792/AG-ReID to facilitate research in this area.\n","authors":["Huy Nguyen","Kien Nguyen","Sridha Sridharan","Clinton Fookes"],"pdf_url":"https://arxiv.org/pdf/2303.08597v5.pdf","comment":"Published on IEEE International Conference on Multimedia and Expo\n 2023 (ICME2023)"},{"id":"http://arxiv.org/abs/2308.06933v1","updated":"2023-08-14T04:22:36Z","published":"2023-08-14T04:22:36Z","title":"Radiomics-Informed Deep Learning for Classification of Atrial\n Fibrillation Sub-Types from Left-Atrium CT Volumes","summary":" Atrial Fibrillation (AF) is characterized by rapid, irregular heartbeats, and\ncan lead to fatal complications such as heart failure. The disease is divided\ninto two sub-types based on severity, which can be automatically classified\nthrough CT volumes for disease screening of severe cases. However, existing\nclassification approaches rely on generic radiomic features that may not be\noptimal for the task, whilst deep learning methods tend to over-fit to the\nhigh-dimensional volume inputs. In this work, we propose a novel\nradiomics-informed deep-learning method, RIDL, that combines the advantages of\ndeep learning and radiomic approaches to improve AF sub-type classification.\nUnlike existing hybrid techniques that mostly rely on na\\\"ive feature\nconcatenation, we observe that radiomic feature selection methods can serve as\nan information prior, and propose supplementing low-level deep neural network\n(DNN) features with locally computed radiomic features. This reduces DNN\nover-fitting and allows local variations between radiomic features to be better\ncaptured. Furthermore, we ensure complementary information is learned by deep\nand radiomic features by designing a novel feature de-correlation loss.\nCombined, our method addresses the limitations of deep learning and radiomic\napproaches and outperforms state-of-the-art radiomic, deep learning, and hybrid\napproaches, achieving 86.9% AUC for the AF sub-type classification task. 
Code\nis available at https://github.com/xmed-lab/RIDL.\n","authors":["Weihang Dai","Xiaomeng Li","Taihui Yu","Di Zhao","Jun Shen","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.06933v1.pdf","comment":"Accepted by MICCAI23"},{"id":"http://arxiv.org/abs/2308.06926v1","updated":"2023-08-14T04:10:45Z","published":"2023-08-14T04:10:45Z","title":"OpenGCD: Assisting Open World Recognition with Generalized Category\n Discovery","summary":" A desirable open world recognition (OWR) system requires performing three\ntasks: (1) Open set recognition (OSR), i.e., classifying the known (classes\nseen during training) and rejecting the unknown (unseen$/$novel classes)\nonline; (2) Grouping and labeling these unknown as novel known classes; (3)\nIncremental learning (IL), i.e., continual learning these novel classes and\nretaining the memory of old classes. Ideally, all of these steps should be\nautomated. However, existing methods mostly assume that the second task is\ncompletely done manually. To bridge this gap, we propose OpenGCD that combines\nthree key ideas to solve the above problems sequentially: (a) We score the\norigin of instances (unknown or specifically known) based on the uncertainty of\nthe classifier's prediction; (b) For the first time, we introduce generalized\ncategory discovery (GCD) techniques in OWR to assist humans in grouping\nunlabeled data; (c) For the smooth execution of IL and GCD, we retain an equal\nnumber of informative exemplars for each class with diversity as the goal.\nMoreover, we present a new performance evaluation metric for GCD called\nharmonic clustering accuracy. Experiments on two standard classification\nbenchmarks and a challenging dataset demonstrate that OpenGCD not only offers\nexcellent compatibility but also substantially outperforms other baselines.\nCode: https://github.com/Fulin-Gao/OpenGCD.\n","authors":["Fulin Gao","Weimin Zhong","Zhixing Cao","Xin Peng","Zhi Li"],"pdf_url":"https://arxiv.org/pdf/2308.06926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06925v1","updated":"2023-08-14T04:03:51Z","published":"2023-08-14T04:03:51Z","title":"CBA: Improving Online Continual Learning via Continual Bias Adaptor","summary":" Online continual learning (CL) aims to learn new knowledge and consolidate\npreviously learned knowledge from non-stationary data streams. Due to the\ntime-varying training setting, the model learned from a changing distribution\neasily forgets the previously learned knowledge and biases toward the newly\nreceived task. To address this problem, we propose a Continual Bias Adaptor\n(CBA) module to augment the classifier network to adapt to catastrophic\ndistribution change during training, such that the classifier network is able\nto learn a stable consolidation of previously learned tasks. In the testing\nstage, CBA can be removed which introduces no additional computation cost and\nmemory overhead. 
We theoretically reveal the reason why the proposed method can\neffectively alleviate catastrophic distribution shifts, and empirically\ndemonstrate its effectiveness through extensive experiments based on four\nrehearsal-based baselines and three public continual learning benchmarks.\n","authors":["Quanziang Wang","Renzhen Wang","Yichen Wu","Xixi Jia","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2308.06925v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.06909v1","updated":"2023-08-14T03:11:17Z","published":"2023-08-14T03:11:17Z","title":"Hierarchy Flow For High-Fidelity Image-to-Image Translation","summary":" Image-to-image (I2I) translation comprises a wide spectrum of tasks. Here we\ndivide this problem into three levels: strong-fidelity translation,\nnormal-fidelity translation, and weak-fidelity translation, indicating the\nextent to which the content of the original image is preserved. Although\nexisting methods achieve good performance in weak-fidelity translation, they\nfail to fully preserve the content in both strong- and normal-fidelity tasks,\ne.g. sim2real, style transfer and low-level vision. In this work, we propose\nHierarchy Flow, a novel flow-based model to achieve better content preservation\nduring translation. Specifically, 1) we first unveil the drawbacks of standard\nflow-based models when applied to I2I translation. 2) Next, we propose a new\ndesign, namely hierarchical coupling for reversible feature transformation and\nmulti-scale modeling, to constitute Hierarchy Flow. 3) Finally, we present a\ndedicated aligned-style loss for a better trade-off between content\npreservation and stylization during translation. Extensive experiments on a\nwide range of I2I translation benchmarks demonstrate that our approach achieves\nstate-of-the-art performance, with convincing advantages in both strong- and\nnormal-fidelity tasks. Code and models will be at\nhttps://github.com/WeichenFan/HierarchyFlow.\n","authors":["Weichen Fan","Jinghuan Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.06909v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2207.01909"},{"id":"http://arxiv.org/abs/2308.06905v1","updated":"2023-08-14T02:53:20Z","published":"2023-08-14T02:53:20Z","title":"The Michigan Robotics Undergraduate Curriculum: Defining the Discipline\n of Robotics for Equity and Excellence","summary":" The Robotics Major at the University of Michigan was successfully launched in\nthe 2022-23 academic year as an innovative step forward to better serve\nstudents, our communities, and our society. Building on our guiding principle\nof \"Robotics with Respect\" and our larger Robotics Pathways model, the Michigan\nRobotics Major was designed to define robotics as a true academic discipline\nwith both equity and excellence as our highest priorities. Understanding that\ntalent is equally distributed but opportunity is not, the Michigan Robotics\nMajor has embraced an adaptable curriculum that is accessible through a\ndiversity of student pathways and enables successful and sustained career-long\nparticipation in robotics, AI, and automation professions. The results after\nour planning efforts (2019-22) and first academic year (2022-23) have been\nhighly encouraging: more than 100 students declared Robotics as their major,\ncompletion of the Robotics major by our first two graduates, soaring\nenrollments in our Robotics classes, thriving partnerships with Historically\nBlack Colleges and Universities. 
This document provides our original curricular\nproposal for the Robotics Undergraduate Program at the University of Michigan,\nsubmitted to the Michigan Association of State Universities in April 2022 and\napproved in June 2022. The dissemination of our program design is in the spirit\nof continued growth for higher education towards realizing equity and\nexcellence.\n The most recent version of this document is also available on Google Docs\nthrough this link: https://ocj.me/robotics_major\n","authors":["Odest Chadwicke Jenkins","Jessy Grizzle","Ella Atkins","Leia Stirling","Elliott Rouse","Mark Guzdial","Damen Provost","Kimberly Mann","Joanna Millunchick"],"pdf_url":"https://arxiv.org/pdf/2308.06905v1.pdf","comment":"49 pages, approximately 25 figures"},{"id":"http://arxiv.org/abs/2308.04669v2","updated":"2023-08-14T02:52:02Z","published":"2023-08-09T02:27:23Z","title":"A General Implicit Framework for Fast NeRF Composition and Rendering","summary":" A variety of Neural Radiance Fields (NeRF) methods have recently achieved\nremarkable success in high render speed. However, current accelerating methods\nare specialized and incompatible with various implicit methods, preventing\nreal-time composition over various types of NeRF works. Because NeRF relies on\nsampling along rays, it is possible to provide general guidance for\nacceleration. To that end, we propose a general implicit pipeline for composing\nNeRF objects quickly. Our method enables the casting of dynamic shadows within\nor between objects using analytical light sources while allowing multiple NeRF\nobjects to be seamlessly placed and rendered together with any arbitrary rigid\ntransformations. Mainly, our work introduces a new surface representation known\nas Neural Depth Fields (NeDF) that quickly determines the spatial relationship\nbetween objects by allowing direct intersection computation between rays and\nimplicit surfaces. It leverages an intersection neural network to query NeRF\nfor acceleration instead of depending on an explicit spatial structure.Our\nproposed method is the first to enable both the progressive and interactive\ncomposition of NeRF objects. Additionally, it also serves as a previewing\nplugin for a range of existing NeRF works.\n","authors":["Xinyu Gao","Ziyi Yang","Yunlu Zhao","Yuxiang Sun","Xiaogang Jin","Changqing Zou"],"pdf_url":"https://arxiv.org/pdf/2308.04669v2.pdf","comment":"7 pages for main content"},{"id":"http://arxiv.org/abs/2308.06904v1","updated":"2023-08-14T02:51:34Z","published":"2023-08-14T02:51:34Z","title":"Exploring Lightweight Hierarchical Vision Transformers for Efficient\n Visual Tracking","summary":" Transformer-based visual trackers have demonstrated significant progress\nowing to their superior modeling capabilities. However, existing trackers are\nhampered by low speed, limiting their applicability on devices with limited\ncomputational power. To alleviate this problem, we propose HiT, a new family of\nefficient tracking models that can run at high speed on different devices while\nretaining high performance. The central idea of HiT is the Bridge Module, which\nbridges the gap between modern lightweight transformers and the tracking\nframework. The Bridge Module incorporates the high-level information of deep\nfeatures into the shallow large-resolution features. In this way, it produces\nbetter features for the tracking head. 
We also propose a novel dual-image\nposition encoding technique that simultaneously encodes the position\ninformation of both the search region and template images. The HiT model\nachieves promising speed with competitive performance. For instance, it runs at\n61 frames per second (fps) on the Nvidia Jetson AGX edge device. Furthermore,\nHiT attains 64.6% AUC on the LaSOT benchmark, surpassing all previous efficient\ntrackers.\n","authors":["Ben Kang","Xin Chen","Dong Wang","Houwen Peng","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.06904v1.pdf","comment":"This paper was accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.06897v1","updated":"2023-08-14T02:26:49Z","published":"2023-08-14T02:26:49Z","title":"Orthogonal Temporal Interpolation for Zero-Shot Video Recognition","summary":" Zero-shot video recognition (ZSVR) is a task that aims to recognize video\ncategories that have not been seen during the model training process. Recently,\nvision-language models (VLMs) pre-trained on large-scale image-text pairs have\ndemonstrated impressive transferability for ZSVR. To make VLMs applicable to\nthe video domain, existing methods often use an additional temporal learning\nmodule after the image-level encoder to learn the temporal relationships among\nvideo frames. Unfortunately, for video from unseen categories, we observe an\nabnormal phenomenon where the model that uses spatial-temporal feature performs\nmuch worse than the model that removes temporal learning module and uses only\nspatial feature. We conjecture that improper temporal modeling on video\ndisrupts the spatial feature of the video. To verify our hypothesis, we propose\nFeature Factorization to retain the orthogonal temporal feature of the video\nand use interpolation to construct refined spatial-temporal feature. The model\nusing appropriately refined spatial-temporal feature performs better than the\none using only spatial feature, which verifies the effectiveness of the\northogonal temporal feature for the ZSVR task. Therefore, an Orthogonal\nTemporal Interpolation module is designed to learn a better refined\nspatial-temporal video feature during training. Additionally, a Matching Loss\nis introduced to improve the quality of the orthogonal temporal feature. We\npropose a model called OTI for ZSVR by employing orthogonal temporal\ninterpolation and the matching loss based on VLMs. The ZSVR accuracies on\npopular video datasets (i.e., Kinetics-600, UCF101 and HMDB51) show that OTI\noutperforms the previous state-of-the-art method by a clear margin.\n","authors":["Yan Zhu","Junbao Zhuo","Bin Ma","Jiajia Geng","Xiaoming Wei","Xiaolin Wei","Shuhui Wang"],"pdf_url":"https://arxiv.org/pdf/2308.06897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06889v1","updated":"2023-08-14T02:02:56Z","published":"2023-08-14T02:02:56Z","title":"Robustness Stress Testing in Medical Image Classification","summary":" Deep neural networks have shown impressive performance for image-based\ndisease detection. Performance is commonly evaluated through clinical\nvalidation on independent test sets to demonstrate clinically acceptable\naccuracy. Reporting good performance metrics on test sets, however, is not\nalways a sufficient indication of the generalizability and robustness of an\nalgorithm. In particular, when the test data is drawn from the same\ndistribution as the training data, the iid test set performance can be an\nunreliable estimate of the accuracy on new data. 
In this paper, we employ\nstress testing to assess model robustness and subgroup performance disparities\nin disease detection models. We design progressive stress testing using five\ndifferent bidirectional and unidirectional image perturbations with six\ndifferent severity levels. As a use case, we apply stress tests to measure the\nrobustness of disease detection models for chest X-ray and skin lesion images,\nand demonstrate the importance of studying class and domain-specific model\nbehaviour. Our experiments indicate that some models may yield more robust and\nequitable performance than others. We also find that pretraining\ncharacteristics play an important role in downstream robustness. We conclude\nthat progressive stress testing is a viable and important tool and should\nbecome standard practice in the clinical validation of image-based disease\ndetection models.\n","authors":["Mobarakol Islam","Zeju Li","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2308.06889v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2308.06887v1","updated":"2023-08-14T01:47:26Z","published":"2023-08-14T01:47:26Z","title":"Robustified ANNs Reveal Wormholes Between Human Category Percepts","summary":" The visual object category reports of artificial neural networks (ANNs) are\nnotoriously sensitive to tiny, adversarial image perturbations. Because human\ncategory reports (aka human percepts) are thought to be insensitive to those\nsame small-norm perturbations -- and locally stable in general -- this argues\nthat ANNs are incomplete scientific models of human visual perception.\nConsistent with this, we show that when small-norm image perturbations are\ngenerated by standard ANN models, human object category percepts are indeed\nhighly stable. However, in this very same \"human-presumed-stable\" regime, we\nfind that robustified ANNs reliably discover low-norm image perturbations that\nstrongly disrupt human percepts. These previously undetectable human perceptual\ndisruptions are massive in amplitude, approaching the same level of sensitivity\nseen in robustified ANNs. Further, we show that robustified ANNs support\nprecise perceptual state interventions: they guide the construction of low-norm\nimage perturbations that strongly alter human category percepts toward specific\nprescribed percepts. These observations suggest that for arbitrary starting\npoints in image space, there exists a set of nearby \"wormholes\", each leading\nthe subject from their current category perceptual state into a semantically\nvery different state. Moreover, contemporary ANN models of biological visual\nprocessing are now accurate enough to consistently guide us to those portals.\n","authors":["Guy Gaziv","Michael J. Lee","James J. DiCarlo"],"pdf_url":"https://arxiv.org/pdf/2308.06887v1.pdf","comment":"*Equal contribution"},{"id":"http://arxiv.org/abs/2308.06879v1","updated":"2023-08-14T01:24:18Z","published":"2023-08-14T01:24:18Z","title":"Towards Open-Set Test-Time Adaptation Utilizing the Wisdom of Crowds in\n Entropy Minimization","summary":" Test-time adaptation (TTA) methods, which generally rely on the model's\npredictions (e.g., entropy minimization) to adapt the source pretrained model\nto the unlabeled target domain, suffer from noisy signals originating from 1)\nincorrect or 2) open-set predictions. Long-term stable adaptation is hampered\nby such noisy signals, so training models without such error accumulation is\ncrucial for practical TTA. 
To address these issues, including open-set TTA, we\npropose a simple yet effective sample selection method inspired by the\nfollowing crucial empirical finding. While entropy minimization compels the\nmodel to increase the probability of its predicted label (i.e., confidence\nvalues), we found that noisy samples rather show decreased confidence values.\nTo be more specific, entropy minimization attempts to raise the confidence\nvalues of an individual sample's prediction, but individual confidence values\nmay rise or fall due to the influence of signals from numerous other\npredictions (i.e., wisdom of crowds). Due to this fact, noisy signals\nmisaligned with such 'wisdom of crowds', generally found in the correct\nsignals, fail to raise the individual confidence values of wrong samples,\ndespite attempts to increase them. Based on such findings, we filter out the\nsamples whose confidence values are lower in the adapted model than in the\noriginal model, as they are likely to be noisy. Our method is widely applicable\nto existing TTA methods and improves their long-term adaptation performance in\nboth image classification (e.g., 49.4% reduced error rates with TENT) and\nsemantic segmentation (e.g., 11.7% gain in mIoU with TENT).\n","authors":["Jungsoo Lee","Debasmit Das","Jaegul Choo","Sungha Choi"],"pdf_url":"https://arxiv.org/pdf/2308.06879v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2210.07346v2","updated":"2023-08-14T01:07:38Z","published":"2022-10-13T20:39:21Z","title":"An Embarrassingly Simple Backdoor Attack on Self-supervised Learning","summary":" As a new paradigm in machine learning, self-supervised learning (SSL) is\ncapable of learning high-quality representations of complex data without\nrelying on labels. In addition to eliminating the need for labeled data,\nresearch has found that SSL improves the adversarial robustness over supervised\nlearning since lacking labels makes it more challenging for adversaries to\nmanipulate model predictions. However, the extent to which this robustness\nsuperiority generalizes to other types of attacks remains an open question.\n We explore this question in the context of backdoor attacks. Specifically, we\ndesign and evaluate CTRL, an embarrassingly simple yet highly effective\nself-supervised backdoor attack. By only polluting a tiny fraction of training\ndata (<= 1%) with indistinguishable poisoning samples, CTRL causes any\ntrigger-embedded input to be misclassified to the adversary's designated class\nwith a high probability (>= 99%) at inference time. Our findings suggest that\nSSL and supervised learning are comparably vulnerable to backdoor attacks. More\nimportantly, through the lens of CTRL, we study the inherent vulnerability of\nSSL to backdoor attacks. With both empirical and analytical evidence, we reveal\nthat the representation invariance property of SSL, which benefits adversarial\nrobustness, may also be the very reason making \\ssl highly susceptible to\nbackdoor attacks. 
Our findings also imply that the existing defenses against\nsupervised backdoor attacks are not easily retrofitted to the unique\nvulnerability of SSL.\n","authors":["Changjiang Li","Ren Pang","Zhaohan Xi","Tianyu Du","Shouling Ji","Yuan Yao","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2210.07346v2.pdf","comment":"The 2023 International Conference on Computer Vision (ICCV '23)"},{"id":"http://arxiv.org/abs/2308.06869v1","updated":"2023-08-14T00:42:03Z","published":"2023-08-14T00:42:03Z","title":"Shape-Graph Matching Network (SGM-net): Registration for Statistical\n Shape Analysis","summary":" This paper focuses on the statistical analysis of shapes of data objects\ncalled shape graphs, a set of nodes connected by articulated curves with\narbitrary shapes. A critical need here is a constrained registration of points\n(nodes to nodes, edges to edges) across objects. This, in turn, requires\noptimization over the permutation group, made challenging by differences in\nnodes (in terms of numbers, locations) and edges (in terms of shapes,\nplacements, and sizes) across objects. This paper tackles this registration\nproblem using a novel neural-network architecture and involves an unsupervised\nloss function developed using the elastic shape metric for curves. This\narchitecture results in (1) state-of-the-art matching performance and (2) an\norder of magnitude reduction in the computational cost relative to baseline\napproaches. We demonstrate the effectiveness of the proposed approach using\nboth simulated data and real-world 2D and 3D shape graphs. Code and data will\nbe made publicly available after review to foster research.\n","authors":["Shenyuan Liang","Mauricio Pamplona Segundo","Sathyanarayanan N. Aakur","Sudeep Sarkar","Anuj Srivastava"],"pdf_url":"https://arxiv.org/pdf/2308.06869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.00164v2","updated":"2023-08-14T00:16:23Z","published":"2022-10-31T22:12:48Z","title":"Agent-Controller Representations: Principled Offline RL with Rich\n Exogenous Information","summary":" Learning to control an agent from data collected offline in a rich\npixel-based visual observation space is vital for real-world applications of\nreinforcement learning (RL). A major challenge in this setting is the presence\nof input information that is hard to model and irrelevant to controlling the\nagent. This problem has been approached by the theoretical RL community through\nthe lens of exogenous information, i.e, any control-irrelevant information\ncontained in observations. For example, a robot navigating in busy streets\nneeds to ignore irrelevant information, such as other people walking in the\nbackground, textures of objects, or birds in the sky. In this paper, we focus\non the setting with visually detailed exogenous information, and introduce new\noffline RL benchmarks offering the ability to study this problem. We find that\ncontemporary representation learning techniques can fail on datasets where the\nnoise is a complex and time dependent process, which is prevalent in practical\napplications. To address these, we propose to use multi-step inverse models,\nwhich have seen a great deal of interest in the RL theory community, to learn\nAgent-Controller Representations for Offline-RL (ACRO). 
Despite being simple\nand requiring no reward, we show theoretically and empirically that the\nrepresentation created by this objective greatly outperforms baselines.\n","authors":["Riashat Islam","Manan Tomar","Alex Lamb","Yonathan Efroni","Hongyu Zang","Aniket Didolkar","Dipendra Misra","Xin Li","Harm van Seijen","Remi Tachet des Combes","John Langford"],"pdf_url":"https://arxiv.org/pdf/2211.00164v2.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2308.06868v1","updated":"2023-08-14T00:15:01Z","published":"2023-08-14T00:15:01Z","title":"Camera Based mmWave Beam Prediction: Towards Multi-Candidate Real-World\n Scenarios","summary":" Leveraging sensory information to aid the millimeter-wave (mmWave) and\nsub-terahertz (sub-THz) beam selection process is attracting increasing\ninterest. This sensory data, captured for example by cameras at the\nbasestations, has the potential of significantly reducing the beam sweeping\noverhead and enabling highly-mobile applications. The solutions developed so\nfar, however, have mainly considered single-candidate scenarios, i.e.,\nscenarios with a single candidate user in the visual scene, and were evaluated\nusing synthetic datasets. To address these limitations, this paper extensively\ninvestigates the sensing-aided beam prediction problem in a real-world\nmulti-object vehicle-to-infrastructure (V2I) scenario and presents a\ncomprehensive machine learning-based framework. In particular, this paper\nproposes to utilize visual and positional data to predict the optimal beam\nindices as an alternative to the conventional beam sweeping approaches. For\nthis, a novel user (transmitter) identification solution has been developed, a\nkey step in realizing sensing-aided multi-candidate and multi-user beam\nprediction solutions. The proposed solutions are evaluated on the large-scale\nreal-world DeepSense $6$G dataset. Experimental results in realistic V2I\ncommunication scenarios indicate that the proposed solutions achieve close to\n$100\\%$ top-5 beam prediction accuracy for the scenarios with single-user and\nclose to $95\\%$ top-5 beam prediction accuracy for multi-candidate scenarios.\nFurthermore, the proposed approach can identify the probable transmitting\ncandidate with more than $93\\%$ accuracy across the different scenarios. This\nhighlights a promising approach for nearly eliminating the beam training\noverhead in mmWave/THz communication systems.\n","authors":["Gouranga Charan","Muhammad Alrabeiah","Tawfik Osman","Ahmed Alkhateeb"],"pdf_url":"https://arxiv.org/pdf/2308.06868v1.pdf","comment":"Dataset and code files are available on the DeepSense 6G website\n https://deepsense6g.net/"},{"id":"http://arxiv.org/abs/2308.07502v1","updated":"2023-08-14T23:52:19Z","published":"2023-08-14T23:52:19Z","title":"SpecTracle: Wearable Facial Motion Tracking from Unobtrusive Peripheral\n Cameras","summary":" Facial motion tracking in head-mounted displays (HMD) has the potential to\nenable immersive \"face-to-face\" interaction in a virtual environment. However,\ncurrent works on facial tracking are not suitable for unobtrusive augmented\nreality (AR) glasses or do not have the ability to track arbitrary facial\nmovements. In this work, we demonstrate a novel system called SpecTracle that\ntracks a user's facial motions using two wide-angle cameras mounted right next\nto the visor of a Hololens. 
Avoiding the usage of cameras extended in front of\nthe face, our system greatly improves the feasibility to integrate full-face\ntracking into a low-profile form factor. We also demonstrate that a neural\nnetwork-based model processing the wide-angle cameras can run in real-time at\n24 frames per second (fps) on a mobile GPU and track independent facial\nmovement for different parts of the face with a user-independent model. Using a\nshort personalized calibration, the system improves its tracking performance by\n42.3% compared to the user-independent model.\n","authors":["Yinan Xuan","Varun Viswanath","Sunny Chu","Owen Bartolf","Jessica Echterhoff","Edward Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07498v1","updated":"2023-08-14T23:45:01Z","published":"2023-08-14T23:45:01Z","title":"DREAMWALKER: Mental Planning for Continuous Vision-Language Navigation","summary":" VLN-CE is a recently released embodied task, where AI agents need to navigate\na freely traversable environment to reach a distant target location, given\nlanguage instructions. It poses great challenges due to the huge space of\npossible strategies. Driven by the belief that the ability to anticipate the\nconsequences of future actions is crucial for the emergence of intelligent and\ninterpretable planning behavior, we propose DREAMWALKER -- a world model based\nVLN-CE agent. The world model is built to summarize the visual, topological,\nand dynamic properties of the complicated continuous environment into a\ndiscrete, structured, and compact representation. DREAMWALKER can simulate and\nevaluate possible plans entirely in such internal abstract world, before\nexecuting costly actions. As opposed to existing model-free VLN-CE agents\nsimply making greedy decisions in the real world, which easily results in\nshortsighted behaviors, DREAMWALKER is able to make strategic planning through\nlarge amounts of ``mental experiments.'' Moreover, the imagined future\nscenarios reflect our agent's intention, making its decision-making process\nmore transparent. Extensive experiments and ablation studies on VLN-CE dataset\nconfirm the effectiveness of the proposed approach and outline fruitful\ndirections for future work.\n","authors":["Hanqing Wang","Wei Liang","Luc Van Gool","Wenguan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07498v1.pdf","comment":"Accepted at ICCV 2023; Project page:\n https://github.com/hanqingwangai/Dreamwalker"},{"id":"http://arxiv.org/abs/2308.07490v1","updated":"2023-08-14T22:47:48Z","published":"2023-08-14T22:47:48Z","title":"BSED: Baseline Shapley-Based Explainable Detector","summary":" Explainable artificial intelligence (XAI) has witnessed significant advances\nin the field of object recognition, with saliency maps being used to highlight\nimage features relevant to the predictions of learned models. Although these\nadvances have made AI-based technology more interpretable to humans, several\nissues have come to light. Some approaches present explanations irrelevant to\npredictions, and cannot guarantee the validity of XAI (axioms). In this study,\nwe propose the Baseline Shapley-based Explainable Detector (BSED), which\nextends the Shapley value to object detection, thereby enhancing the validity\nof interpretation. The Shapley value can attribute the prediction of a learned\nmodel to a baseline feature while satisfying the explainability axioms. 
The\nprocessing cost for the BSED is within the reasonable range, while the original\nShapley value is prohibitively computationally expensive. Furthermore, BSED is\na generalizable method that can be applied to various detectors in a\nmodel-agnostic manner, and interpret various detection targets without\nfine-grained parameter tuning. These strengths can enable the practical\napplicability of XAI. We present quantitative and qualitative comparisons with\nexisting methods to demonstrate the superior performance of our method in terms\nof explanation validity. Moreover, we present some applications, such as\ncorrecting detection based on explanations from our method.\n","authors":["Michihiro Kuroki","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2308.07490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.12391v3","updated":"2023-08-14T22:32:27Z","published":"2022-07-25T17:56:54Z","title":"SegPGD: An Effective and Efficient Adversarial Attack for Evaluating and\n Boosting Segmentation Robustness","summary":" Deep neural network-based image classifications are vulnerable to adversarial\nperturbations. The image classifications can be easily fooled by adding\nartificial small and imperceptible perturbations to input images. As one of the\nmost effective defense strategies, adversarial training was proposed to address\nthe vulnerability of classification models, where the adversarial examples are\ncreated and injected into training data during training. The attack and defense\nof classification models have been intensively studied in past years. Semantic\nsegmentation, as an extension of classifications, has also received great\nattention recently. Recent work shows a large number of attack iterations are\nrequired to create effective adversarial examples to fool segmentation models.\nThe observation makes both robustness evaluation and adversarial training on\nsegmentation models challenging. In this work, we propose an effective and\nefficient segmentation attack method, dubbed SegPGD. Besides, we provide a\nconvergence analysis to show the proposed SegPGD can create more effective\nadversarial examples than PGD under the same number of attack iterations.\nFurthermore, we propose to apply our SegPGD as the underlying attack method for\nsegmentation adversarial training. Since SegPGD can create more effective\nadversarial examples, the adversarial training with our SegPGD can boost the\nrobustness of segmentation models. Our proposals are also verified with\nexperiments on popular Segmentation model architectures and standard\nsegmentation datasets.\n","authors":["Jindong Gu","Hengshuang Zhao","Volker Tresp","Philip Torr"],"pdf_url":"https://arxiv.org/pdf/2207.12391v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07481v1","updated":"2023-08-14T22:21:24Z","published":"2023-08-14T22:21:24Z","title":"Space Object Identification and Classification from Hyperspectral\n Material Analysis","summary":" This paper presents a data processing pipeline designed to extract\ninformation from the hyperspectral signature of unknown space objects. The\nmethodology proposed in this paper determines the material composition of space\nobjects from single pixel images. Two techniques are used for material\nidentification and classification: one based on machine learning and the other\nbased on a least square match with a library of known spectra. 
From this\ninformation, a supervised machine learning algorithm is used to classify the\nobject into one of several categories based on the detection of materials on\nthe object. The behaviour of the material classification methods is\ninvestigated under non-ideal circumstances, to determine the effect of\nweathered materials, and the behaviour when the training library is missing a\nmaterial that is present in the object being observed. Finally the paper will\npresent some preliminary results on the identification and classification of\nspace objects.\n","authors":["Massimiliano Vasile","Lewis Walker","Andrew Campbell","Simao Marto","Paul Murray","Stephen Marshall","Vasili Savitski"],"pdf_url":"https://arxiv.org/pdf/2308.07481v1.pdf","comment":"30 pages, 24 figures"},{"id":"http://arxiv.org/abs/2308.07477v1","updated":"2023-08-14T22:08:28Z","published":"2023-08-14T22:08:28Z","title":"Probabilistic MIMO U-Net: Efficient and Accurate Uncertainty Estimation\n for Pixel-wise Regression","summary":" Uncertainty estimation in machine learning is paramount for enhancing the\nreliability and interpretability of predictive models, especially in\nhigh-stakes real-world scenarios. Despite the availability of numerous methods,\nthey often pose a trade-off between the quality of uncertainty estimation and\ncomputational efficiency. Addressing this challenge, we present an adaptation\nof the Multiple-Input Multiple-Output (MIMO) framework -- an approach\nexploiting the overparameterization of deep neural networks -- for pixel-wise\nregression tasks. Our MIMO variant expands the applicability of the approach\nfrom simple image classification to broader computer vision domains. For that\npurpose, we adapted the U-Net architecture to train multiple subnetworks within\na single model, harnessing the overparameterization in deep neural networks.\nAdditionally, we introduce a novel procedure for synchronizing subnetwork\nperformance within the MIMO framework. Our comprehensive evaluations of the\nresulting MIMO U-Net on two orthogonal datasets demonstrate comparable accuracy\nto existing models, superior calibration on in-distribution data, robust\nout-of-distribution detection capabilities, and considerable improvements in\nparameter size and inference time. Code available at\ngithub.com/antonbaumann/MIMO-Unet\n","authors":["Anton Baumann","Thomas Roßberg","Michael Schmitt"],"pdf_url":"https://arxiv.org/pdf/2308.07477v1.pdf","comment":"8 pages (references do not count), Accepted at UnCV (Workshop on\n Uncertainty Quantification for Computer Vision at ICCV)"},{"id":"http://arxiv.org/abs/2304.11463v2","updated":"2023-08-14T21:43:42Z","published":"2023-04-22T18:35:50Z","title":"OmniLabel: A Challenging Benchmark for Language-Based Object Detection","summary":" Language-based object detection is a promising direction towards building a\nnatural interface to describe objects in images that goes far beyond plain\ncategory names. While recent methods show great progress in that direction,\nproper evaluation is lacking. With OmniLabel, we propose a novel task\ndefinition, dataset, and evaluation metric. The task subsumes standard- and\nopen-vocabulary detection as well as referring expressions. With more than 28K\nunique object descriptions on over 25K images, OmniLabel provides a challenging\nbenchmark with diverse and complex object descriptions in a naturally\nopen-vocabulary setting. 
Moreover, a key differentiation to existing benchmarks\nis that our object descriptions can refer to one, multiple or even no object,\nhence, providing negative examples in free-form text. The proposed evaluation\nhandles the large label space and judges performance via a modified average\nprecision metric, which we validate by evaluating strong language-based\nbaselines. OmniLabel indeed provides a challenging test bed for future research\non language-based detection.\n","authors":["Samuel Schulter","Vijay Kumar B G","Yumin Suh","Konstantinos M. Dafnis","Zhixing Zhang","Shiyu Zhao","Dimitris Metaxas"],"pdf_url":"https://arxiv.org/pdf/2304.11463v2.pdf","comment":"ICCV 2023 Oral - Visit our project website at\n https://www.omnilabel.org"},{"id":"http://arxiv.org/abs/2308.07468v1","updated":"2023-08-14T21:39:33Z","published":"2023-08-14T21:39:33Z","title":"Reducing Training Demands for 3D Gait Recognition with Deep Koopman\n Operator Constraints","summary":" Deep learning research has made many biometric recognition solution viable,\nbut it requires vast training data to achieve real-world generalization. Unlike\nother biometric traits, such as face and ear, gait samples cannot be easily\ncrawled from the web to form massive unconstrained datasets. As the human body\nhas been extensively studied for different digital applications, one can rely\non prior shape knowledge to overcome data scarcity. This work follows the\nrecent trend of fitting a 3D deformable body model into gait videos using deep\nneural networks to obtain disentangled shape and pose representations for each\nframe. To enforce temporal consistency in the network, we introduce a new\nLinear Dynamical Systems (LDS) module and loss based on Koopman operator\ntheory, which provides an unsupervised motion regularization for the periodic\nnature of gait, as well as a predictive capacity for extending gait sequences.\nWe compare LDS to the traditional adversarial training approach and use the USF\nHumanID and CASIA-B datasets to show that LDS can obtain better accuracy with\nless training data. Finally, we also show that our 3D modeling approach is much\nbetter than other 3D gait approaches in overcoming viewpoint variation under\nnormal, bag-carrying and clothing change conditions.\n","authors":["Cole Hill","Mauricio Pamplona Segundo","Sudeep Sarkar"],"pdf_url":"https://arxiv.org/pdf/2308.07468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.01303v3","updated":"2023-08-14T21:32:45Z","published":"2021-09-03T04:25:57Z","title":"Self-supervised Pseudo Multi-class Pre-training for Unsupervised Anomaly\n Detection and Segmentation in Medical Images","summary":" Unsupervised anomaly detection (UAD) methods are trained with normal (or\nhealthy) images only, but during testing, they are able to classify normal and\nabnormal (or disease) images. UAD is an important medical image analysis (MIA)\nmethod to be applied in disease screening problems because the training sets\navailable for those problems usually contain only normal images. 
However, the\nexclusive reliance on normal images may result in the learning of ineffective\nlow-dimensional image representations that are not sensitive enough to detect\nand segment unseen abnormal lesions of varying size, appearance, and shape.\nPre-training UAD methods with self-supervised learning, based on computer\nvision techniques, can mitigate this challenge, but they are sub-optimal\nbecause they do not explore domain knowledge for designing the pretext tasks,\nand their contrastive learning losses do not try to cluster the normal training\nimages, which may result in a sparse distribution of normal images that is\nineffective for anomaly detection. In this paper, we propose a new\nself-supervised pre-training method for MIA UAD applications, named Pseudo\nMulti-class Strong Augmentation via Contrastive Learning (PMSACL). PMSACL\nconsists of a novel optimisation method that contrasts a normal image class\nfrom multiple pseudo classes of synthesised abnormal images, with each class\nenforced to form a dense cluster in the feature space. In the experiments, we\nshow that our PMSACL pre-training improves the accuracy of SOTA UAD methods on\nmany MIA benchmarks using colonoscopy, fundus screening and Covid-19 Chest\nX-ray datasets. The code is made publicly available via\nhttps://github.com/tianyu0207/PMSACL.\n","authors":["Yu Tian","Fengbei Liu","Guansong Pang","Yuanhong Chen","Yuyuan Liu","Johan W. Verjans","Rajvinder Singh","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2109.01303v3.pdf","comment":"Accepted to Medical Image Analysis"},{"id":"http://arxiv.org/abs/2308.07464v1","updated":"2023-08-14T21:21:03Z","published":"2023-08-14T21:21:03Z","title":"There Is a Digital Art History","summary":" In this paper, we revisit Johanna Drucker's question, \"Is there a digital art\nhistory?\" -- posed exactly a decade ago -- in the light of the emergence of\nlarge-scale, transformer-based vision models. While more traditional types of\nneural networks have long been part of digital art history, and digital\nhumanities projects have recently begun to use transformer models, their\nepistemic implications and methodological affordances have not yet been\nsystematically analyzed. We focus our analysis on two main aspects that,\ntogether, seem to suggest a coming paradigm shift towards a \"digital\" art\nhistory in Drucker's sense. On the one hand, the visual-cultural repertoire\nnewly encoded in large-scale vision models has an outsized effect on digital\nart history. The inclusion of significant numbers of non-photographic images\nallows for the extraction and automation of different forms of visual logics.\nLarge-scale vision models have \"seen\" large parts of the Western visual canon\nmediated by Net visual culture, and they continuously solidify and concretize\nthis canon through their already widespread application in all aspects of\ndigital life. On the other hand, based on two technical case studies of\nutilizing a contemporary large-scale visual model to investigate basic\nquestions from the fields of art history and urbanism, we suggest that such\nsystems require a new critical methodology that takes into account the\nepistemic entanglement of a model and its applications. 
This new methodology\nreads its corpora through a neural model's training data, and vice versa: the\nvisual ideologies of research datasets and training datasets become entangled.\n","authors":["Leonardo Impett","Fabian Offert"],"pdf_url":"https://arxiv.org/pdf/2308.07464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07445v1","updated":"2023-08-14T20:34:54Z","published":"2023-08-14T20:34:54Z","title":"Open-set Face Recognition using Ensembles trained on Clustered Data","summary":" Open-set face recognition describes a scenario where unknown subjects, unseen\nduring the training stage, appear on test time. Not only it requires methods\nthat accurately identify individuals of interest, but also demands approaches\nthat effectively deal with unfamiliar faces. This work details a scalable\nopen-set face identification approach to galleries composed of hundreds and\nthousands of subjects. It is composed of clustering and an ensemble of binary\nlearning algorithms that estimates when query face samples belong to the face\ngallery and then retrieves their correct identity. The approach selects the\nmost suitable gallery subjects and uses the ensemble to improve prediction\nperformance. We carry out experiments on well-known LFW and YTF benchmarks.\nResults show that competitive performance can be achieved even when targeting\nscalability.\n","authors":["Rafael Henrique Vareto","William Robson Schwartz"],"pdf_url":"https://arxiv.org/pdf/2308.07445v1.pdf","comment":"[Original paper title: Unconstrained Face Identification using\n Ensembles trained on Clustered Data] [2020 IEEE International Joint\n Conference on Biometrics (IJCB)]\n [https://ieeexplore.ieee.org/document/9304882]"},{"id":"http://arxiv.org/abs/2308.07444v1","updated":"2023-08-14T20:34:52Z","published":"2023-08-14T20:34:52Z","title":"The Performance of Transferability Metrics does not Translate to Medical\n Tasks","summary":" Transfer learning boosts the performance of medical image analysis by\nenabling deep learning (DL) on small datasets through the knowledge acquired\nfrom large ones. As the number of DL architectures explodes, exhaustively\nattempting all candidates becomes unfeasible, motivating cheaper alternatives\nfor choosing them. Transferability scoring methods emerge as an enticing\nsolution, allowing to efficiently calculate a score that correlates with the\narchitecture accuracy on any target dataset. However, since transferability\nscores have not been evaluated on medical datasets, their use in this context\nremains uncertain, preventing them from benefiting practitioners. We fill that\ngap in this work, thoroughly evaluating seven transferability scores in three\nmedical applications, including out-of-distribution scenarios. Despite\npromising results in general-purpose datasets, our results show that no\ntransferability score can reliably and consistently estimate target performance\nin medical contexts, inviting further work in that direction.\n","authors":["Levy Chaves","Alceu Bissoto","Eduardo Valle","Sandra Avila"],"pdf_url":"https://arxiv.org/pdf/2308.07444v1.pdf","comment":"10 pages, 3 figures. Accepted at the DART workshop @ MICCAI 2023"},{"id":"http://arxiv.org/abs/2210.09887v5","updated":"2023-08-14T20:24:24Z","published":"2022-10-18T14:23:05Z","title":"MotionDeltaCNN: Sparse CNN Inference of Frame Differences in Moving\n Camera Videos","summary":" Convolutional neural network inference on video input is computationally\nexpensive and requires high memory bandwidth. 
Recently, DeltaCNN managed to\nreduce the cost by only processing pixels with significant updates over the\nprevious frame. However, DeltaCNN relies on static camera input. Moving cameras\nadd new challenges in how to fuse newly unveiled image regions with already\nprocessed regions efficiently to minimize the update rate - without increasing\nmemory overhead and without knowing the camera extrinsics of future frames. In\nthis work, we propose MotionDeltaCNN, a sparse CNN inference framework that\nsupports moving cameras. We introduce spherical buffers and padded convolutions\nto enable seamless fusion of newly unveiled regions and previously processed\nregions -- without increasing memory footprint. Our evaluation shows that we\noutperform DeltaCNN by up to 90% for moving camera videos.\n","authors":["Mathias Parger","Chengcheng Tang","Thomas Neff","Christopher D. Twigg","Cem Keskin","Robert Wang","Markus Steinberger"],"pdf_url":"https://arxiv.org/pdf/2210.09887v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07439v1","updated":"2023-08-14T20:20:26Z","published":"2023-08-14T20:20:26Z","title":"Interaction-Aware Personalized Vehicle Trajectory Prediction Using\n Temporal Graph Neural Networks","summary":" Accurate prediction of vehicle trajectories is vital for advanced driver\nassistance systems and autonomous vehicles. Existing methods mainly rely on\ngeneric trajectory predictions derived from large datasets, overlooking the\npersonalized driving patterns of individual drivers. To address this gap, we\npropose an approach for interaction-aware personalized vehicle trajectory\nprediction that incorporates temporal graph neural networks. Our method\nutilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to\nmodel the spatio-temporal interactions between target vehicles and their\nsurrounding traffic. To personalize the predictions, we establish a pipeline\nthat leverages transfer learning: the model is initially pre-trained on a\nlarge-scale trajectory dataset and then fine-tuned for each driver using their\nspecific driving data. We employ human-in-the-loop simulation to collect\npersonalized naturalistic driving trajectories and corresponding surrounding\nvehicle trajectories. Experimental results demonstrate the superior performance\nof our personalized GCN-LSTM model, particularly for longer prediction\nhorizons, compared to its generic counterpart. Moreover, the personalized model\noutperforms individual models created without pre-training, emphasizing the\nsignificance of pre-training on a large dataset to avoid overfitting. By\nincorporating personalization, our approach enhances trajectory prediction\naccuracy.\n","authors":["Amr Abdelraouf","Rohit Gupta","Kyungtae Han"],"pdf_url":"https://arxiv.org/pdf/2308.07439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07428v1","updated":"2023-08-14T19:49:29Z","published":"2023-08-14T19:49:29Z","title":"UniBrain: Unify Image Reconstruction and Captioning All in One Diffusion\n Model from Human Brain Activity","summary":" Image reconstruction and captioning from brain activity evoked by visual\nstimuli allow researchers to further understand the connection between the\nhuman brain and the visual perception system. While deep generative models have\nrecently been employed in this field, reconstructing realistic captions and\nimages with both low-level details and high semantic fidelity is still a\nchallenging problem. 
In this work, we propose UniBrain: Unify Image\nReconstruction and Captioning All in One Diffusion Model from Human Brain\nActivity. For the first time, we unify image reconstruction and captioning from\nvisual-evoked functional magnetic resonance imaging (fMRI) through a latent\ndiffusion model termed Versatile Diffusion. Specifically, we transform fMRI\nvoxels into text and image latent for low-level information and guide the\nbackward diffusion process through fMRI-based image and text conditions derived\nfrom CLIP to generate realistic captions and images. UniBrain outperforms\ncurrent methods both qualitatively and quantitatively in terms of image\nreconstruction and reports image captioning results for the first time on the\nNatural Scenes Dataset (NSD) dataset. Moreover, the ablation experiments and\nfunctional region-of-interest (ROI) analysis further exhibit the superiority of\nUniBrain and provide comprehensive insight for visual-evoked brain decoding.\n","authors":["Weijian Mai","Zhijun Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07421v1","updated":"2023-08-14T19:21:28Z","published":"2023-08-14T19:21:28Z","title":"U-Turn Diffusion","summary":" We present a comprehensive examination of score-based diffusion models of AI\nfor generating synthetic images. These models hinge upon a dynamic auxiliary\ntime mechanism driven by stochastic differential equations, wherein the score\nfunction is acquired from input images. Our investigation unveils a criterion\nfor evaluating efficiency of the score-based diffusion models: the power of the\ngenerative process depends on the ability to de-construct fast correlations\nduring the reverse/de-noising phase. To improve the quality of the produced\nsynthetic images, we introduce an approach coined \"U-Turn Diffusion\". The\nU-Turn Diffusion technique starts with the standard forward diffusion process,\nalbeit with a condensed duration compared to conventional settings.\nSubsequently, we execute the standard reverse dynamics, initialized with the\nconcluding configuration from the forward process. This U-Turn Diffusion\nprocedure, combining forward, U-turn, and reverse processes, creates a\nsynthetic image approximating an independent and identically distributed\n(i.i.d.) sample from the probability distribution implicitly described via\ninput samples. To analyze relevant time scales we employ various analytical\ntools, including auto-correlation analysis, weighted norm of the score-function\nanalysis, and Kolmogorov-Smirnov Gaussianity test. The tools guide us to\nestablishing that the Kernel Intersection Distance, a metric comparing the\nquality of synthetic samples with real data samples, is minimized at the\noptimal U-turn time.\n","authors":["Hamidreza Behjoo","Michael Chertkov"],"pdf_url":"https://arxiv.org/pdf/2308.07421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09663v2","updated":"2023-08-14T19:20:28Z","published":"2023-03-16T21:47:40Z","title":"Efficient Computation Sharing for Multi-Task Visual Scene Understanding","summary":" Solving multiple visual tasks using individual models can be\nresource-intensive, while multi-task learning can conserve resources by sharing\nknowledge across different tasks. Despite the benefits of multi-task learning,\nsuch techniques can struggle with balancing the loss for each task, leading to\npotential performance degradation. 
We present a novel computation- and\nparameter-sharing framework that balances efficiency and accuracy to perform\nmultiple visual tasks utilizing individually-trained single-task transformers.\nOur method is motivated by transfer learning schemes to reduce computational\nand parameter storage costs while maintaining the desired performance. Our\napproach involves splitting the tasks into a base task and the other sub-tasks,\nand sharing a significant portion of activations and parameters/weights between\nthe base and sub-tasks to decrease inter-task redundancies and enhance\nknowledge sharing. The evaluation conducted on NYUD-v2 and PASCAL-context\ndatasets shows that our method is superior to the state-of-the-art\ntransformer-based multi-task learning techniques with higher accuracy and\nreduced computational resources. Moreover, our method is extended to video\nstream inputs, further reducing computational costs by efficiently sharing\ninformation across the temporal domain as well as the task domain. Our codes\nand models will be publicly available.\n","authors":["Sara Shoouri","Mingyu Yang","Zichen Fan","Hun-Seok Kim"],"pdf_url":"https://arxiv.org/pdf/2303.09663v2.pdf","comment":"Camera-Ready version. Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07415v1","updated":"2023-08-14T19:07:26Z","published":"2023-08-14T19:07:26Z","title":"Semantify: Simplifying the Control of 3D Morphable Models using CLIP","summary":" We present Semantify: a self-supervised method that utilizes the semantic\npower of CLIP language-vision foundation model to simplify the control of 3D\nmorphable models. Given a parametric model, training data is created by\nrandomly sampling the model's parameters, creating various shapes and rendering\nthem. The similarity between the output images and a set of word descriptors is\ncalculated in CLIP's latent space. Our key idea is first to choose a small set\nof semantically meaningful and disentangled descriptors that characterize the\n3DMM, and then learn a non-linear mapping from scores across this set to the\nparametric coefficients of the given 3DMM. The non-linear mapping is defined by\ntraining a neural network without a human-in-the-loop. We present results on\nnumerous 3DMMs: body shape models, face shape and expression models, as well as\nanimal shapes. We demonstrate how our method defines a simple slider interface\nfor intuitive modeling, and show how the mapping can be used to instantly fit a\n3D parametric body shape to in-the-wild images.\n","authors":["Omer Gralnik","Guy Gafni","Ariel Shamir"],"pdf_url":"https://arxiv.org/pdf/2308.07415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.18232v2","updated":"2023-08-14T18:30:40Z","published":"2023-03-31T17:47:23Z","title":"DIME-FM: DIstilling Multimodal and Efficient Foundation Models","summary":" Large Vision-Language Foundation Models (VLFM), such as CLIP, ALIGN and\nFlorence, are trained on large-scale datasets of image-caption pairs and\nachieve superior transferability and robustness on downstream tasks, but they\nare difficult to use in many practical applications due to their large size,\nhigh latency and fixed architectures. Unfortunately, recent work shows training\na small custom VLFM for resource-limited applications is currently very\ndifficult using public and smaller-scale data. 
In this paper, we introduce a\nnew distillation mechanism (DIME-FM) that allows us to transfer the knowledge\ncontained in large VLFMs to smaller, customized foundation models using a\nrelatively small amount of inexpensive, unpaired images and sentences. We\ntransfer the knowledge from the pre-trained CLIP-ViTL/14 model to a ViT-B/32\nmodel, with only 40M public images and 28.4M unpaired public sentences. The\nresulting model \"Distill-ViT-B/32\" rivals the CLIP-ViT-B/32 model pre-trained\non its private WiT dataset (400M image-text pairs): Distill-ViT-B/32 achieves\nsimilar results in terms of zero-shot and linear-probing performance on both\nImageNet and the ELEVATER (20 image classification tasks) benchmarks. It also\ndisplays comparable robustness when evaluated on five datasets with natural\ndistribution shifts from ImageNet.\n","authors":["Ximeng Sun","Pengchuan Zhang","Peizhao Zhang","Hardik Shah","Kate Saenko","Xide Xia"],"pdf_url":"https://arxiv.org/pdf/2303.18232v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.05872v2","updated":"2023-08-14T18:27:12Z","published":"2023-08-10T22:57:31Z","title":"Vision Backbone Enhancement via Multi-Stage Cross-Scale Attention","summary":" Convolutional neural networks (CNNs) and vision transformers (ViTs) have\nachieved remarkable success in various vision tasks. However, many\narchitectures do not consider interactions between feature maps from different\nstages and scales, which may limit their performance. In this work, we propose\na simple add-on attention module to overcome these limitations via multi-stage\nand cross-scale interactions. Specifically, the proposed Multi-Stage\nCross-Scale Attention (MSCSA) module takes feature maps from different stages\nto enable multi-stage interactions and achieves cross-scale interactions by\ncomputing self-attention at different scales based on the multi-stage feature\nmaps. Our experiments on several downstream tasks show that MSCSA provides a\nsignificant performance boost with modest additional FLOPs and runtime.\n","authors":["Liang Shang","Yanli Liu","Zhengyang Lou","Shuxue Quan","Nagesh Adluru","Bochen Guan","William A. Sethares"],"pdf_url":"https://arxiv.org/pdf/2308.05872v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07392v1","updated":"2023-08-14T18:23:18Z","published":"2023-08-14T18:23:18Z","title":"A Unified Query-based Paradigm for Camouflaged Instance Segmentation","summary":" Due to the high similarity between camouflaged instances and the background,\nthe recently proposed camouflaged instance segmentation (CIS) faces challenges\nin accurate localization and instance segmentation. To this end, inspired by\nquery-based transformers, we propose a unified query-based multi-task learning\nframework for camouflaged instance segmentation, termed UQFormer, which builds\na set of mask queries and a set of boundary queries to learn a shared composed\nquery representation and efficiently integrates global camouflaged object\nregion and boundary cues, for simultaneous instance segmentation and instance\nboundary detection in camouflaged scenarios. Specifically, we design a composed\nquery learning paradigm that learns a shared representation to capture object\nregion and boundary features by the cross-attention interaction of mask queries\nand boundary queries in the designed multi-scale unified learning transformer\ndecoder. 
Then, we present a transformer-based multi-task learning framework for\nsimultaneous camouflaged instance segmentation and camouflaged instance\nboundary detection based on the learned composed query representation, which\nalso forces the model to learn a strong instance-level query representation.\nNotably, our model views the instance segmentation as a query-based direct set\nprediction problem, without other post-processing such as non-maximal\nsuppression. Compared with 14 state-of-the-art approaches, our UQFormer\nsignificantly improves the performance of camouflaged instance segmentation.\nOur code will be available at https://github.com/dongbo811/UQFormer.\n","authors":["Do Dong","Jialun Pei","Rongrong Gao","Tian-Zhu Xiang","Shuo Wang","Huan Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.07392v1.pdf","comment":"This paper has been accepted by ACM MM2023"},{"id":"http://arxiv.org/abs/2308.07391v1","updated":"2023-08-14T18:18:00Z","published":"2023-08-14T18:18:00Z","title":"PARIS: Part-level Reconstruction and Motion Analysis for Articulated\n Objects","summary":" We address the task of simultaneous part-level reconstruction and motion\nparameter estimation for articulated objects. Given two sets of multi-view\nimages of an object in two static articulation states, we decouple the movable\npart from the static part and reconstruct shape and appearance while predicting\nthe motion parameters. To tackle this problem, we present PARIS: a\nself-supervised, end-to-end architecture that learns part-level implicit shape\nand appearance models and optimizes motion parameters jointly without any 3D\nsupervision, motion, or semantic annotation. Our experiments show that our\nmethod generalizes better across object categories, and outperforms baselines\nand prior work that are given 3D point clouds as input. Our approach improves\nreconstruction relative to state-of-the-art baselines with a Chamfer-L1\ndistance reduction of 3.94 (45.2%) for objects and 26.79 (84.5%) for parts, and\nachieves 5% error rate for motion estimation across 10 object categories.\n Video summary at: https://youtu.be/tDSrROPCgUc\n","authors":["Jiayi Liu","Ali Mahdavi-Amiri","Manolis Savva"],"pdf_url":"https://arxiv.org/pdf/2308.07391v1.pdf","comment":"Presented at ICCV 2023. Project website:\n https://3dlg-hcvc.github.io/paris/"},{"id":"http://arxiv.org/abs/2308.00924v2","updated":"2023-08-14T18:10:29Z","published":"2023-08-02T03:47:19Z","title":"Continual Domain Adaptation on Aerial Images under Gradually Degrading\n Weather","summary":" Domain adaptation (DA) strives to mitigate the domain gap between the source\ndomain where a model is trained, and the target domain where the model is\ndeployed. When a deep learning model is deployed on an aerial platform, it may\nface gradually degrading weather conditions during operation, leading to\nwidening domain gaps between the training data and the encountered evaluation\ndata. We synthesize two such gradually worsening weather conditions on real\nimages from two existing aerial imagery datasets, generating a total of four\nbenchmark datasets. Under the continual, or test-time adaptation setting, we\nevaluate three DA models on our datasets: a baseline standard DA model and two\ncontinual DA models. In such setting, the models can access only one small\nportion, or one batch of the target data at a time, and adaptation takes place\ncontinually, and over only one epoch of the data. 
The combination of the\nconstraints of continual adaptation, and gradually deteriorating weather\nconditions provide the practical DA scenario for aerial deployment. Among the\nevaluated models, we consider both convolutional and transformer architectures\nfor comparison. We discover stability issues during adaptation for existing\nbuffer-fed continual DA methods, and offer gradient normalization as a simple\nsolution to curb training instability.\n","authors":["Chowdhury Sadman Jahan","Andreas Savakis"],"pdf_url":"https://arxiv.org/pdf/2308.00924v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07387v1","updated":"2023-08-14T18:09:58Z","published":"2023-08-14T18:09:58Z","title":"DISBELIEVE: Distance Between Client Models is Very Essential for\n Effective Local Model Poisoning Attacks","summary":" Federated learning is a promising direction to tackle the privacy issues\nrelated to sharing patients' sensitive data. Often, federated systems in the\nmedical image analysis domain assume that the participating local clients are\n\\textit{honest}. Several studies report mechanisms through which a set of\nmalicious clients can be introduced that can poison the federated setup,\nhampering the performance of the global model. To overcome this, robust\naggregation methods have been proposed that defend against those attacks. We\nobserve that most of the state-of-the-art robust aggregation methods are\nheavily dependent on the distance between the parameters or gradients of\nmalicious clients and benign clients, which makes them prone to local model\npoisoning attacks when the parameters or gradients of malicious and benign\nclients are close. Leveraging this, we introduce DISBELIEVE, a local model\npoisoning attack that creates malicious parameters or gradients such that their\ndistance to benign clients' parameters or gradients is low respectively but at\nthe same time their adverse effect on the global model's performance is high.\nExperiments on three publicly available medical image datasets demonstrate the\nefficacy of the proposed DISBELIEVE attack as it significantly lowers the\nperformance of the state-of-the-art \\textit{robust aggregation} methods for\nmedical image analysis. Furthermore, compared to state-of-the-art local model\npoisoning attacks, DISBELIEVE attack is also effective on natural images where\nwe observe a severe drop in classification performance of the global model for\nmulti-class classification on benchmark dataset CIFAR-10.\n","authors":["Indu Joshi","Priyank Upadhya","Gaurav Kumar Nayak","Peter Schüffler","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2308.07387v1.pdf","comment":"Accepted by MICCAI 2023 - DeCaF"},{"id":"http://arxiv.org/abs/2308.07378v1","updated":"2023-08-14T18:01:45Z","published":"2023-08-14T18:01:45Z","title":"The Devil in the Details: Simple and Effective Optical Flow Synthetic\n Data Generation","summary":" Recent work on dense optical flow has shown significant progress, primarily\nin a supervised learning manner requiring a large amount of labeled data. Due\nto the expensiveness of obtaining large scale real-world data, computer\ngraphics are typically leveraged for constructing datasets. However, there is a\ncommon belief that synthetic-to-real domain gaps limit generalization to real\nscenes. In this paper, we show that the required characteristics in an optical\nflow dataset are rather simple and present a simpler synthetic data generation\nmethod that achieves a certain level of realism with compositions of elementary\noperations. 
With 2D motion-based datasets, we systematically analyze the\nsimplest yet critical factors for generating synthetic datasets. Furthermore,\nwe propose a novel method of utilizing occlusion masks in a supervised manner\nand observe that suppressing gradients on occluded regions serves as a powerful\ninitial state in the curriculum learning sense. The RAFT network initially\ntrained on our dataset outperforms the original RAFT on the two most\nchallenging online benchmarks, MPI Sintel and KITTI 2015.\n","authors":["Kwon Byung-Ki","Kim Sung-Bin","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2308.07378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.13056v3","updated":"2023-08-14T17:20:43Z","published":"2022-12-26T09:20:55Z","title":"MonoNeRF: Learning a Generalizable Dynamic Radiance Field from Monocular\n Videos","summary":" In this paper, we target the problem of learning a generalizable dynamic\nradiance field from monocular videos. Different from most existing NeRF methods\nthat are based on multiple views, monocular videos only contain one view at\neach timestamp, thereby suffering from ambiguity along the view direction in\nestimating point features and scene flows. Previous studies such as DynNeRF\ndisambiguate point features by positional encoding, which is not transferable\nand severely limits the generalization ability. As a result, these methods have\nto train one independent model for each scene and suffer from heavy\ncomputational costs when applied to increasing numbers of monocular videos in real-world\napplications. To address this, we propose MonoNeRF to simultaneously learn\npoint features and scene flows with point trajectory and feature correspondence\nconstraints across frames. More specifically, we learn an implicit velocity\nfield to estimate point trajectory from temporal features with Neural ODE,\nwhich is followed by a flow-based feature aggregation module to obtain spatial\nfeatures along the point trajectory. We jointly optimize temporal and spatial\nfeatures in an end-to-end manner. Experiments show that our MonoNeRF is able to\nlearn from multiple scenes and support new applications such as scene editing,\nunseen frame synthesis, and fast novel scene adaptation. Codes are available at\nhttps://github.com/tianfr/MonoNeRF.\n","authors":["Fengrui Tian","Shaoyi Du","Yueqi Duan"],"pdf_url":"https://arxiv.org/pdf/2212.13056v3.pdf","comment":"Accepted by ICCV 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.07284v1","updated":"2023-08-14T17:15:37Z","published":"2023-08-14T17:15:37Z","title":"Cross-Attribute Matrix Factorization Model with Shared User Embedding","summary":" Over the past few years, deep learning has firmly established its prowess\nacross various domains, including computer vision, speech recognition, and\nnatural language processing. Motivated by its outstanding success, researchers\nhave been directing their efforts towards applying deep learning techniques to\nrecommender systems. Neural collaborative filtering (NCF) and Neural Matrix\nFactorization (NeuMF) refresh the traditional inner product in matrix\nfactorization with a neural architecture capable of learning complex and\ndata-driven functions. While these models effectively capture user-item\ninteractions, they overlook the specific attributes of both users and items.\nThis can lead to robustness issues, especially for items and users that belong\nto the \"long tail\". Such challenges are commonly recognized in recommender\nsystems as a part of the cold-start problem. 
A direct and intuitive approach to\naddress this issue is by leveraging the features and attributes of the items\nand users themselves. In this paper, we introduce a refined NeuMF model that\nconsiders not only the interaction between users and items, but also the interactions\nacross associated attributes. Moreover, our proposed architecture features a shared\nuser embedding, seamlessly integrating with user embeddings to improve the\nrobustness and effectively address the cold-start problem. Rigorous experiments\non both the Movielens and Pinterest datasets demonstrate the superiority of our\nCross-Attribute Matrix Factorization model, particularly in scenarios\ncharacterized by higher dataset sparsity.\n","authors":["Wen Liang","Zeng Fan","Youzhi Liang","Jianguo Jia"],"pdf_url":"https://arxiv.org/pdf/2308.07284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07269v1","updated":"2023-08-14T16:52:42Z","published":"2023-08-14T16:52:42Z","title":"EasyEdit: An Easy-to-use Knowledge Editing Framework for Large Language\n Models","summary":" Large Language Models (LLMs) usually suffer from knowledge cutoff or fallacy\nissues, which means they are unaware of unseen events or generate text with\nincorrect facts owing to the outdated/noisy data. To this end, many knowledge\nediting approaches for LLMs have emerged -- aiming to subtly inject/edit\nupdated knowledge or adjust undesired behavior while minimizing the impact on\nunrelated inputs. Nevertheless, due to significant differences among various\nknowledge editing methods and the variations in task setups, there is no\nstandard implementation framework available for the community, which hinders\npractitioners from applying knowledge editing to applications. To address these\nissues, we propose EasyEdit, an easy-to-use knowledge editing framework for\nLLMs. It supports various cutting-edge knowledge editing approaches and can be\nreadily applied to many well-known LLMs such as T5, GPT-J, LlaMA, etc.\nEmpirically, we report the knowledge editing results on LlaMA-2 with EasyEdit,\ndemonstrating that knowledge editing surpasses traditional fine-tuning in terms\nof reliability and generalization. We have released the source code on GitHub\nat https://github.com/zjunlp/EasyEdit, along with Google Colab tutorials and\ncomprehensive documentation for beginners to get started. Besides, we present\nan online system for real-time knowledge editing, and a demo video at\nhttp://knowlm.zjukg.cn/easyedit.mp4.\n","authors":["Peng Wang","Ningyu Zhang","Xin Xie","Yunzhi Yao","Bozhong Tian","Mengru Wang","Zekun Xi","Siyuan Cheng","Kangwei Liu","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07269v1.pdf","comment":"The project website is https://github.com/zjunlp/EasyEdit"},{"id":"http://arxiv.org/abs/2208.08063v5","updated":"2023-08-14T16:49:07Z","published":"2022-08-17T04:30:58Z","title":"NECE: Narrative Event Chain Extraction Toolkit","summary":" To understand a narrative, it is essential to comprehend the temporal event\nflows, especially those associated with main characters; however, this can be\nchallenging with lengthy and unstructured narrative texts. To address this, we\nintroduce NECE, an open-access, document-level toolkit that automatically\nextracts and aligns narrative events in the temporal order of their occurrence.\nThrough extensive evaluations, we show the high quality of the NECE toolkit and\ndemonstrate its downstream application in analyzing narrative bias regarding\ngender. 
We also openly discuss the shortcomings of the current approach, and the\npotential of leveraging generative models in future work. Lastly, the NECE\ntoolkit includes both a Python library and a user-friendly web interface, which\noffer equal access to professionals and lay audiences alike, to visualize\nevent chains, obtain narrative flows, or study narrative bias.\n","authors":["Guangxuan Xu","Paulina Toro Isaza","Moshi Li","Akintoye Oloko","Bingsheng Yao","Cassia Sanctos","Aminat Adebiyi","Yufang Hou","Nanyun Peng","Dakuo Wang"],"pdf_url":"https://arxiv.org/pdf/2208.08063v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.07122v2","updated":"2023-08-14T16:04:21Z","published":"2021-10-14T02:23:33Z","title":"Deconfounded Causal Collaborative Filtering","summary":" Recommender systems may be confounded by various types of confounding factors\n(also called confounders) that may lead to inaccurate recommendations and\nsacrificed recommendation performance. Current approaches to solving the\nproblem usually design each specific model for each specific confounder.\nHowever, real-world systems may include a huge number of confounders and thus\ndesigning each specific model for each specific confounder could be\nunrealistic. More importantly, except for those ``explicit confounders'' that\nexperts can manually identify and process, such as an item's position in the\nranking list, there are also many ``latent confounders'' that are beyond the\nimagination of experts. For example, users' rating on a song may depend on\ntheir current mood or the current weather, and users' preference on ice creams\nmay depend on the air temperature. Such latent confounders may be unobservable\nin the recorded training data. To solve the problem, we propose Deconfounded\nCausal Collaborative Filtering (DCCF). We first frame user behaviors with\nunobserved confounders into a causal graph, and then we design a front-door\nadjustment model carefully fused with machine learning to deconfound the\ninfluence of unobserved confounders. Experiments on real-world datasets show\nthat our method is able to deconfound unobserved confounders to achieve better\nrecommendation performance.\n","authors":["Shuyuan Xu","Juntao Tan","Shelby Heinecke","Jia Li","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2110.07122v2.pdf","comment":"Accepted by the ACM Transactions on Recommender Systems (TORS)"},{"id":"http://arxiv.org/abs/2102.01868v5","updated":"2023-08-14T15:58:36Z","published":"2021-02-03T04:16:11Z","title":"Causal Collaborative Filtering","summary":" Many of the traditional recommendation algorithms are designed based on the\nfundamental idea of mining or learning correlative patterns from data to\nestimate the user-item correlative preference. However, pure correlative\nlearning may lead to Simpson's paradox in predictions, and thus results in\nsacrificed recommendation performance. Simpson's paradox is a well-known\nstatistical phenomenon, which causes confusion in statistical conclusions, and\nignoring the paradox may result in inaccurate decisions. Fortunately, causal\nand counterfactual modeling can help us to think outside of the observational\ndata for user modeling and personalization so as to tackle such issues. In this\npaper, we propose Causal Collaborative Filtering (CCF) -- a general framework\nfor modeling causality in collaborative filtering and recommendation. 
We\nprovide a unified causal view of CF and mathematically show that many of the\ntraditional CF algorithms are actually special cases of CCF under simplified\ncausal graphs. We then propose a conditional intervention approach for\n$do$-operations so that we can estimate the user-item causal preference based\non the observational data. Finally, we further propose a general counterfactual\nconstrained learning framework for estimating the user-item preferences.\nExperiments are conducted on two types of real-world datasets -- traditional\nand randomized trial data -- and results show that our framework can improve\nthe recommendation performance and reduce the Simpson's paradox problem of many\nCF algorithms.\n","authors":["Shuyuan Xu","Yingqiang Ge","Yunqi Li","Zuohui Fu","Xu Chen","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2102.01868v5.pdf","comment":"Accepted by the 2023 ACM SIGIR International Conference on Theory of\n Information Retrieval"},{"id":"http://arxiv.org/abs/2308.07222v1","updated":"2023-08-14T15:47:36Z","published":"2023-08-14T15:47:36Z","title":"MM-GEF: Multi-modal representation meet collaborative filtering","summary":" In modern e-commerce, item content features in various modalities offer\naccurate yet comprehensive information to recommender systems. The majority of\nprevious work either focuses on learning effective item representation during\nmodelling user-item interactions, or exploring item-item relationships by\nanalysing multi-modal features. Those methods, however, fail to incorporate the\ncollaborative item-user-item relationships into the multi-modal feature-based\nitem structure. In this work, we propose a graph-based item structure\nenhancement method MM-GEF: Multi-Modal recommendation with Graph Early-Fusion,\nwhich effectively combines the latent item structure underlying multi-modal\ncontents with the collaborative signals. Instead of processing the content\nfeature in different modalities separately, we show that the early-fusion of\nmulti-modal features provides significant improvement. MM-GEF learns refined\nitem representations by injecting structural information obtained from both\nmulti-modal and collaborative signals. Through extensive experiments on four\npublicly available datasets, we demonstrate systematical improvements of our\nmethod over state-of-the-art multi-modal recommendation methods.\n","authors":["Hao Wu","Alejandro Ariza-Casabona","Bartłomiej Twardowski","Tri Kurniawan Wijaya"],"pdf_url":"https://arxiv.org/pdf/2308.07222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07192v1","updated":"2023-08-14T14:56:40Z","published":"2023-08-14T14:56:40Z","title":"gSASRec: Reducing Overconfidence in Sequential Recommendation Trained\n with Negative Sampling","summary":" A large catalogue size is one of the central challenges in training\nrecommendation models: a large number of items makes them memory and\ncomputationally inefficient to compute scores for all items during training,\nforcing these models to deploy negative sampling. 
However, negative sampling\nincreases the proportion of positive interactions in the training data, and\ntherefore models trained with negative sampling tend to overestimate the\nprobabilities of positive interactions, a phenomenon we call overconfidence.\nWhile the absolute values of the predicted scores or probabilities are not\nimportant for the ranking of retrieved recommendations, overconfident models\nmay fail to estimate nuanced differences in the top-ranked items, resulting in\ndegraded performance. In this paper, we show that overconfidence explains why\nthe popular SASRec model underperforms when compared to BERT4Rec. This is\ncontrary to the BERT4Rec authors' explanation that the difference in performance\nis due to the bi-directional attention mechanism. To mitigate overconfidence,\nwe propose a novel Generalised Binary Cross-Entropy Loss function (gBCE) and\ntheoretically prove that it can mitigate overconfidence. We further propose the\ngSASRec model, an improvement over SASRec that deploys an increased number of\nnegatives and the gBCE loss. We show through detailed experiments on three\ndatasets that gSASRec does not exhibit the overconfidence problem. As a result,\ngSASRec can outperform BERT4Rec (e.g. +9.47% NDCG on the MovieLens-1M dataset),\nwhile requiring less training time (e.g. -73% training time on MovieLens-1M).\nMoreover, in contrast to BERT4Rec, gSASRec is suitable for large datasets that\ncontain more than 1 million items.\n","authors":["Aleksandr Petrov","Craig Macdonald"],"pdf_url":"https://arxiv.org/pdf/2308.07192v1.pdf","comment":"Accepted at ACM RecSys 2023"},{"id":"http://arxiv.org/abs/2308.07134v1","updated":"2023-08-14T13:41:09Z","published":"2023-08-14T13:41:09Z","title":"Natural Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models, such as ChatGPT,\nhas revolutionized various research fields in artificial intelligence.\nTransformer-based large language models (LLMs) have gradually replaced CNNs\nand RNNs to unify fields of computer vision and natural language processing.\nCompared with data that exists relatively independently, such as images,\nvideos, or texts, a graph is a type of data that contains rich structural and\nrelational information. Meanwhile, natural language, as one of the most\nexpressive mediums, excels in describing complex structures. However, existing\nwork on incorporating graph learning problems into the generative language\nmodeling framework remains very limited. As the importance of language models\ncontinues to grow, it becomes essential to explore whether LLMs can also\nreplace GNNs as the foundational model for graphs. In this paper, we propose\nInstructGLM (Instruction-finetuned Graph Language Model), systematically design\nhighly scalable prompts based on natural language instructions, and use natural\nlanguage to describe the geometric structure and node features of the graph for\ninstruction tuning an LLM to perform learning and inference on graphs in a\ngenerative manner. 
Our method exceeds all competitive GNN baselines on\nogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of\nour method and sheds light on generative language models replacing GNNs as the\nfoundation model for graph machine learning.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v1.pdf","comment":"21 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.07107v1","updated":"2023-08-14T12:47:22Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. Additionally, we explore promising directions within\nthis expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15464v4","updated":"2023-08-14T11:30:51Z","published":"2023-07-28T10:34:47Z","title":"Framework to Automatically Determine the Quality of Open Data Catalogs","summary":" Data catalogs play a crucial role in modern data-driven organizations by\nfacilitating the discovery, understanding, and utilization of diverse data\nassets. However, ensuring their quality and reliability is complex, especially\nin open and large-scale data environments. This paper proposes a framework to\nautomatically determine the quality of open data catalogs, addressing the need\nfor efficient and reliable quality assessment mechanisms. Our framework can\nanalyze various core quality dimensions, such as accuracy, completeness,\nconsistency, scalability, and timeliness, offer several alternatives for the\nassessment of compatibility and similarity across such catalogs as well as the\nimplementation of a set of non-core quality dimensions such as provenance,\nreadability, and licensing. 
The goal is to empower data-driven organizations to\nmake informed decisions based on trustworthy and well-curated data assets. The\nsource code that illustrates our approach can be downloaded from\nhttps://www.github.com/jorge-martinez-gil/dataq/.\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.15464v4.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2306.07946v2","updated":"2023-08-14T11:09:09Z","published":"2023-06-02T14:47:56Z","title":"STUDY: Socially Aware Temporally Causal Decoder Recommender Systems","summary":" Recommender systems are widely used to help people find items that are\ntailored to their interests. These interests are often influenced by social\nnetworks, making it important to use social network information effectively in\nrecommender systems. This is especially true for demographic groups with\ninterests that differ from the majority. This paper introduces STUDY, a\nSocially-aware Temporally caUsal Decoder recommender sYstem. STUDY introduces a\nnew socially-aware recommender system architecture that is significantly more\nefficient to learn and train than existing methods. STUDY performs joint\ninference over socially connected groups in a single forward pass of a modified\ntransformer decoder network. We demonstrate the benefits of STUDY in the\nrecommendation of books for students who are dyslexic, or struggling readers.\nDyslexic students often have difficulty engaging with reading material, making\nit critical to recommend books that are tailored to their interests. We worked\nwith our non-profit partner Learning Ally to evaluate STUDY on a dataset of\nstruggling readers. STUDY was able to generate recommendations that more\naccurately predicted student engagement, when compared with existing methods.\n","authors":["Eltayeb Ahmed","Diana Mincu","Lauren Harrell","Katherine Heller","Subhrajit Roy"],"pdf_url":"https://arxiv.org/pdf/2306.07946v2.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.07048v1","updated":"2023-08-14T10:18:24Z","published":"2023-08-14T10:18:24Z","title":"UIPC-MF: User-Item Prototype Connection Matrix Factorization for\n Explainable Collaborative Filtering","summary":" Recommending items to potentially interested users has been an important\ncommercial task that faces two main challenges: accuracy and explainability.\nWhile most collaborative filtering models rely on statistical computations on a\nlarge scale of interaction data between users and items and can achieve high\nperformance, they often lack clear explanatory power. We propose UIPC-MF, a\nprototype-based matrix factorization method for explainable collaborative\nfiltering recommendations. In UIPC-MF, both users and items are associated with\nsets of prototypes, capturing general collaborative attributes. To enhance\nexplainability, UIPC-MF learns connection weights that reflect the associative\nrelations between user and item prototypes for recommendations. 
UIPC-MF\noutperforms other prototype-based baseline methods in terms of Hit Ratio and\nNormalized Discounted Cumulative Gain on three datasets, while also providing\nbetter transparency.\n","authors":["Lei Pan","Von-Wun Soo"],"pdf_url":"https://arxiv.org/pdf/2308.07048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05379v2","updated":"2023-08-14T09:49:21Z","published":"2023-08-10T06:52:53Z","title":"Beyond Semantics: Learning a Behavior Augmented Relevance Model with\n Self-supervised Learning","summary":" Relevance modeling aims to locate desirable items for corresponding queries,\nwhich is crucial for search engines to ensure user experience. Although most\nconventional approaches address this problem by assessing the semantic\nsimilarity between the query and item, pure semantic matching is not\neverything. In reality, auxiliary query-item interactions extracted from user\nhistorical behavior data of the search log could provide hints to reveal users'\nsearch intents further. Drawing inspiration from this, we devise a novel\nBehavior Augmented Relevance Learning model for Alipay Search (BARL-ASe) that\nleverages neighbor queries of target item and neighbor items of target query to\ncomplement target query-item semantic matching. Specifically, our model builds\nmulti-level co-attention for distilling coarse-grained and fine-grained\nsemantic representations from both neighbor and target views. The model\nsubsequently employs neighbor-target self-supervised learning to improve the\naccuracy and robustness of BARL-ASe by strengthening representation and logit\nlearning. Furthermore, we discuss how to deal with the long-tail query-item\nmatching of the mini apps search scenario of Alipay practically. Experiments on\nreal-world industry data and online A/B testing demonstrate our proposal\nachieves promising performance with low latency.\n","authors":["Zeyuan Chen","Wei Chen","Jia Xu","Zhongyi Liu","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.05379v2.pdf","comment":"Accepted by CIKM2023"},{"id":"http://arxiv.org/abs/2308.07001v1","updated":"2023-08-14T08:32:22Z","published":"2023-08-14T08:32:22Z","title":"The Scientometrics and Reciprocality Underlying Co-Authorship Panels in\n Google Scholar Profiles","summary":" Online academic profiles are used by scholars to reflect a desired image to\ntheir online audience. In Google Scholar, scholars can select a subset of\nco-authors for presentation in a central location on their profile using a\nsocial feature called the Co-authroship panel. In this work, we examine whether\nscientometrics and reciprocality can explain the observed selections. To this\nend, we scrape and thoroughly analyze a novel set of 120,000 Google Scholar\nprofiles, ranging across four disciplines and various academic institutions.\nOur results suggest that scholars tend to favor co-authors with higher\nscientometrics over others for inclusion in their co-authorship panels.\nInterestingly, as one's own scientometrics are higher, the tendency to include\nco-authors with high scientometrics is diminishing. 
Furthermore, we find that\nreciprocality is central to explaining scholars' selections.\n","authors":["Ariel Alexi","Teddy Lazebnik","Ariel Rosenfeld"],"pdf_url":"https://arxiv.org/pdf/2308.07001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04067v2","updated":"2023-08-14T07:53:26Z","published":"2023-08-08T06:04:17Z","title":"Online Distillation-enhanced Multi-modal Transformer for Sequential\n Recommendation","summary":" Multi-modal recommendation systems, which integrate diverse types of\ninformation, have gained widespread attention in recent years. However,\ncompared to traditional collaborative filtering-based multi-modal\nrecommendation systems, research on multi-modal sequential recommendation is\nstill in its nascent stages. Unlike traditional sequential recommendation\nmodels that solely rely on item identifier (ID) information and focus on\nnetwork structure design, multi-modal recommendation models need to emphasize\nitem representation learning and the fusion of heterogeneous data sources. This\npaper investigates the impact of item representation learning on downstream\nrecommendation tasks and examines the disparities in information fusion at\ndifferent stages. Empirical experiments are conducted to demonstrate the need\nto design a framework suitable for collaborative learning and fusion of diverse\ninformation. Based on this, we propose a new model-agnostic framework for\nmulti-modal sequential recommendation tasks, called Online\nDistillation-enhanced Multi-modal Transformer (ODMT), to enhance feature\ninteraction and mutual learning among multi-source input (ID, text, and image),\nwhile avoiding conflicts among different features during training, thereby\nimproving recommendation accuracy. To be specific, we first introduce an\nID-aware Multi-modal Transformer module in the item representation learning\nstage to facilitate information interaction among different features. Secondly,\nwe employ an online distillation training strategy in the prediction\noptimization stage to make multi-source data learn from each other and improve\nprediction robustness. Experimental results on a stream media recommendation\ndataset and three e-commerce recommendation datasets demonstrate the\neffectiveness of the proposed two modules, which is approximately 10%\nimprovement in performance compared to baseline models.\n","authors":["Wei Ji","Xiangyan Liu","An Zhang","Yinwei Wei","Yongxin Ni","Xiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.04067v2.pdf","comment":"11 pages, 7 figures, accepted in ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.06982v1","updated":"2023-08-14T07:35:14Z","published":"2023-08-14T07:35:14Z","title":"Discrete Conditional Diffusion for Reranking in Recommendation","summary":" Reranking plays a crucial role in modern multi-stage recommender systems by\nrearranging the initial ranking list to model interplay between items.\nConsidering the inherent challenges of reranking such as combinatorial\nsearching space, some previous studies have adopted the evaluator-generator\nparadigm, with a generator producing feasible sequences and a evaluator\nselecting the best one based on estimated listwise utility. Inspired by the\nremarkable success of diffusion generative models, this paper explores the\npotential of diffusion models for generating high-quality sequences in\nreranking. However, we argue that it is nontrivial to take diffusion models as\nthe generator in the context of recommendation. 
Firstly, diffusion models\nprimarily operate in continuous data space, differing from the discrete data\nspace of item permutations. Secondly, the recommendation task is different from\nconventional generation tasks as the purpose of recommender systems is to\nfulfill user interests. Lastly, real-life recommender systems require\nefficiency, posing challenges for the inference of diffusion models. To\novercome these challenges, we propose a novel Discrete Conditional Diffusion\nReranking (DCDR) framework for recommendation. DCDR extends traditional\ndiffusion models by introducing a discrete forward process with tractable\nposteriors, which adds noise to item sequences through step-wise discrete\noperations (e.g., swapping). Additionally, DCDR incorporates a conditional\nreverse process that generates item sequences conditioned on expected user\nresponses. Extensive offline experiments conducted on public datasets\ndemonstrate that DCDR outperforms state-of-the-art reranking methods.\nFurthermore, DCDR has been deployed in a real-world video app with over 300\nmillion daily active users, significantly enhancing online recommendation\nquality.\n","authors":["Xiao Lin","Xiaokai Chen","Chenyang Wang","Hantao Shu","Linfeng Song","Biao Li","Peng jiang"],"pdf_url":"https://arxiv.org/pdf/2308.06982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06965v1","updated":"2023-08-14T06:43:59Z","published":"2023-08-14T06:43:59Z","title":"AutoAssign+: Automatic Shared Embedding Assignment in Streaming\n Recommendation","summary":" In the domain of streaming recommender systems, conventional methods for\naddressing new user IDs or item IDs typically involve assigning initial ID\nembeddings randomly. However, this practice results in two practical\nchallenges: (i) Items or users with limited interactive data may yield\nsuboptimal prediction performance. (ii) Embedding new IDs or low-frequency IDs\nnecessitates consistently expanding the embedding table, leading to unnecessary\nmemory consumption. In light of these concerns, we introduce a reinforcement\nlearning-driven framework, namely AutoAssign+, that facilitates Automatic\nShared Embedding Assignment Plus. To be specific, AutoAssign+ utilizes an\nIdentity Agent as an actor network, which plays a dual role: (i) Representing\nlow-frequency IDs field-wise with a small set of shared embeddings to enhance\nthe embedding initialization, and (ii) Dynamically determining which ID\nfeatures should be retained or eliminated in the embedding table. The policy of\nthe agent is optimized with the guidance of a critic network. To evaluate the\neffectiveness of our approach, we perform extensive experiments on three\ncommonly used benchmark datasets. Our experiment results demonstrate that\nAutoAssign+ is capable of significantly enhancing recommendation performance by\nmitigating the cold-start problem. Furthermore, our framework yields a\nreduction in memory usage of approximately 20-30%, verifying its practical\neffectiveness and efficiency for streaming recommender systems.\n","authors":["Ziru Liu","Kecheng Chen","Fengyi Song","Bo Chen","Xiangyu Zhao","Huifeng Guo","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2308.06965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07705v4","updated":"2023-08-14T03:48:45Z","published":"2023-06-13T11:46:37Z","title":"KuaiSAR: A Unified Search And Recommendation Dataset","summary":" The confluence of Search and Recommendation (S&R) services is vital to online\nservices, including e-commerce and video platforms. 
The integration of S&R\nmodeling is a highly intuitive approach adopted by industry practitioners.\nHowever, there is a noticeable lack of research conducted in this area within\nacademia, primarily due to the absence of publicly available datasets.\nConsequently, a substantial gap has emerged between academia and industry\nregarding research endeavors in joint optimization using user behavior data\nfrom both S&R services. To bridge this gap, we introduce the first large-scale,\nreal-world dataset KuaiSAR of integrated Search And Recommendation behaviors\ncollected from Kuaishou, a leading short-video app in China with over 350\nmillion daily active users. Previous research in this field has predominantly\nemployed publicly available semi-synthetic datasets and simulated, with\nartificially fabricated search behaviors. Distinct from previous datasets,\nKuaiSAR contains genuine user behaviors, including the occurrence of each\ninteraction within either search or recommendation service, and the users'\ntransitions between the two services. This work aids in joint modeling of S&R,\nand utilizing search data for recommender systems (and recommendation data for\nsearch engines). Furthermore, due to the various feedback labels associated\nwith user-video interactions, KuaiSAR also supports a broad range of tasks,\nincluding intent recommendation, multi-task learning, and modeling of long\nsequential multi-behavioral patterns. We believe this dataset will serve as a\ncatalyst for innovative research and bridge the gap between academia and\nindustry in understanding the S&R services in practical, real-world\napplications.\n","authors":["Zhongxiang Sun","Zihua Si","Xiaoxue Zang","Dewei Leng","Yanan Niu","Yang Song","Xiao Zhang","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2306.07705v4.pdf","comment":"CIKM 2023 resource track"},{"id":"http://arxiv.org/abs/2308.05508v2","updated":"2023-08-14T01:48:12Z","published":"2023-08-10T11:41:34Z","title":"Multi-domain Recommendation with Embedding Disentangling and Domain\n Alignment","summary":" Multi-domain recommendation (MDR) aims to provide recommendations for\ndifferent domains (e.g., types of products) with overlapping users/items and is\ncommon for platforms such as Amazon, Facebook, and LinkedIn that host multiple\nservices. Existing MDR models face two challenges: First, it is difficult to\ndisentangle knowledge that generalizes across domains (e.g., a user likes cheap\nitems) and knowledge specific to a single domain (e.g., a user likes blue\nclothing but not blue cars). Second, they have limited ability to transfer\nknowledge across domains with small overlaps. We propose a new MDR method named\nEDDA with two key components, i.e., embedding disentangling recommender and\ndomain alignment, to tackle the two challenges respectively. In particular, the\nembedding disentangling recommender separates both the model and embedding for\nthe inter-domain part and the intra-domain part, while most existing MDR\nmethods only focus on model-level disentangling. The domain alignment leverages\nrandom walks from graph processing to identify similar user/item pairs from\ndifferent domains and encourages similar user/item pairs to have similar\nembeddings, enhancing knowledge transfer. We compare EDDA with 12\nstate-of-the-art baselines on 3 real datasets. The results show that EDDA\nconsistently outperforms the baselines on all datasets and domains. 
All\ndatasets and codes are available at https://github.com/Stevenn9981/EDDA.\n","authors":["Wentao Ning","Xiao Yan","Weiwen Liu","Reynold Cheng","Rui Zhang","Bo Tang"],"pdf_url":"https://arxiv.org/pdf/2308.05508v2.pdf","comment":"Accepted by CIKM'23 as a Long paper"},{"id":"http://arxiv.org/abs/2308.06885v1","updated":"2023-08-14T01:37:02Z","published":"2023-08-14T01:37:02Z","title":"Bridging Offline-Online Evaluation with a Time-dependent and Popularity\n Bias-free Offline Metric for Recommenders","summary":" The evaluation of recommendation systems is a complex task. The offline and\nonline evaluation metrics for recommender systems are ambiguous in their true\nobjectives. The majority of recently published papers benchmark their methods\nusing ill-posed offline evaluation methodology that often fails to predict true\nonline performance. Because of this, the impact that academic research has on\nthe industry is reduced. The aim of our research is to investigate and compare\nthe online performance of offline evaluation metrics. We show that penalizing\npopular items and considering the time of transactions during the evaluation\nsignificantly improves our ability to choose the best recommendation model for\na live recommender system. Our results, averaged over five large-size\nreal-world live data procured from recommenders, aim to help the academic\ncommunity to understand better offline evaluation and optimization criteria\nthat are more relevant for real applications of recommender systems.\n","authors":["Petr Kasalický","Rodrigo Alves","Pavel Kordík"],"pdf_url":"https://arxiv.org/pdf/2308.06885v1.pdf","comment":"Accepted to evalRS 2023@KDD"},{"id":"http://arxiv.org/abs/2308.06878v1","updated":"2023-08-14T01:23:37Z","published":"2023-08-14T01:23:37Z","title":"AutoSeqRec: Autoencoder for Efficient Sequential Recommendation","summary":" Sequential recommendation demonstrates the capability to recommend items by\nmodeling the sequential behavior of users. Traditional methods typically treat\nusers as sequences of items, overlooking the collaborative relationships among\nthem. Graph-based methods incorporate collaborative information by utilizing\nthe user-item interaction graph. However, these methods sometimes face\nchallenges in terms of time complexity and computational efficiency. To address\nthese limitations, this paper presents AutoSeqRec, an incremental\nrecommendation model specifically designed for sequential recommendation tasks.\nAutoSeqRec is based on autoencoders and consists of an encoder and three\ndecoders within the autoencoder architecture. These components consider both\nthe user-item interaction matrix and the rows and columns of the item\ntransition matrix. The reconstruction of the user-item interaction matrix\ncaptures user long-term preferences through collaborative filtering. In\naddition, the rows and columns of the item transition matrix represent the item\nout-degree and in-degree hopping behavior, which allows for modeling the user's\nshort-term interests. When making incremental recommendations, only the input\nmatrices need to be updated, without the need to update parameters, which makes\nAutoSeqRec very efficient. 
Comprehensive evaluations demonstrate that\nAutoSeqRec outperforms existing methods in terms of accuracy, while showcasing\nits robustness and efficiency.\n","authors":["Sijia Liu","Jiahao Liu","Hansu Gu","Dongsheng Li","Tun Lu","Peng Zhang","Ning Gu"],"pdf_url":"https://arxiv.org/pdf/2308.06878v1.pdf","comment":"10 pages, accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2308.07426v1","updated":"2023-08-14T19:36:57Z","published":"2023-08-14T19:36:57Z","title":"A Survey on Point-of-Interest Recommendations Leveraging Heterogeneous\n Data","summary":" Tourism is an important application domain for recommender systems. In this\ndomain, recommender systems are for example tasked with providing personalized\nrecommendations for transportation, accommodation, points-of-interest (POIs),\nor tourism services. Among these tasks, in particular the problem of\nrecommending POIs that are of likely interest to individual tourists has gained\ngrowing attention in recent years. Providing POI recommendations to tourists\n\\emph{during their trip} can however be especially challenging due to the\nvariability of the users' context. With the rapid development of the Web and\ntoday's multitude of online services, vast amounts of data from various sources\nhave become available, and these heterogeneous data sources represent a huge\npotential to better address the challenges of in-trip POI recommendation\nproblems. In this work, we provide a comprehensive survey of published research\non POI recommendation between 2017 and 2022 from the perspective of\nheterogeneous data sources. Specifically, we investigate which types of data\nare used in the literature and which technical approaches and evaluation\nmethods are predominant. Among other aspects, we find that today's research\nworks often focus on a narrow range of data sources, leaving great potential\nfor future works that better utilize heterogeneous data sources and diverse\ndata types for improved in-trip recommendations.\n","authors":["Zehui Wang","Wolfram Höpken","Dietmar Jannach"],"pdf_url":"https://arxiv.org/pdf/2308.07426v1.pdf","comment":"35 pages, 19 figures, submitted to Information Technology & Tourism\n (ITT)"},{"id":"http://arxiv.org/abs/2308.07359v1","updated":"2023-08-14T14:56:07Z","published":"2023-08-14T14:56:07Z","title":"Improving ICD-based semantic similarity by accounting for varying\n degrees of comorbidity","summary":" Finding similar patients is a common objective in precision medicine,\nfacilitating treatment outcome assessment and clinical decision support.\nChoosing widely-available patient features and appropriate mathematical methods\nfor similarity calculations is crucial. International Statistical\nClassification of Diseases and Related Health Problems (ICD) codes are used\nworldwide to encode diseases and are available for nearly all patients.\nAggregated as sets consisting of primary and secondary diagnoses they can\ndisplay a degree of comorbidity and reveal comorbidity patterns. It is possible\nto compute the similarity of patients based on their ICD codes by using\nsemantic similarity algorithms. These algorithms have been traditionally\nevaluated using a single-term expert rated data set.\n However, real-word patient data often display varying degrees of documented\ncomorbidities that might impair algorithm performance. To account for this, we\npresent a scale term that considers documented comorbidity-variance. 
In this\nwork, we compared the performance of 80 combinations of established algorithms\nin terms of semantic similarity based on ICD-code sets. The sets have been\nextracted from patients with a C25.X (pancreatic cancer) primary diagnosis and\nprovide a variety of different combinations of ICD-codes. Using our scale term\nwe yielded the best results with a combination of level-based information\ncontent, Leacock & Chodorow concept similarity and bipartite graph matching for\nthe set similarities reaching a correlation of 0.75 with our expert's ground\ntruth. Our results highlight the importance of accounting for comorbidity\nvariance while demonstrating how well current semantic similarity algorithms\nperform.\n","authors":["Jan Janosch Schneider","Marius Adler","Christoph Ammer-Herrmenau","Alexander Otto König","Ulrich Sax","Jonas Hügel"],"pdf_url":"https://arxiv.org/pdf/2308.07359v1.pdf","comment":"11 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.08499v1","updated":"2023-08-14T14:40:13Z","published":"2023-08-14T14:40:13Z","title":"Context-Aware Service Recommendation System for the Social Internet of\n Things","summary":" The Social Internet of Things (SIoT) enables interconnected smart devices to\nshare data and services, opening up opportunities for personalized service\nrecommendations. However, existing research often overlooks crucial aspects\nthat can enhance the accuracy and relevance of recommendations in the SIoT\ncontext. Specifically, existing techniques tend to consider the extraction of\nsocial relationships between devices and neglect the contextual presentation of\nservice reviews. This study aims to address these gaps by exploring the\ncontextual representation of each device-service pair. Firstly, we propose a\nlatent features combination technique that can capture latent feature\ninteractions, by aggregating the device-device relationships within the SIoT.\nThen, we leverage Factorization Machines to model higher-order feature\ninteractions specific to each SIoT device-service pair to accomplish accurate\nrating prediction. Finally, we propose a service recommendation framework for\nSIoT based on review aggregation and feature learning processes. The\nexperimental evaluation demonstrates the framework's effectiveness in improving\nservice recommendation accuracy and relevance.\n","authors":["Amar Khelloufi","Huansheng Ning","Abdelkarim Ben Sada","Abdenacer Naouri","Sahraoui Dhelim"],"pdf_url":"https://arxiv.org/pdf/2308.08499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08459v1","updated":"2023-08-14T14:31:33Z","published":"2023-08-14T14:31:33Z","title":"Knowledge Prompt-tuning for Sequential Recommendation","summary":" Pre-trained language models (PLMs) have demonstrated strong performance in\nsequential recommendation (SR), which are utilized to extract general\nknowledge. However, existing methods still lack domain knowledge and struggle\nto capture users' fine-grained preferences. Meanwhile, many traditional SR\nmethods improve this issue by integrating side information while suffering from\ninformation loss. To summarize, we believe that a good recommendation system\nshould utilize both general and domain knowledge simultaneously. Therefore, we\nintroduce an external knowledge base and propose Knowledge Prompt-tuning for\nSequential Recommendation (\\textbf{KP4SR}). Specifically, we construct a set of\nrelationship templates and transform a structured knowledge graph (KG) into\nknowledge prompts to solve the problem of the semantic gap. 
However, knowledge\nprompts disrupt the original data structure and introduce a significant amount\nof noise. We further construct a knowledge tree and propose a knowledge tree\nmask, which restores the data structure in a mask matrix form, thus mitigating\nthe noise problem. We evaluate KP4SR on three real-world datasets, and\nexperimental results show that our approach outperforms state-of-the-art\nmethods on multiple evaluation metrics. Specifically, compared with PLM-based\nmethods, our method improves NDCG@5 and HR@5 by \\textcolor{red}{40.65\\%} and\n\\textcolor{red}{36.42\\%} on the books dataset, \\textcolor{red}{11.17\\%} and\n\\textcolor{red}{11.47\\%} on the music dataset, and \\textcolor{red}{22.17\\%} and\n\\textcolor{red}{19.14\\%} on the movies dataset, respectively. Our code is\npublicly available at the link:\n\\href{https://github.com/zhaijianyang/KP4SR}{\\textcolor{blue}{https://github.com/zhaijianyang/KP4SR}.}\n","authors":["Jianyang Zhai","Xiawu Zheng","Chang-Dong Wang","Hui Li","Yonghong Tian"],"pdf_url":"https://arxiv.org/pdf/2308.08459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08497v1","updated":"2023-08-14T14:04:57Z","published":"2023-08-14T14:04:57Z","title":"HyperBandit: Contextual Bandit with Hypernewtork for Time-Varying User\n Preferences in Streaming Recommendation","summary":" In real-world streaming recommender systems, user preferences often\ndynamically change over time (e.g., a user may have different preferences\nduring weekdays and weekends). Existing bandit-based streaming recommendation\nmodels only consider time as a timestamp, without explicitly modeling the\nrelationship between time variables and time-varying user preferences. This\nleads to recommendation models that cannot quickly adapt to dynamic scenarios.\nTo address this issue, we propose a contextual bandit approach using\nhypernetwork, called HyperBandit, which takes time features as input and\ndynamically adjusts the recommendation model for time-varying user preferences.\nSpecifically, HyperBandit maintains a neural network capable of generating the\nparameters for estimating time-varying rewards, taking into account the\ncorrelation between time features and user preferences. Using the estimated\ntime-varying rewards, a bandit policy is employed to make online\nrecommendations by learning the latent item contexts. To meet the real-time\nrequirements in streaming recommendation scenarios, we have verified the\nexistence of a low-rank structure in the parameter matrix and utilize low-rank\nfactorization for efficient training. Theoretically, we demonstrate a sublinear\nregret upper bound against the best policy. Extensive experiments on real-world\ndatasets show that the proposed HyperBandit consistently outperforms the\nstate-of-the-art baselines in terms of accumulated rewards.\n","authors":["Chenglei Shen","Xiao Zhang","Wei Wei","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2308.08497v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2305.03153v2","updated":"2023-08-14T17:38:23Z","published":"2023-05-04T21:04:19Z","title":"G-MATT: Single-step Retrosynthesis Prediction using Molecular Grammar\n Tree Transformer","summary":" Various template-based and template-free approaches have been proposed for\nsingle-step retrosynthesis prediction in recent years. While these approaches\ndemonstrate strong performance from a data-driven metrics standpoint, many\nmodel architectures do not incorporate underlying chemistry principles. 
Here,\nwe propose a novel chemistry-aware retrosynthesis prediction framework that\ncombines powerful data-driven models with prior domain knowledge. We present a\ntree-to-sequence transformer architecture that utilizes hierarchical SMILES\ngrammar-based trees, incorporating crucial chemistry information that is often\noverlooked by SMILES text-based representations, such as local structures and\nfunctional groups. The proposed framework, grammar-based molecular attention\ntree transformer (G-MATT), achieves significant performance improvements\ncompared to baseline retrosynthesis models. G-MATT achieves a promising top-1\naccuracy of 51% (top-10 accuracy of 79.1%), invalid rate of 1.5%, and bioactive\nsimilarity rate of 74.8% on the USPTO- 50K dataset. Additional analyses of\nG-MATT attention maps demonstrate the ability to retain chemistry knowledge\nwithout relying on excessively complex model architectures.\n","authors":["Kevin Zhang","Vipul Mann","Venkat Venkatasubramanian"],"pdf_url":"https://arxiv.org/pdf/2305.03153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07293v1","updated":"2023-08-14T17:29:41Z","published":"2023-08-14T17:29:41Z","title":"DiffSED: Sound Event Detection with Denoising Diffusion","summary":" Sound Event Detection (SED) aims to predict the temporal boundaries of all\nthe events of interest and their class labels, given an unconstrained audio\nsample. Taking either the splitand-classify (i.e., frame-level) strategy or the\nmore principled event-level modeling approach, all existing methods consider\nthe SED problem from the discriminative learning perspective. In this work, we\nreformulate the SED problem by taking a generative learning perspective.\nSpecifically, we aim to generate sound temporal boundaries from noisy proposals\nin a denoising diffusion process, conditioned on a target audio sample. During\ntraining, our model learns to reverse the noising process by converting noisy\nlatent queries to the groundtruth versions in the elegant Transformer decoder\nframework. Doing so enables the model generate accurate event boundaries from\neven noisy queries during inference. Extensive experiments on the Urban-SED and\nEPIC-Sounds datasets demonstrate that our model significantly outperforms\nexisting alternatives, with 40+% faster convergence in training.\n","authors":["Swapnil Bhosale","Sauradip Nag","Diptesh Kanojia","Jiankang Deng","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.07293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07539v2","updated":"2023-08-14T17:22:21Z","published":"2023-07-14T13:56:11Z","title":"On the Sublinear Regret of GP-UCB","summary":" In the kernelized bandit problem, a learner aims to sequentially compute the\noptimum of a function lying in a reproducing kernel Hilbert space given only\nnoisy evaluations at sequentially chosen points. In particular, the learner\naims to minimize regret, which is a measure of the suboptimality of the choices\nmade. Arguably the most popular algorithm is the Gaussian Process Upper\nConfidence Bound (GP-UCB) algorithm, which involves acting based on a simple\nlinear estimator of the unknown function. Despite its popularity, existing\nanalyses of GP-UCB give a suboptimal regret rate, which fails to be sublinear\nfor many commonly used kernels such as the Mat\\'ern kernel. This has led to a\nlongstanding open question: are existing regret analyses for GP-UCB tight, or\ncan bounds be improved by using more sophisticated analytical techniques? 
In\nthis work, we resolve this open question and show that GP-UCB enjoys nearly\noptimal regret. In particular, our results yield sublinear regret rates for the\nMat\\'ern kernel, improving over the state-of-the-art analyses and partially\nresolving a COLT open problem posed by Vakili et al. Our improvements rely on a\nkey technical contribution -- regularizing kernel ridge estimators in\nproportion to the smoothness of the underlying kernel $k$. Applying this key\nidea together with a largely overlooked concentration result in separable\nHilbert spaces (for which we provide an independent, simplified derivation), we\nare able to provide a tighter analysis of the GP-UCB algorithm.\n","authors":["Justin Whitehouse","Zhiwei Steven Wu","Aaditya Ramdas"],"pdf_url":"https://arxiv.org/pdf/2307.07539v2.pdf","comment":"20 pages, 0 figures"},{"id":"http://arxiv.org/abs/2308.07286v1","updated":"2023-08-14T17:17:21Z","published":"2023-08-14T17:17:21Z","title":"The Devil is in the Errors: Leveraging Large Language Models for\n Fine-grained Machine Translation Evaluation","summary":" Automatic evaluation of machine translation (MT) is a critical tool driving\nthe rapid iterative development of MT systems. While considerable progress has\nbeen made on estimating a single scalar quality score, current metrics lack the\ninformativeness of more detailed schemes that annotate individual errors, such\nas Multidimensional Quality Metrics (MQM). In this paper, we help fill this gap\nby proposing AutoMQM, a prompting technique which leverages the reasoning and\nin-context learning capabilities of large language models (LLMs) and asks them\nto identify and categorize errors in translations. We start by evaluating\nrecent LLMs, such as PaLM and PaLM-2, through simple score prediction\nprompting, and we study the impact of labeled data through in-context learning\nand finetuning. We then evaluate AutoMQM with PaLM-2 models, and we find that\nit improves performance compared to just prompting for scores (with\nparticularly large gains for larger models) while providing interpretability\nthrough error spans that align with human annotations.\n","authors":["Patrick Fernandes","Daniel Deutsch","Mara Finkelstein","Parker Riley","André F. T. Martins","Graham Neubig","Ankush Garg","Jonathan H. Clark","Markus Freitag","Orhan Firat"],"pdf_url":"https://arxiv.org/pdf/2308.07286v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2308.07284v1","updated":"2023-08-14T17:15:37Z","published":"2023-08-14T17:15:37Z","title":"Cross-Attribute Matrix Factorization Model with Shared User Embedding","summary":" Over the past few years, deep learning has firmly established its prowess\nacross various domains, including computer vision, speech recognition, and\nnatural language processing. Motivated by its outstanding success, researchers\nhave been directing their efforts towards applying deep learning techniques to\nrecommender systems. Neural collaborative filtering (NCF) and Neural Matrix\nFactorization (NeuMF) refreshes the traditional inner product in matrix\nfactorization with a neural architecture capable of learning complex and\ndata-driven functions. While these models effectively capture user-item\ninteractions, they overlook the specific attributes of both users and items.\nThis can lead to robustness issues, especially for items and users that belong\nto the \"long tail\". Such challenges are commonly recognized in recommender\nsystems as a part of the cold-start problem. 
A direct and intuitive approach to\naddress this issue is by leveraging the features and attributes of the items\nand users themselves. In this paper, we introduce a refined NeuMF model that\nconsiders not only the interaction between users and items, but also acrossing\nassociated attributes. Moreover, our proposed architecture features a shared\nuser embedding, seamlessly integrating with user embeddings to imporve the\nrobustness and effectively address the cold-start problem. Rigorous experiments\non both the Movielens and Pinterest datasets demonstrate the superiority of our\nCross-Attribute Matrix Factorization model, particularly in scenarios\ncharacterized by higher dataset sparsity.\n","authors":["Wen Liang","Zeng Fan","Youzhi Liang","Jianguo Jia"],"pdf_url":"https://arxiv.org/pdf/2308.07284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07273v1","updated":"2023-08-14T17:00:13Z","published":"2023-08-14T17:00:13Z","title":"Data-Efficient Energy-Aware Participant Selection for UAV-Enabled\n Federated Learning","summary":" Unmanned aerial vehicle (UAV)-enabled edge federated learning (FL) has\nsparked a rise in research interest as a result of the massive and\nheterogeneous data collected by UAVs, as well as the privacy concerns related\nto UAV data transmissions to edge servers. However, due to the redundancy of\nUAV collected data, e.g., imaging data, and non-rigorous FL participant\nselection, the convergence time of the FL learning process and bias of the FL\nmodel may increase. Consequently, we investigate in this paper the problem of\nselecting UAV participants for edge FL, aiming to improve the FL model's\naccuracy, under UAV constraints of energy consumption, communication quality,\nand local datasets' heterogeneity. We propose a novel UAV participant selection\nscheme, called data-efficient energy-aware participant selection strategy\n(DEEPS), which consists of selecting the best FL participant in each sub-region\nbased on the structural similarity index measure (SSIM) average score of its\nlocal dataset and its power consumption profile. Through experiments, we\ndemonstrate that the proposed selection scheme is superior to the benchmark\nrandom selection method, in terms of model accuracy, training time, and UAV\nenergy consumption.\n","authors":["Youssra Cheriguene","Wael Jaafar","Chaker Abdelaziz Kerrache","Halim Yanikomeroglu","Fatima Zohra Bousbaa","Nasreddine Lagraa"],"pdf_url":"https://arxiv.org/pdf/2308.07273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07272v1","updated":"2023-08-14T16:58:50Z","published":"2023-08-14T16:58:50Z","title":"Dialogue for Prompting: a Policy-Gradient-Based Discrete Prompt\n Optimization for Few-shot Learning","summary":" Prompt-based pre-trained language models (PLMs) paradigm have succeeded\nsubstantially in few-shot natural language processing (NLP) tasks. However,\nprior discrete prompt optimization methods require expert knowledge to design\nthe base prompt set and identify high-quality prompts, which is costly,\ninefficient, and subjective. Meanwhile, existing continuous prompt optimization\nmethods improve the performance by learning the ideal prompts through the\ngradient information of PLMs, whose high computational cost, and low\nreadability and generalizability are often concerning. To address the research\ngap, we propose a Dialogue-comprised Policy-gradient-based Discrete Prompt\nOptimization ($DP_2O$) method. 
We first design a multi-round dialogue alignment\nstrategy for readability prompt set generation based on GPT-4. Furthermore, we\npropose an efficient prompt screening metric to identify high-quality prompts\nwith linear complexity. Finally, we construct a reinforcement learning (RL)\nframework based on policy gradients to match the prompts to inputs optimally.\nBy training a policy network with only 0.67% of the PLM parameter size on the\ntasks in the few-shot setting, $DP_2O$ outperforms the state-of-the-art (SOTA)\nmethod by 1.52% in accuracy on average on four open-source datasets. Moreover,\nsubsequent experiments also demonstrate that $DP_2O$ has good universality,\nrobustness, and generalization ability.\n","authors":["Chengzhengxu Li","Xiaoming Liu","Yichen Wang","Duyi Li","Yu Lan","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.07272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07269v1","updated":"2023-08-14T16:52:42Z","published":"2023-08-14T16:52:42Z","title":"EasyEdit: An Easy-to-use Knowledge Editing Framework for Large Language\n Models","summary":" Large Language Models (LLMs) usually suffer from knowledge cutoff or fallacy\nissues, which means they are unaware of unseen events or generate text with\nincorrect facts owing to the outdated/noisy data. To this end, many knowledge\nediting approaches for LLMs have emerged -- aiming to subtly inject/edit\nupdated knowledge or adjust undesired behavior while minimizing the impact on\nunrelated inputs. Nevertheless, due to significant differences among various\nknowledge editing methods and the variations in task setups, there is no\nstandard implementation framework available for the community, which hinders\npractitioners to apply knowledge editing to applications. To address these\nissues, we propose EasyEdit, an easy-to-use knowledge editing framework for\nLLMs. It supports various cutting-edge knowledge editing approaches and can be\nreadily apply to many well-known LLMs such as T5, GPT-J, LlaMA, etc.\nEmpirically, we report the knowledge editing results on LlaMA-2 with EasyEdit,\ndemonstrating that knowledge editing surpasses traditional fine-tuning in terms\nof reliability and generalization. We have released the source code on GitHub\nat https://github.com/zjunlp/EasyEdit, along with Google Colab tutorials and\ncomprehensive documentation for beginners to get started. Besides, we present\nan online system for real-time knowledge editing, and a demo video at\nhttp://knowlm.zjukg.cn/easyedit.mp4.\n","authors":["Peng Wang","Ningyu Zhang","Xin Xie","Yunzhi Yao","Bozhong Tian","Mengru Wang","Zekun Xi","Siyuan Cheng","Kangwei Liu","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07269v1.pdf","comment":"The project website is https://github.com/zjunlp/EasyEdit"},{"id":"http://arxiv.org/abs/2305.12622v2","updated":"2023-08-14T16:49:03Z","published":"2023-05-22T01:27:51Z","title":"Evaluating the Impact of Social Determinants on Health Prediction in the\n Intensive Care Unit","summary":" Social determinants of health (SDOH) -- the conditions in which people live,\ngrow, and age -- play a crucial role in a person's health and well-being. There\nis a large, compelling body of evidence in population health studies showing\nthat a wide range of SDOH is strongly correlated with health outcomes. Yet, a\nmajority of the risk prediction models based on electronic health records (EHR)\ndo not incorporate a comprehensive set of SDOH features as they are often noisy\nor simply unavailable. 
Our work links a publicly available EHR database,\nMIMIC-IV, to well-documented SDOH features. We investigate the impact of such\nfeatures on common EHR prediction tasks across different patient populations.\nWe find that community-level SDOH features do not improve model performance for\na general patient population, but can improve data-limited model fairness for\nspecific subpopulations. We also demonstrate that SDOH features are vital for\nconducting thorough audits of algorithmic biases beyond protective attributes.\nWe hope the new integrated EHR-SDOH database will enable studies on the\nrelationship between community health and individual outcomes and provide new\nbenchmarks to study algorithmic biases beyond race, gender, and age.\n","authors":["Ming Ying Yang","Gloria Hyunjung Kwak","Tom Pollard","Leo Anthony Celi","Marzyeh Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2305.12622v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07250v1","updated":"2023-08-14T16:34:47Z","published":"2023-08-14T16:34:47Z","title":"LCE -- An Augmented Combination of Bagging and Boosting in Python","summary":" lcensemble is a high-performing, scalable and user-friendly Python package\nfor the general tasks of classification and regression. The package implements\nLocal Cascade Ensemble (LCE), a machine learning method that further enhances\nthe prediction performance of the current state-of-the-art methods Random\nForest and XGBoost. LCE combines their strengths and adopts a complementary\ndiversification approach to obtain a better generalizing predictor. The package\nis compatible with scikit-learn, therefore it can interact with scikit-learn\npipelines and model selection tools. It is distributed under the Apache 2.0\nlicense, and its source code is available at\nhttps://github.com/LocalCascadeEnsemble/LCE.\n","authors":["Kevin Fauvel","Élisa Fromont","Véronique Masson","Philippe Faverdin","Alexandre Termier"],"pdf_url":"https://arxiv.org/pdf/2308.07250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03202v2","updated":"2023-08-14T16:33:43Z","published":"2023-08-06T20:19:06Z","title":"Source-free Domain Adaptive Human Pose Estimation","summary":" Human Pose Estimation (HPE) is widely used in various fields, including\nmotion analysis, healthcare, and virtual reality. However, the great expenses\nof labeled real-world datasets present a significant challenge for HPE. To\novercome this, one approach is to train HPE models on synthetic datasets and\nthen perform domain adaptation (DA) on real-world data. Unfortunately, existing\nDA methods for HPE neglect data privacy and security by using both source and\ntarget data in the adaptation process. To this end, we propose a new task,\nnamed source-free domain adaptive HPE, which aims to address the challenges of\ncross-domain learning of HPE without access to source data during the\nadaptation process. We further propose a novel framework that consists of three\nmodels: source model, intermediate model, and target model, which explores the\ntask from both source-protect and target-relevant perspectives. The\nsource-protect module preserves source information more effectively while\nresisting noise, and the target-relevant module reduces the sparsity of spatial\nrepresentations by building a novel spatial probability space, and\npose-specific contrastive learning and information maximization are proposed on\nthe basis of this space. 
Comprehensive experiments on several domain adaptive\nHPE benchmarks show that the proposed method outperforms existing approaches by\na considerable margin.\n","authors":["Qucheng Peng","Ce Zheng","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03202v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07247v1","updated":"2023-08-14T16:32:24Z","published":"2023-08-14T16:32:24Z","title":"Can we Agree? On the Rashōmon Effect and the Reliability of Post-Hoc\n Explainable AI","summary":" The Rash\\=omon effect poses challenges for deriving reliable knowledge from\nmachine learning models. This study examined the influence of sample size on\nexplanations from models in a Rash\\=omon set using SHAP. Experiments on 5\npublic datasets showed that explanations gradually converged as the sample size\nincreased. Explanations from <128 samples exhibited high variability, limiting\nreliable knowledge extraction. However, agreement between models improved with\nmore data, allowing for consensus. Bagging ensembles often had higher\nagreement. The results provide guidance on sufficient data to trust\nexplanations. Variability at low samples suggests that conclusions may be\nunreliable without validation. Further work is needed with more model types,\ndata domains, and explanation methods. Testing convergence in neural networks\nand with model-specific explanation methods would be impactful. The approaches\nexplored here point towards principled techniques for eliciting knowledge from\nambiguous models.\n","authors":["Clement Poiret","Antoine Grigis","Justin Thomas","Marion Noulhiane"],"pdf_url":"https://arxiv.org/pdf/2308.07247v1.pdf","comment":"13 pages, 6 figures and 6 tables"},{"id":"http://arxiv.org/abs/2308.07233v1","updated":"2023-08-14T16:16:31Z","published":"2023-08-14T16:16:31Z","title":"A Unifying Generator Loss Function for Generative Adversarial Networks","summary":" A unifying $\\alpha$-parametrized generator loss function is introduced for a\ndual-objective generative adversarial network (GAN), which uses a canonical (or\nclassical) discriminator loss function such as the one in the original GAN\n(VanillaGAN) system. The generator loss function is based on a symmetric class\nprobability estimation type function, $\\mathcal{L}_\\alpha$, and the resulting\nGAN system is termed $\\mathcal{L}_\\alpha$-GAN. Under an optimal discriminator,\nit is shown that the generator's optimization problem consists of minimizing a\nJensen-$f_\\alpha$-divergence, a natural generalization of the Jensen-Shannon\ndivergence, where $f_\\alpha$ is a convex function expressed in terms of the\nloss function $\\mathcal{L}_\\alpha$. It is also demonstrated that this\n$\\mathcal{L}_\\alpha$-GAN problem recovers as special cases a number of GAN\nproblems in the literature, including VanillaGAN, Least Squares GAN (LSGAN),\nLeast $k$th order GAN (L$k$GAN) and the recently introduced\n$(\\alpha_D,\\alpha_G)$-GAN with $\\alpha_D=1$. 
Finally, experimental results are\nconducted on three datasets, MNIST, CIFAR-10, and Stacked MNIST to illustrate\nthe performance of various examples of the $\\mathcal{L}_\\alpha$-GAN system.\n","authors":["Justin Veiner","Fady Alajaji","Bahman Gharesifard"],"pdf_url":"https://arxiv.org/pdf/2308.07233v1.pdf","comment":"31 pages, 4 figures, 12 tables"},{"id":"http://arxiv.org/abs/2307.07522v2","updated":"2023-08-14T16:12:00Z","published":"2023-07-09T21:16:56Z","title":"The Future of Fundamental Science Led by Generative Closed-Loop\n Artificial Intelligence","summary":" Recent advances in machine learning and AI, including Generative AI and LLMs,\nare disrupting technological innovation, product development, and society as a\nwhole. AI's contribution to technology can come from multiple approaches that\nrequire access to large training data sets and clear performance evaluation\ncriteria, ranging from pattern recognition and classification to generative\nmodels. Yet, AI has contributed less to fundamental science in part because\nlarge data sets of high-quality data for scientific practice and model\ndiscovery are more difficult to access. Generative AI, in general, and Large\nLanguage Models in particular, may represent an opportunity to augment and\naccelerate the scientific discovery of fundamental deep science with\nquantitative models. Here we explore and investigate aspects of an AI-driven,\nautomated, closed-loop approach to scientific discovery, including self-driven\nhypothesis generation and open-ended autonomous exploration of the hypothesis\nspace. Integrating AI-driven automation into the practice of science would\nmitigate current problems, including the replication of findings, systematic\nproduction of data, and ultimately democratisation of the scientific process.\nRealising these possibilities requires a vision for augmented AI coupled with a\ndiversity of AI approaches able to deal with fundamental aspects of causality\nanalysis and model discovery while enabling unbiased search across the space of\nputative explanations. These advances hold the promise to unleash AI's\npotential for searching and discovering the fundamental structure of our world\nbeyond what human scientists have been able to achieve. Such a vision would\npush the boundaries of new fundamental science rather than automatize current\nworkflows and instead open doors for technological innovation to tackle some of\nthe greatest challenges facing humanity today.\n","authors":["Hector Zenil","Jesper Tegnér","Felipe S. Abrahão","Alexander Lavin","Vipin Kumar","Jeremy G. Frey","Adrian Weller","Larisa Soldatova","Alan R. Bundy","Nicholas R. Jennings","Koichi Takahashi","Lawrence Hunter","Saso Dzeroski","Andrew Briggs","Frederick D. Gregory","Carla P. Gomes","Christopher K. I. Williams","Jon Rowe","James Evans","Hiroaki Kitano","Joshua B. Tenenbaum","Ross King"],"pdf_url":"https://arxiv.org/pdf/2307.07522v2.pdf","comment":"35 pages, first draft of the final report from the Alan Turing\n Institute on AI for Scientific Discovery"},{"id":"http://arxiv.org/abs/2110.07122v2","updated":"2023-08-14T16:04:21Z","published":"2021-10-14T02:23:33Z","title":"Deconfounded Causal Collaborative Filtering","summary":" Recommender systems may be confounded by various types of confounding factors\n(also called confounders) that may lead to inaccurate recommendations and\nsacrificed recommendation performance. 
Current approaches to solving the\nproblem usually design each specific model for each specific confounder.\nHowever, real-world systems may include a huge number of confounders and thus\ndesigning each specific model for each specific confounder could be\nunrealistic. More importantly, except for those ``explicit confounders'' that\nexperts can manually identify and process such as item's position in the\nranking list, there are also many ``latent confounders'' that are beyond the\nimagination of experts. For example, users' rating on a song may depend on\ntheir current mood or the current weather, and users' preference on ice creams\nmay depend on the air temperature. Such latent confounders may be unobservable\nin the recorded training data. To solve the problem, we propose Deconfounded\nCausal Collaborative Filtering (DCCF). We first frame user behaviors with\nunobserved confounders into a causal graph, and then we design a front-door\nadjustment model carefully fused with machine learning to deconfound the\ninfluence of unobserved confounders. Experiments on real-world datasets show\nthat our method is able to deconfound unobserved confounders to achieve better\nrecommendation performance.\n","authors":["Shuyuan Xu","Juntao Tan","Shelby Heinecke","Jia Li","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2110.07122v2.pdf","comment":"Accepted by the ACM Transactions on Recommender Systems (TORS)"},{"id":"http://arxiv.org/abs/2102.01868v5","updated":"2023-08-14T15:58:36Z","published":"2021-02-03T04:16:11Z","title":"Causal Collaborative Filtering","summary":" Many of the traditional recommendation algorithms are designed based on the\nfundamental idea of mining or learning correlative patterns from data to\nestimate the user-item correlative preference. However, pure correlative\nlearning may lead to Simpson's paradox in predictions, and thus results in\nsacrificed recommendation performance. Simpson's paradox is a well-known\nstatistical phenomenon, which causes confusions in statistical conclusions and\nignoring the paradox may result in inaccurate decisions. Fortunately, causal\nand counterfactual modeling can help us to think outside of the observational\ndata for user modeling and personalization so as to tackle such issues. In this\npaper, we propose Causal Collaborative Filtering (CCF) -- a general framework\nfor modeling causality in collaborative filtering and recommendation. We\nprovide a unified causal view of CF and mathematically show that many of the\ntraditional CF algorithms are actually special cases of CCF under simplified\ncausal graphs. We then propose a conditional intervention approach for\n$do$-operations so that we can estimate the user-item causal preference based\non the observational data. 
Finally, we further propose a general counterfactual\nconstrained learning framework for estimating the user-item preferences.\nExperiments are conducted on two types of real-world datasets -- traditional\nand randomized trial data -- and results show that our framework can improve\nthe recommendation performance and reduce the Simpson's paradox problem of many\nCF algorithms.\n","authors":["Shuyuan Xu","Yingqiang Ge","Yunqi Li","Zuohui Fu","Xu Chen","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2102.01868v5.pdf","comment":"Accepted by the 2023 ACM SIGIR International Conference on Theory of\n Information Retrieval"},{"id":"http://arxiv.org/abs/2308.07223v1","updated":"2023-08-14T15:49:19Z","published":"2023-08-14T15:49:19Z","title":"Distance Matters For Improving Performance Estimation Under Covariate\n Shift","summary":" Performance estimation under covariate shift is a crucial component of safe\nAI model deployment, especially for sensitive use-cases. Recently, several\nsolutions were proposed to tackle this problem, most leveraging model\npredictions or softmax confidence to derive accuracy estimates. However, under\ndataset shifts, confidence scores may become ill-calibrated if samples are too\nfar from the training distribution. In this work, we show that taking into\naccount distances of test samples to their expected training distribution can\nsignificantly improve performance estimation under covariate shift. Precisely,\nwe introduce a \"distance-check\" to flag samples that lie too far from the\nexpected distribution, to avoid relying on their untrustworthy model outputs in\nthe accuracy estimation step. We demonstrate the effectiveness of this method\non 13 image classification tasks, across a wide-range of natural and synthetic\ndistribution shifts and hundreds of models, with a median relative MAE\nimprovement of 27% over the best baseline across all tasks, and SOTA\nperformance on 10 out of 13 tasks. Our code is publicly available at\nhttps://github.com/melanibe/distance_matters_performance_estimation.\n","authors":["Mélanie Roschewitz","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2308.07223v1.pdf","comment":"Accepted to ICCV Workshop on Uncertainty Quantification for Computer\n Vision 2023"},{"id":"http://arxiv.org/abs/2308.07221v1","updated":"2023-08-14T15:47:25Z","published":"2023-08-14T15:47:25Z","title":"AudioFormer: Audio Transformer learns audio feature representations from\n discrete acoustic codes","summary":" We propose a method named AudioFormer, which learns audio feature\nrepresentations through the acquisition of discrete acoustic codes and\nsubsequently fine-tunes them for audio classification tasks. Initially, we\nintroduce a novel perspective by considering the audio classification task as a\nform of natural language understanding (NLU). Leveraging an existing neural\naudio codec model, we generate discrete acoustic codes and utilize them to\ntrain a masked language model (MLM), thereby obtaining audio feature\nrepresentations. Furthermore, we pioneer the integration of a\n\\textbf{M}ulti-\\textbf{P}ositive sample \\textbf{C}ontrastive (MPC) learning\napproach. This method enables the learning of joint representations among\nmultiple discrete acoustic codes within the same audio input. In our\nexperiments, we treat discrete acoustic codes as textual data and train a\nmasked language model using a cloze-like methodology, ultimately deriving\nhigh-quality audio representations. 
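As a rough illustration of a cloze-style objective over discrete codes, the following minimal sketch masks a random subset of code indices and keeps their original values as reconstruction targets. The vocabulary size, mask rate and helper names are assumptions for illustration, not AudioFormer's actual tokenizer or hyperparameters.

import random

VOCAB_SIZE = 1024      # number of discrete acoustic codes (arbitrary)
MASK_ID = VOCAB_SIZE   # extra id reserved for the [MASK] token
MASK_RATE = 0.15       # BERT-style masking rate (assumed, not from the paper)

def make_cloze_example(codes, rng=random):
    """Return (masked_input, targets): targets[i] holds the original code at
    masked positions and None elsewhere (positions that are not scored)."""
    masked, targets = [], []
    for c in codes:
        if rng.random() < MASK_RATE:
            masked.append(MASK_ID)
            targets.append(c)       # the model must reconstruct this code
        else:
            masked.append(c)
            targets.append(None)
    return masked, targets

# A fake "utterance" represented as a sequence of discrete acoustic codes.
codes = [random.randrange(VOCAB_SIZE) for _ in range(20)]
masked_input, targets = make_cloze_example(codes)
print(masked_input)
print(targets)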
Notably, the MPC learning technique\neffectively captures collaborative representations among distinct positive\nsamples. Our research outcomes demonstrate that AudioFormer attains\nsignificantly improved performance compared to prevailing monomodal audio\nclassification models across multiple datasets, and even outperforms\naudio-visual multimodal classification models on select datasets. Specifically,\nour approach achieves remarkable results on datasets including AudioSet (2M,\n20K), and FSD50K, with performance scores of 53.9, 45.1, and 65.6,\nrespectively. We have openly shared both the code and models:\n\\url{https://github.com/LZH-0225/AudioFormer.git}.\n","authors":["Zhaohui Li","Haitao Wang","Xinghua Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.07221v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.07212v1","updated":"2023-08-14T15:29:32Z","published":"2023-08-14T15:29:32Z","title":"Automated Ensemble-Based Segmentation of Pediatric Brain Tumors: A Novel\n Approach Using the CBTN-CONNECT-ASNR-MICCAI BraTS-PEDs 2023 Challenge Data","summary":" Brain tumors remain a critical global health challenge, necessitating\nadvancements in diagnostic techniques and treatment methodologies. In response\nto the growing need for age-specific segmentation models, particularly for\npediatric patients, this study explores the deployment of deep learning\ntechniques using magnetic resonance imaging (MRI) modalities. By introducing a\nnovel ensemble approach using ONet and modified versions of UNet, coupled with\ninnovative loss functions, this study achieves a precise segmentation model for\nthe BraTS-PEDs 2023 Challenge. Data augmentation, including both single and\ncomposite transformations, ensures model robustness and accuracy across\ndifferent scanning protocols. The ensemble strategy, integrating the ONet and\nUNet models, shows greater effectiveness in capturing specific features and\nmodeling diverse aspects of the MRI images which result in lesion_wise dice\nscores of 0.52, 0.72 and 0.78 for enhancing tumor, tumor core and whole tumor\nlabels respectively. Visual comparisons further confirm the superiority of the\nensemble method in accurate tumor region coverage. The results indicate that\nthis advanced ensemble approach, building upon the unique strengths of\nindividual models, offers promising prospects for enhanced diagnostic accuracy\nand effective treatment planning for brain tumors in pediatric brains.\n","authors":["Shashidhar Reddy Javaji","Sovesh Mohapatra","Advait Gosai","Gottfried Schlaug"],"pdf_url":"https://arxiv.org/pdf/2308.07212v1.pdf","comment":"3 Figs, 3 Tables"},{"id":"http://arxiv.org/abs/2308.07209v1","updated":"2023-08-14T15:25:07Z","published":"2023-08-14T15:25:07Z","title":"Unified Data-Free Compression: Pruning and Quantization without\n Fine-Tuning","summary":" Structured pruning and quantization are promising approaches for reducing the\ninference time and memory footprint of neural networks. However, most existing\nmethods require the original training dataset to fine-tune the model. This not\nonly brings heavy resource consumption but also is not possible for\napplications with sensitive or proprietary data due to privacy and security\nconcerns. Therefore, a few data-free methods are proposed to address this\nproblem, but they perform data-free pruning and quantization separately, which\ndoes not explore the complementarity of pruning and quantization. 
In this\npaper, we propose a novel framework named Unified Data-Free Compression(UDFC),\nwhich performs pruning and quantization simultaneously without any data and\nfine-tuning process. Specifically, UDFC starts with the assumption that the\npartial information of a damaged(e.g., pruned or quantized) channel can be\npreserved by a linear combination of other channels, and then derives the\nreconstruction form from the assumption to restore the information loss due to\ncompression. Finally, we formulate the reconstruction error between the\noriginal network and its compressed network, and theoretically deduce the\nclosed-form solution. We evaluate the UDFC on the large-scale image\nclassification task and obtain significant improvements over various network\narchitectures and compression methods. For example, we achieve a 20.54%\naccuracy improvement on ImageNet dataset compared to SOTA method with 30%\npruning ratio and 6-bit quantization on ResNet-34.\n","authors":["Shipeng Bai","Jun Chen","Xintian Shen","Yixuan Qian","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.07209v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.07204v1","updated":"2023-08-14T15:16:39Z","published":"2023-08-14T15:16:39Z","title":"Algorithms for the Training of Neural Support Vector Machines","summary":" Neural support vector machines (NSVMs) allow for the incorporation of domain\nknowledge in the design of the model architecture. In this article we introduce\na set of training algorithms for NSVMs that leverage the Pegasos algorithm and\nprovide a proof of concept by solving a set of standard machine learning tasks.\n","authors":["Lars Simon","Manuel Radons"],"pdf_url":"https://arxiv.org/pdf/2308.07204v1.pdf","comment":"19 pages, 0 figures"},{"id":"http://arxiv.org/abs/2308.07200v1","updated":"2023-08-14T15:10:29Z","published":"2023-08-14T15:10:29Z","title":"Neural Categorical Priors for Physics-Based Character Control","summary":" Recent advances in learning reusable motion priors have demonstrated their\neffectiveness in generating naturalistic behaviors. In this paper, we propose a\nnew learning framework in this paradigm for controlling physics-based\ncharacters with significantly improved motion quality and diversity over\nexisting state-of-the-art methods. The proposed method uses reinforcement\nlearning (RL) to initially track and imitate life-like movements from\nunstructured motion clips using the discrete information bottleneck, as adopted\nin the Vector Quantized Variational AutoEncoder (VQ-VAE). This structure\ncompresses the most relevant information from the motion clips into a compact\nyet informative latent space, i.e., a discrete space over vector quantized\ncodes. By sampling codes in the space from a trained categorical prior\ndistribution, high-quality life-like behaviors can be generated, similar to the\nusage of VQ-VAE in computer vision. Although this prior distribution can be\ntrained with the supervision of the encoder's output, it follows the original\nmotion clip distribution in the dataset and could lead to imbalanced behaviors\nin our setting. To address the issue, we further propose a technique named\nprior shifting to adjust the prior distribution using curiosity-driven RL. 
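For orientation, drawing behaviours from a categorical prior over vector-quantized codes can be sketched generically as below. This is a VQ-VAE-style illustration with made-up sizes and a random placeholder prior; it does not implement the paper's prior-shifting or curiosity-driven RL components.

import numpy as np

rng = np.random.default_rng(0)

K, D = 256, 32                       # codebook size and embedding dim (arbitrary)
codebook = rng.normal(size=(K, D))   # stands in for a trained VQ-VAE codebook

# A trained categorical prior would supply these probabilities; here a random
# normalised vector is used purely as a placeholder.
logits = rng.normal(size=K)
prior = np.exp(logits - logits.max())
prior /= prior.sum()

# Sample a short sequence of codes from the prior and decode them to latents.
codes = rng.choice(K, size=8, p=prior)
latents = codebook[codes]            # (8, D) latent vectors fed to a decoder
print(codes, latents.shape)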
The\noutcome distribution is demonstrated to offer sufficient behavioral diversity\nand significantly facilitates upper-level policy learning for downstream tasks.\nWe conduct comprehensive experiments using humanoid characters on two\nchallenging downstream tasks, sword-shield striking and two-player boxing game.\nOur results demonstrate that the proposed framework is capable of controlling\nthe character to perform considerably high-quality movements in terms of\nbehavioral strategies, diversity, and realism. Videos, codes, and data are\navailable at https://tencent-roboticsx.github.io/NCP/.\n","authors":["Qingxu Zhu","He Zhang","Mengting Lan","Lei Han"],"pdf_url":"https://arxiv.org/pdf/2308.07200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07198v1","updated":"2023-08-14T15:07:05Z","published":"2023-08-14T15:07:05Z","title":"Explaining Black-Box Models through Counterfactuals","summary":" We present CounterfactualExplanations.jl: a package for generating\nCounterfactual Explanations (CE) and Algorithmic Recourse (AR) for black-box\nmodels in Julia. CE explain how inputs into a model need to change to yield\nspecific model predictions. Explanations that involve realistic and actionable\nchanges can be used to provide AR: a set of proposed actions for individuals to\nchange an undesirable outcome for the better. In this article, we discuss the\nusefulness of CE for Explainable Artificial Intelligence and demonstrate the\nfunctionality of our package. The package is straightforward to use and\ndesigned with a focus on customization and extensibility. We envision it to one\nday be the go-to place for explaining arbitrary predictive models in Julia\nthrough a diverse suite of counterfactual generators.\n","authors":["Patrick Altmeyer","Arie van Deursen","Cynthia C. S. Liem"],"pdf_url":"https://arxiv.org/pdf/2308.07198v1.pdf","comment":"13 pages, 9 figures, originally published in The Proceedings of the\n JuliaCon Conferences (JCON)"},{"id":"http://arxiv.org/abs/2303.11908v2","updated":"2023-08-14T15:02:11Z","published":"2023-03-21T14:58:16Z","title":"Non-Asymptotic Pointwise and Worst-Case Bounds for Classical Spectrum\n Estimators","summary":" Spectrum estimation is a fundamental methodology in the analysis of\ntime-series data, with applications including medicine, speech analysis, and\ncontrol design. The asymptotic theory of spectrum estimation is\nwell-understood, but the theory is limited when the number of samples is fixed\nand finite. This paper gives non-asymptotic error bounds for a broad class of\nspectral estimators, both pointwise (at specific frequencies) and in the worst\ncase over all frequencies. The general method is used to derive error bounds\nfor the classical Blackman-Tukey, Bartlett, and Welch estimators. In\nparticular, these are first non-asymptotic error bounds for Bartlett and Welch\nestimators.\n","authors":["Andrew Lamperski"],"pdf_url":"https://arxiv.org/pdf/2303.11908v2.pdf","comment":"15 pages, 3 figures, under review in IEEE Transactions on Signal\n Processing"},{"id":"http://arxiv.org/abs/2308.07192v1","updated":"2023-08-14T14:56:40Z","published":"2023-08-14T14:56:40Z","title":"gSASRec: Reducing Overconfidence in Sequential Recommendation Trained\n with Negative Sampling","summary":" A large catalogue size is one of the central challenges in training\nrecommendation models: a large number of items makes them memory and\ncomputationally inefficient to compute scores for all items during training,\nforcing these models to deploy negative sampling. 
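A minimal sketch of such per-positive negative sampling is shown below, with an arbitrary catalogue size and number of negatives; this is not the configuration or implementation used in the paper.

import random

CATALOGUE_SIZE = 1_000_000   # number of items (arbitrary)
NUM_NEGATIVES = 4            # negatives sampled per positive (arbitrary)

def sample_negatives(positive_item, user_history, rng=random):
    """Uniformly sample items the user has not interacted with."""
    negatives = set()
    while len(negatives) < NUM_NEGATIVES:
        item = rng.randrange(CATALOGUE_SIZE)
        if item != positive_item and item not in user_history:
            negatives.add(item)
    return list(negatives)

history = {12, 7, 90345}
print(sample_negatives(positive_item=12, user_history=history))

With one positive and, say, four sampled negatives per training example, the loss sees a 1:4 positive-to-negative ratio rather than the true 1:(catalogue size - 1); this distortion is what the next sentence refers to.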
However, negative sampling\nincreases the proportion of positive interactions in the training data, and\ntherefore models trained with negative sampling tend to overestimate the\nprobabilities of positive interactions a phenomenon we call overconfidence.\nWhile the absolute values of the predicted scores or probabilities are not\nimportant for the ranking of retrieved recommendations, overconfident models\nmay fail to estimate nuanced differences in the top-ranked items, resulting in\ndegraded performance. In this paper, we show that overconfidence explains why\nthe popular SASRec model underperforms when compared to BERT4Rec. This is\ncontrary to the BERT4Rec authors explanation that the difference in performance\nis due to the bi-directional attention mechanism. To mitigate overconfidence,\nwe propose a novel Generalised Binary Cross-Entropy Loss function (gBCE) and\ntheoretically prove that it can mitigate overconfidence. We further propose the\ngSASRec model, an improvement over SASRec that deploys an increased number of\nnegatives and the gBCE loss. We show through detailed experiments on three\ndatasets that gSASRec does not exhibit the overconfidence problem. As a result,\ngSASRec can outperform BERT4Rec (e.g. +9.47% NDCG on the MovieLens-1M dataset),\nwhile requiring less training time (e.g. -73% training time on MovieLens-1M).\nMoreover, in contrast to BERT4Rec, gSASRec is suitable for large datasets that\ncontain more than 1 million items.\n","authors":["Aleksandr Petrov","Craig Macdonald"],"pdf_url":"https://arxiv.org/pdf/2308.07192v1.pdf","comment":"Accepted at ACM RecSys 2023"},{"id":"http://arxiv.org/abs/2305.07041v2","updated":"2023-08-14T14:47:34Z","published":"2023-05-11T14:25:34Z","title":"Fairness in Machine Learning meets with Equity in Healthcare","summary":" With the growing utilization of machine learning in healthcare, there is\nincreasing potential to enhance healthcare outcomes. However, this also brings\nthe risk of perpetuating biases in data and model design that can harm certain\ndemographic groups based on factors such as age, gender, and race. This study\nproposes an artificial intelligence framework, grounded in software engineering\nprinciples, for identifying and mitigating biases in data and models while\nensuring fairness in healthcare settings. A case study is presented to\ndemonstrate how systematic biases in data can lead to amplified biases in model\npredictions, and machine learning methods are suggested to prevent such biases.\nFuture research aims to test and validate the proposed ML framework in\nreal-world clinical settings to evaluate its impact on promoting health equity.\n","authors":["Shaina Raza","Parisa Osivand Pour","Syed Raza Bashir"],"pdf_url":"https://arxiv.org/pdf/2305.07041v2.pdf","comment":"Accepted in Association for the Advancement of Artificial\n Intelligence (AAAI) 2023 , Responsible Medical AI, Design, and\n Operationalization Symposium"},{"id":"http://arxiv.org/abs/2308.07175v1","updated":"2023-08-14T14:32:42Z","published":"2023-08-14T14:32:42Z","title":"Efficient Learning of Quantum States Prepared With Few Non-Clifford\n Gates II: Single-Copy Measurements","summary":" Recent work has shown that $n$-qubit quantum states output by circuits with\nat most $t$ single-qubit non-Clifford gates can be learned to trace distance\n$\\epsilon$ using $\\mathsf{poly}(n,2^t,1/\\epsilon)$ time and samples. All prior\nalgorithms achieving this runtime use entangled measurements across two copies\nof the input state. 
In this work, we give a similarly efficient algorithm that\nlearns the same class of states using only single-copy measurements.\n","authors":["Sabee Grewal","Vishnu Iyer","William Kretschmer","Daniel Liang"],"pdf_url":"https://arxiv.org/pdf/2308.07175v1.pdf","comment":"22 pages. arXiv admin note: text overlap with arXiv:2305.13409"},{"id":"http://arxiv.org/abs/2308.07170v1","updated":"2023-08-14T14:26:52Z","published":"2023-08-14T14:26:52Z","title":"PitchNet: A Fully Convolutional Neural Network for Pitch Estimation","summary":" In the domain of music and sound processing, pitch extraction plays a pivotal\nrole. This research introduces \"PitchNet\", a convolutional neural network\ntailored for pitch extraction from the human singing voice, including acapella\nperformances. Integrating autocorrelation with deep learning techniques,\nPitchNet aims to optimize the accuracy of pitch detection. Evaluation across\ndatasets comprising synthetic sounds, opera recordings, and time-stretched\nvowels demonstrates its efficacy. This work paves the way for enhanced pitch\nextraction in both music and voice settings.\n","authors":["Jeremy Cochoy"],"pdf_url":"https://arxiv.org/pdf/2308.07170v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01464v2","updated":"2023-08-14T14:14:41Z","published":"2023-03-02T18:27:00Z","title":"Efficient Rate Optimal Regret for Adversarial Contextual MDPs Using\n Online Function Approximation","summary":" We present the OMG-CMDP! algorithm for regret minimization in adversarial\nContextual MDPs. The algorithm operates under the minimal assumptions of\nrealizable function class and access to online least squares and log loss\nregression oracles. Our algorithm is efficient (assuming efficient online\nregression oracles), simple and robust to approximation errors. It enjoys an\n$\\widetilde{O}(H^{2.5} \\sqrt{ T|S||A| ( \\mathcal{R}(\\mathcal{O}) + H\n\\log(\\delta^{-1}) )})$ regret guarantee, with $T$ being the number of episodes,\n$S$ the state space, $A$ the action space, $H$ the horizon and\n$\\mathcal{R}(\\mathcal{O}) = \\mathcal{R}(\\mathcal{O}_{\\mathrm{sq}}^\\mathcal{F})\n+ \\mathcal{R}(\\mathcal{O}_{\\mathrm{log}}^\\mathcal{P})$ is the sum of the\nregression oracles' regret, used to approximate the context-dependent rewards\nand dynamics, respectively. To the best of our knowledge, our algorithm is the\nfirst efficient rate optimal regret minimization algorithm for adversarial\nCMDPs that operates under the minimal standard assumption of online function\napproximation.\n","authors":["Orin Levy","Alon Cohen","Asaf Cassel","Yishay Mansour"],"pdf_url":"https://arxiv.org/pdf/2303.01464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.11644v3","updated":"2023-08-14T14:14:05Z","published":"2021-08-26T08:23:32Z","title":"Hybrid quantum-classical machine learning for generative chemistry and\n drug design","summary":" Deep generative chemistry models emerge as powerful tools to expedite drug\ndiscovery. However, the immense size and complexity of the structural space of\nall possible drug-like molecules pose significant obstacles, which could be\novercome with hybrid architectures combining quantum computers with deep\nclassical networks. As the first step toward this goal, we built a compact\ndiscrete variational autoencoder (DVAE) with a Restricted Boltzmann Machine\n(RBM) of reduced size in its latent layer. 
The size of the proposed model was\nsmall enough to fit on a state-of-the-art D-Wave quantum annealer and allowed\ntraining on a subset of the ChEMBL dataset of biologically active compounds.\nFinally, we generated 2331 novel chemical structures with medicinal chemistry\nand synthetic accessibility properties in the ranges typical for molecules from\nChEMBL. The presented results demonstrate the feasibility of using already\nexisting or soon-to-be-available quantum computing devices as testbeds for\nfuture drug discovery applications.\n","authors":["A. I. Gircha","A. S. Boev","K. Avchaciov","P. O. Fedichev","A. K. Fedorov"],"pdf_url":"https://arxiv.org/pdf/2108.11644v3.pdf","comment":"8 pages. 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2305.17330v2","updated":"2023-08-14T13:48:38Z","published":"2023-05-27T02:14:09Z","title":"MADiff: Offline Multi-agent Learning with Diffusion Models","summary":" Diffusion model (DM), as a powerful generative model, recently achieved huge\nsuccess in various scenarios including offline reinforcement learning, where\nthe policy learns to conduct planning by generating trajectory in the online\nevaluation. However, despite the effectiveness shown for single-agent learning,\nit remains unclear how DMs can operate in multi-agent problems, where agents\ncan hardly complete teamwork without good coordination by independently\nmodeling each agent's trajectories. In this paper, we propose MADiff, a novel\ngenerative multi-agent learning framework to tackle this problem. MADiff is\nrealized with an attention-based diffusion model to model the complex\ncoordination among behaviors of multiple diffusion agents. To the best of our\nknowledge, MADiff is the first diffusion-based multi-agent offline RL\nframework, which behaves as both a decentralized policy and a centralized\ncontroller, which includes opponent modeling and can be used for multi-agent\ntrajectory prediction. MADiff takes advantage of the powerful generative\nability of diffusion while well-suited in modeling complex multi-agent\ninteractions. Our experiments show the superior performance of MADiff compared\nto baseline algorithms in a range of multi-agent learning tasks.\n","authors":["Zhengbang Zhu","Minghuan Liu","Liyuan Mao","Bingyi Kang","Minkai Xu","Yong Yu","Stefano Ermon","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.17330v2.pdf","comment":"17 pages, 7 figures, 4 tables. The first two authors contributed\n equally to the work"},{"id":"http://arxiv.org/abs/2308.07136v1","updated":"2023-08-14T13:42:09Z","published":"2023-08-14T13:42:09Z","title":"Pairing interacting protein sequences using masked language modeling","summary":" Predicting which proteins interact together from amino-acid sequences is an\nimportant task. We develop a method to pair interacting protein sequences which\nleverages the power of protein language models trained on multiple sequence\nalignments, such as MSA Transformer and the EvoFormer module of AlphaFold. We\nformulate the problem of pairing interacting partners among the paralogs of two\nprotein families in a differentiable way. We introduce a method called DiffPALM\nthat solves it by exploiting the ability of MSA Transformer to fill in masked\namino acids in multiple sequence alignments using the surrounding context. MSA\nTransformer encodes coevolution between functionally or structurally coupled\namino acids. We show that it captures inter-chain coevolution, while it was\ntrained on single-chain data, which means that it can be used\nout-of-distribution. 
Relying on MSA Transformer without fine-tuning, DiffPALM\noutperforms existing coevolution-based pairing methods on difficult benchmarks\nof shallow multiple sequence alignments extracted from ubiquitous prokaryotic\nprotein datasets. It also outperforms an alternative method based on a\nstate-of-the-art protein language model trained on single sequences. Paired\nalignments of interacting protein sequences are a crucial ingredient of\nsupervised deep learning methods to predict the three-dimensional structure of\nprotein complexes. DiffPALM substantially improves the structure prediction of\nsome eukaryotic protein complexes by AlphaFold-Multimer, without significantly\ndeteriorating any of those we tested. It also achieves competitive performance\nwith using orthology-based pairing.\n","authors":["Umberto Lupo","Damiano Sgarbossa","Anne-Florence Bitbol"],"pdf_url":"https://arxiv.org/pdf/2308.07136v1.pdf","comment":"33 pages, 14 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.07134v1","updated":"2023-08-14T13:41:09Z","published":"2023-08-14T13:41:09Z","title":"Natural Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models, such as ChatGPT,\nhas revolutionized various research fields in artificial intelligence.\nTransformers-based large language models (LLMs) have gradually replaced CNNs\nand RNNs to unify fields of computer vision and natural language processing.\nCompared with the data that exists relatively independently such as images,\nvideos or texts, graph is a type of data that contains rich structural and\nrelational information. Meanwhile, natural language, as one of the most\nexpressive mediums, excels in describing complex structures. However, existing\nwork on incorporating graph learning problems into the generative language\nmodeling framework remains very limited. As the importance of language models\ncontinues to grow, it becomes essential to explore whether LLMs can also\nreplace GNNs as the foundational model for graphs. In this paper, we propose\nInstructGLM (Instruction-finetuned Graph Language Model), systematically design\nhighly scalable prompts based on natural language instructions, and use natural\nlanguage to describe the geometric structure and node features of the graph for\ninstruction tuning an LLMs to perform learning and inference on graphs in a\ngenerative manner. Our method exceeds all competitive GNN baselines on\nogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of\nour method and sheds light on generative language models replacing GNNs as the\nfoundation model for graph machine learning.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v1.pdf","comment":"21 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.07126v1","updated":"2023-08-14T13:13:50Z","published":"2023-08-14T13:13:50Z","title":"A Time-aware tensor decomposition for tracking evolving patterns","summary":" Time-evolving data sets can often be arranged as a higher-order tensor with\none of the modes being the time mode. While tensor factorizations have been\nsuccessfully used to capture the underlying patterns in such higher-order data\nsets, the temporal aspect is often ignored, allowing for the reordering of time\npoints. In recent studies, temporal regularizers are incorporated in the time\nmode to tackle this issue. 
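A common form of such a temporal regularizer is a quadratic smoothness penalty on consecutive time-mode factors. The sketch below is a generic illustration with random matrices, not the specific regularization proposed in this work.

import numpy as np

def temporal_smoothness(factors):
    """Sum of squared Frobenius-norm differences between factor matrices at
    consecutive time points: sum_t ||F_t - F_{t-1}||_F^2."""
    return sum(
        np.linalg.norm(factors[t] - factors[t - 1]) ** 2
        for t in range(1, len(factors))
    )

rng = np.random.default_rng(0)
factors = [rng.normal(size=(20, 3)) for _ in range(10)]  # 10 time points
print(temporal_smoothness(factors))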
Nevertheless, existing approaches still do not allow\nunderlying patterns to change in time (e.g., spatial changes in the brain,\ncontextual changes in topics). In this paper, we propose temporal PARAFAC2\n(tPARAFAC2): a PARAFAC2-based tensor factorization method with temporal\nregularization to extract gradually evolving patterns from temporal data.\nThrough extensive experiments on synthetic data, we demonstrate that tPARAFAC2\ncan capture the underlying evolving patterns accurately performing better than\nPARAFAC2 and coupled matrix factorization with temporal smoothness\nregularization.\n","authors":["Christos Chatzis","Max Pfeffer","Pedro Lind","Evrim Acar"],"pdf_url":"https://arxiv.org/pdf/2308.07126v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.07121v1","updated":"2023-08-14T13:06:10Z","published":"2023-08-14T13:06:10Z","title":"Active Bird2Vec: Towards End-to-End Bird Sound Monitoring with\n Transformers","summary":" We propose a shift towards end-to-end learning in bird sound monitoring by\ncombining self-supervised (SSL) and deep active learning (DAL). Leveraging\ntransformer models, we aim to bypass traditional spectrogram conversions,\nenabling direct raw audio processing. ActiveBird2Vec is set to generate\nhigh-quality bird sound representations through SSL, potentially accelerating\nthe assessment of environmental changes and decision-making processes for wind\nfarms. Additionally, we seek to utilize the wide variety of bird vocalizations\nthrough DAL, reducing the reliance on extensively labeled datasets by human\nexperts. We plan to curate a comprehensive set of tasks through Huggingface\nDatasets, enhancing future comparability and reproducibility of bioacoustic\nresearch. A comparative analysis between various transformer models will be\nconducted to evaluate their proficiency in bird sound recognition tasks. We aim\nto accelerate the progression of avian bioacoustic research and contribute to\nmore effective conservation strategies.\n","authors":["Lukas Rauch","Raphael Schwinger","Moritz Wirth","Bernhard Sick","Sven Tomforde","Christoph Scholz"],"pdf_url":"https://arxiv.org/pdf/2308.07121v1.pdf","comment":"Accepted @AI4S ECAI2023. This is the author's version of the work"},{"id":"http://arxiv.org/abs/2308.07118v1","updated":"2023-08-14T12:57:12Z","published":"2023-08-14T12:57:12Z","title":"Neural radiance fields in the industrial and robotics domain:\n applications, research opportunities and use cases","summary":" The proliferation of technologies, such as extended reality (XR), has\nincreased the demand for high-quality three-dimensional (3D) graphical\nrepresentations. Industrial 3D applications encompass computer-aided design\n(CAD), finite element analysis (FEA), scanning, and robotics. However, current\nmethods employed for industrial 3D representations suffer from high\nimplementation costs and reliance on manual human input for accurate 3D\nmodeling. To address these challenges, neural radiance fields (NeRFs) have\nemerged as a promising approach for learning 3D scene representations based on\nprovided training 2D images. Despite a growing interest in NeRFs, their\npotential applications in various industrial subdomains are still unexplored.\nIn this paper, we deliver a comprehensive examination of NeRF industrial\napplications while also providing direction for future research endeavors. We\nalso present a series of proof-of-concept experiments that demonstrate the\npotential of NeRFs in the industrial domain. 
These experiments include\nNeRF-based video compression techniques and using NeRFs for 3D motion\nestimation in the context of collision avoidance. In the video compression\nexperiment, our results show compression savings up to 48\\% and 74\\% for\nresolutions of 1920x1080 and 300x168, respectively. The motion estimation\nexperiment used a 3D animation of a robotic arm to train Dynamic-NeRF (D-NeRF)\nand achieved an average disparity map PSNR of 23 dB and an SSIM of 0.97. The\ncode for our experiments is publicly available at\nhttps://github.com/Maftej/iisnerf .\n","authors":["Eugen Šlapak","Enric Pardo","Matúš Dopiriak","Taras Maksymyuk","Juraj Gazda"],"pdf_url":"https://arxiv.org/pdf/2308.07118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01050v2","updated":"2023-08-14T12:57:04Z","published":"2023-08-02T09:48:08Z","title":"A Counterfactual Safety Margin Perspective on the Scoring of Autonomous\n Vehicles' Riskiness","summary":" Autonomous Vehicles (AVs) have the potential to provide numerous societal\nbenefits, such as decreased road accidents and increased overall transportation\nefficiency. However, quantifying the risk associated with AVs is challenging\ndue to the lack of historical data and the rapidly evolving technology. This\npaper presents a data-driven framework for comparing the risk of different AVs'\nbehaviors in various operational design domains (ODDs), based on counterfactual\nsimulations of \"misbehaving\" road users. We introduce the concept of\ncounterfactual safety margin, which represents the minimum deviation from\nnormal behavior that could lead to a collision. This concept helps to find the\nmost critical scenarios but also to assess the frequency and severity of risk\nof AVs. We show that the proposed methodology is applicable even when the AV's\nbehavioral policy is unknown -- through worst- and best-case analyses -- making\nthe method useful also to external third-party risk assessors. Our experimental\nresults demonstrate the correlation between the safety margin, the driving\npolicy quality, and the ODD shedding light on the relative risk associated with\ndifferent AV providers. This work contributes to AV safety assessment and aids\nin addressing legislative and insurance concerns surrounding this emerging\ntechnology.\n","authors":["Alessandro Zanardi","Andrea Censi","Margherita Atzei","Luigi Di Lillo","Emilio Frazzoli"],"pdf_url":"https://arxiv.org/pdf/2308.01050v2.pdf","comment":"updated affiliations"},{"id":"http://arxiv.org/abs/2308.07117v1","updated":"2023-08-14T12:56:31Z","published":"2023-08-14T12:56:31Z","title":"iSTFTNet2: Faster and More Lightweight iSTFT-Based Neural Vocoder Using\n 1D-2D CNN","summary":" The inverse short-time Fourier transform network (iSTFTNet) has garnered\nattention owing to its fast, lightweight, and high-fidelity speech synthesis.\nIt obtains these characteristics using a fast and lightweight 1D CNN as the\nbackbone and replacing some neural processes with iSTFT. Owing to the\ndifficulty of a 1D CNN to model high-dimensional spectrograms, the frequency\ndimension is reduced via temporal upsampling. However, this strategy\ncompromises the potential to enhance the speed. Therefore, we propose\niSTFTNet2, an improved variant of iSTFTNet with a 1D-2D CNN that employs 1D and\n2D CNNs to model temporal and spectrogram structures, respectively. We designed\na 2D CNN that performs frequency upsampling after conversion in a few-frequency\nspace. 
This design facilitates the modeling of high-dimensional spectrograms\nwithout compromising the speed. The results demonstrated that iSTFTNet2 made\niSTFTNet faster and more lightweight with comparable speech quality. Audio\nsamples are available at\nhttps://www.kecl.ntt.co.jp/people/kaneko.takuhiro/projects/istftnet2/.\n","authors":["Takuhiro Kaneko","Hirokazu Kameoka","Kou Tanaka","Shogo Seki"],"pdf_url":"https://arxiv.org/pdf/2308.07117v1.pdf","comment":"Accepted to Interspeech 2023. Project page:\n https://www.kecl.ntt.co.jp/people/kaneko.takuhiro/projects/istftnet2/"},{"id":"http://arxiv.org/abs/2308.01138v2","updated":"2023-08-14T12:37:37Z","published":"2023-08-02T13:29:31Z","title":"Can We Transfer Noise Patterns? A Multi-environment Spectrum Analysis\n Model Using Generated Cases","summary":" Spectrum analysis systems in online water quality testing are designed to\ndetect types and concentrations of pollutants and enable regulatory agencies to\nrespond promptly to pollution incidents. However, spectral data-based testing\ndevices suffer from complex noise patterns when deployed in non-laboratory\nenvironments. To make the analysis model applicable to more environments, we\npropose a noise patterns transferring model, which takes the spectrum of\nstandard water samples in different environments as cases and learns the\ndifferences in their noise patterns, thus enabling noise patterns to transfer\nto unknown samples. Unfortunately, the inevitable sample-level baseline noise\nmakes the model unable to obtain the paired data that only differ in\ndataset-level environmental noise. To address the problem, we generate a\nsample-to-sample case-base to exclude the interference of sample-level noise on\ndataset-level noise learning, enhancing the system's learning performance.\nExperiments on spectral data with different background noises demonstrate the\ngood noise-transferring ability of the proposed method against baseline systems\nranging from wavelet denoising, deep neural networks, and generative models.\nFrom this research, we posit that our method can enhance the performance of DL\nmodels by generating high-quality cases. The source code is made publicly\navailable online at https://github.com/Magnomic/CNST.\n","authors":["Haiwen Du","Zheng Ju","Yu An","Honghui Du","Dongjie Zhu","Zhaoshuo Tian","Aonghus Lawlor","Ruihai Dong"],"pdf_url":"https://arxiv.org/pdf/2308.01138v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12334v2","updated":"2023-08-14T12:37:30Z","published":"2023-02-23T20:57:47Z","title":"Using Automated Algorithm Configuration for Parameter Control","summary":" Dynamic Algorithm Configuration (DAC) tackles the question of how to\nautomatically learn policies to control parameters of algorithms in a\ndata-driven fashion. This question has received considerable attention from the\nevolutionary community in recent years. Having a good benchmark collection to\ngain structural understanding on the effectiveness and limitations of different\nsolution methods for DAC is therefore strongly desirable. Following recent work\non proposing DAC benchmarks with well-understood theoretical properties and\nground truth information, in this work, we suggest as a new DAC benchmark the\ncontrolling of the key parameter $\\lambda$ in the\n$(1+(\\lambda,\\lambda))$~Genetic Algorithm for solving OneMax problems. 
We\nconduct a study on how to solve the DAC problem via the use of (static)\nautomated algorithm configuration on the benchmark, and propose techniques to\nsignificantly improve the performance of the approach. Our approach is able to\nconsistently outperform the default parameter control policy of the benchmark\nderived from previous theoretical work on sufficiently large problem sizes. We\nalso present new findings on the landscape of the parameter-control search\npolicies and propose methods to compute stronger baselines for the benchmark\nvia numerical approximations of the true optimal policies.\n","authors":["Deyao Chen","Maxim Buzdalov","Carola Doerr","Nguyen Dang"],"pdf_url":"https://arxiv.org/pdf/2302.12334v2.pdf","comment":"To appear in the Proc. of the ACM/SIGEVO Conference on Foundations of\n Genetic Algorithms (FOGA XVII)"},{"id":"http://arxiv.org/abs/2305.09241v3","updated":"2023-08-14T12:35:57Z","published":"2023-05-16T07:40:05Z","title":"Unlearnable Examples Give a False Sense of Security: Piercing through\n Unexploitable Data with Learnable Examples","summary":" Safeguarding data from unauthorized exploitation is vital for privacy and\nsecurity, especially in recent rampant research in security breach such as\nadversarial/membership attacks. To this end, \\textit{unlearnable examples}\n(UEs) have been recently proposed as a compelling protection, by adding\nimperceptible perturbation to data so that models trained on them cannot\nclassify them accurately on original clean distribution. Unfortunately, we find\nUEs provide a false sense of security, because they cannot stop unauthorized\nusers from utilizing other unprotected data to remove the protection, by\nturning unlearnable data into learnable again. Motivated by this observation,\nwe formally define a new threat by introducing \\textit{learnable unauthorized\nexamples} (LEs) which are UEs with their protection removed. The core of this\napproach is a novel purification process that projects UEs onto the manifold of\nLEs. This is realized by a new joint-conditional diffusion model which denoises\nUEs conditioned on the pixel and perceptual similarity between UEs and LEs.\nExtensive experiments demonstrate that LE delivers state-of-the-art countering\nperformance against both supervised UEs and unsupervised UEs in various\nscenarios, which is the first generalizable countermeasure to UEs across\nsupervised learning and unsupervised learning. Our code is available at\n\\url{https://github.com/jiangw-0/LE_JCDP}.\n","authors":["Wan Jiang","Yunfeng Diao","He Wang","Jianxin Sun","Meng Wang","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2305.09241v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12398v3","updated":"2023-08-14T11:47:41Z","published":"2023-03-22T09:06:07Z","title":"Multiscale Attention via Wavelet Neural Operators for Vision\n Transformers","summary":" Transformers have achieved widespread success in computer vision. At their\nheart, there is a Self-Attention (SA) mechanism, an inductive bias that\nassociates each token in the input with every other token through a weighted\nbasis. The standard SA mechanism has quadratic complexity with the sequence\nlength, which impedes its utility to long sequences appearing in high\nresolution vision. Recently, inspired by operator learning for PDEs, Adaptive\nFourier Neural Operators (AFNO) were introduced for high resolution attention\nbased on global convolution that is efficiently implemented via FFT. 
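The global-convolution-via-FFT idea can be sketched in one dimension as follows. This is a generic illustration with random data, assuming a circular convolution with a zero-padded filter; it is not the AFNO or MWA implementation.

import numpy as np

def global_filter(x, kernel):
    """Circular (global) convolution of a 1-D signal with a filter, computed
    as point-wise multiplication in the Fourier domain."""
    n = x.shape[-1]
    X = np.fft.rfft(x, n=n)
    K = np.fft.rfft(kernel, n=n)      # filter zero-padded to the signal length
    return np.fft.irfft(X * K, n=n)

rng = np.random.default_rng(0)
x = rng.normal(size=64)               # token sequence / flattened feature map
kernel = rng.normal(size=8)           # stands in for a learned global filter
y = global_filter(x, kernel)

# Sanity check against a direct circular convolution.
direct = np.array(
    [sum(kernel[k] * x[(i - k) % 64] for k in range(8)) for i in range(64)])
print(np.allclose(y, direct))         # True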
However,\nthe AFNO global filtering cannot well represent small and moderate scale\nstructures that commonly appear in natural images. To leverage the\ncoarse-to-fine scale structures we introduce a Multiscale Wavelet Attention\n(MWA) by leveraging wavelet neural operators which incurs linear complexity in\nthe sequence size. We replace the attention in ViT with MWA and our experiments\nwith CIFAR and Tiny-ImageNet classification demonstrate significant improvement\nover alternative Fourier-based attentions such as AFNO and Global Filter\nNetwork (GFN).\n","authors":["Anahita Nekoozadeh","Mohammad Reza Ahmadzadeh","Zahra Mardani"],"pdf_url":"https://arxiv.org/pdf/2303.12398v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07074v1","updated":"2023-08-14T11:16:28Z","published":"2023-08-14T11:16:28Z","title":"#InsTag: Instruction Tagging for Diversity and Complexity Analysis","summary":" Foundation language models obtain the instruction-following ability through\nsupervised fine-tuning (SFT). Diversity and complexity are considered critical\nfactors of a successful SFT dataset, while their definitions remain obscure and\nlack quantitative analyses. In this work, we propose InsTag, an open-set\nfine-grained tagger, to tag samples within SFT datasets based on semantics and\nintentions and define instruction diversity and complexity regarding tags. We\nobtain 6.6K tags to describe comprehensive user queries. Then we analyze\npopular open-sourced SFT datasets and find that the model ability grows with\nmore diverse and complex data. Based on this observation, we propose a data\nselector based on InsTag to select 6K diverse and complex samples from\nopen-source datasets and fine-tune models on InsTag-selected data. The\nresulting models, TagLM, outperform open-source models based on considerably\nlarger SFT data evaluated by MT-Bench, echoing the importance of query\ndiversity and complexity. We open-source InsTag in\nhttps://github.com/OFA-Sys/InsTag.\n","authors":["Keming Lu","Hongyi Yuan","Zheng Yuan","Runji Lin","Junyang Lin","Chuanqi Tan","Chang Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.07074v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07061v1","updated":"2023-08-14T10:45:51Z","published":"2023-08-14T10:45:51Z","title":"Machine Unlearning: Solutions and Challenges","summary":" Machine learning models may inadvertently memorize sensitive, unauthorized,\nor malicious data, posing risks of privacy violations, security breaches, and\nperformance deterioration. To address these issues, machine unlearning has\nemerged as a critical technique to selectively remove specific training data\npoints' influence on trained models. This paper provides a comprehensive\ntaxonomy and analysis of machine unlearning research. We categorize existing\nresearch into exact unlearning that algorithmically removes data influence\nentirely and approximate unlearning that efficiently minimizes influence\nthrough limited parameter updates. By reviewing the state-of-the-art solutions,\nwe critically discuss their advantages and limitations. Furthermore, we propose\nfuture directions to advance machine unlearning and establish it as an\nessential capability for trustworthy and adaptive machine learning. 
This paper\nprovides researchers with a roadmap of open problems, encouraging impactful\ncontributions to address real-world needs for selective data removal.\n","authors":["Jie Xu","Zihan Wu","Cong Wang","Xiaohua Jia"],"pdf_url":"https://arxiv.org/pdf/2308.07061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07052v1","updated":"2023-08-14T10:23:25Z","published":"2023-08-14T10:23:25Z","title":"Diagnosis of Scalp Disorders using Machine Learning and Deep Learning\n Approach -- A Review","summary":" The morbidity of scalp diseases is minuscule compared to other diseases, but\nthe impact on the patient's life is enormous. It is common for people to\nexperience scalp problems that include Dandruff, Psoriasis, Tinea-Capitis,\nAlopecia and Atopic-Dermatitis. In accordance with WHO research, approximately\n70% of adults have problems with their scalp. It has been demonstrated in\ndescriptive research that hair quality is impaired by impaired scalp, but these\nimpacts are reversible with early diagnosis and treatment. Deep Learning\nadvances have demonstrated the effectiveness of CNN paired with FCN in\ndiagnosing scalp and skin disorders. In one proposed Deep-Learning-based scalp\ninspection and diagnosis system, an imaging microscope and a trained model are\ncombined with an app that classifies scalp disorders accurately with an average\nprecision of 97.41%- 99.09%. Another research dealt with classifying the\nPsoriasis using the CNN with an accuracy of 82.9%. As part of another study, an\nML based algorithm was also employed. It accurately classified the healthy\nscalp and alopecia areata with 91.4% and 88.9% accuracy with SVM and KNN\nalgorithms. Using deep learning models to diagnose scalp related diseases has\nimproved due to advancements i computation capabilities and computer vision,\nbut there remains a wide horizon for further improvements.\n","authors":["Hrishabh Tiwari","Jatin Moolchandani","Shamla Mantri"],"pdf_url":"https://arxiv.org/pdf/2308.07052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07051v1","updated":"2023-08-14T10:22:51Z","published":"2023-08-14T10:22:51Z","title":"Fourier neural operator for learning solutions to macroscopic traffic\n flow models: Application to the forward and inverse problems","summary":" Deep learning methods are emerging as popular computational tools for solving\nforward and inverse problems in traffic flow. In this paper, we study a neural\noperator framework for learning solutions to nonlinear hyperbolic partial\ndifferential equations with applications in macroscopic traffic flow models. In\nthis framework, an operator is trained to map heterogeneous and sparse traffic\ninput data to the complete macroscopic traffic state in a supervised learning\nsetting. We chose a physics-informed Fourier neural operator ($\\pi$-FNO) as the\noperator, where an additional physics loss based on a discrete conservation law\nregularizes the problem during training to improve the shock predictions. We\nalso propose to use training data generated from random piecewise constant\ninput data to systematically capture the shock and rarefied solutions. From\nexperiments using the LWR traffic flow model, we found superior accuracy in\npredicting the density dynamics of a ring-road network and urban signalized\nroad. 
We also found that the operator can be trained using simple traffic\ndensity dynamics, e.g., consisting of $2-3$ vehicle queues and $1-2$ traffic\nsignal cycles, and it can predict density dynamics for heterogeneous vehicle\nqueue distributions and multiple traffic signal cycles $(\\geq 2)$ with an\nacceptable error. The extrapolation error grew sub-linearly with input\ncomplexity for a proper choice of the model architecture and training data.\nAdding a physics regularizer aided in learning long-term traffic density\ndynamics, especially for problems with periodic boundary data.\n","authors":["Bilal Thonnam Thodi","Sai Venkata Ramana Ambadipudi","Saif Eddin Jabari"],"pdf_url":"https://arxiv.org/pdf/2308.07051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07048v1","updated":"2023-08-14T10:18:24Z","published":"2023-08-14T10:18:24Z","title":"UIPC-MF: User-Item Prototype Connection Matrix Factorization for\n Explainable Collaborative Filtering","summary":" Recommending items to potentially interested users has been an important\ncommercial task that faces two main challenges: accuracy and explainability.\nWhile most collaborative filtering models rely on statistical computations on a\nlarge scale of interaction data between users and items and can achieve high\nperformance, they often lack clear explanatory power. We propose UIPC-MF, a\nprototype-based matrix factorization method for explainable collaborative\nfiltering recommendations. In UIPC-MF, both users and items are associated with\nsets of prototypes, capturing general collaborative attributes. To enhance\nexplainability, UIPC-MF learns connection weights that reflect the associative\nrelations between user and item prototypes for recommendations. UIPC-MF\noutperforms other prototype-based baseline methods in terms of Hit Ratio and\nNormalized Discounted Cumulative Gain on three datasets, while also providing\nbetter transparency.\n","authors":["Lei Pan","Von-Wun Soo"],"pdf_url":"https://arxiv.org/pdf/2308.07048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07047v1","updated":"2023-08-14T10:16:12Z","published":"2023-08-14T10:16:12Z","title":"No Regularization is Needed: An Efficient and Effective Model for\n Incomplete Label Distribution Learning","summary":" Label Distribution Learning (LDL) assigns soft labels, a.k.a. degrees, to a\nsample. In reality, it is always laborious to obtain complete degrees, giving\nbirth to the Incomplete LDL (InLDL). However, InLDL often suffers from\nperformance degeneration. To remedy it, existing methods need one or more\nexplicit regularizations, leading to burdensome parameter tuning and extra\ncomputation. We argue that label distribution itself may provide useful prior,\nwhen used appropriately, the InLDL problem can be solved without any explicit\nregularization. In this paper, we offer a rational alternative to use such a\nprior. Our intuition is that large degrees are likely to get more concern, the\nsmall ones are easily overlooked, whereas the missing degrees are completely\nneglected in InLDL. To learn an accurate label distribution, it is crucial not\nto ignore the small observed degrees but to give them properly large weights,\nwhile gradually increasing the weights of the missing degrees. To this end, we\nfirst define a weighted empirical risk and derive upper bounds between the\nexpected risk and the weighted empirical risk, which reveals in principle that\nweighting plays an implicit regularization role. 
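As a generic illustration, a weighted empirical risk simply scales each per-sample loss by a weight, so that up-weighted degrees are fitted more closely. The weights below are made up for illustration and are not the scheme derived in the paper.

import numpy as np

def weighted_empirical_risk(pred, target, weights):
    """(1/n) * sum_i w_i * (pred_i - target_i)^2; larger weights act like an
    implicit regularizer by forcing a closer fit on those degrees."""
    pred, target, weights = map(np.asarray, (pred, target, weights))
    return float(np.mean(weights * (pred - target) ** 2))

# Observed label degrees (some small) and a hypothetical up-weighting of the
# small ones.
target  = [0.60, 0.25, 0.10, 0.05]
pred    = [0.55, 0.30, 0.05, 0.10]
weights = [1.0, 1.0, 2.0, 2.0]
print(weighted_empirical_risk(pred, target, weights))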
Then, by using the prior of\ndegrees, we design a weighted scheme and verify its effectiveness. To sum up,\nour model has four advantages, it is 1) model selection free, as no explicit\nregularization is imposed; 2) with closed form solution (sub-problem) and\neasy-to-implement (a few lines of codes); 3) with linear computational\ncomplexity in the number of samples, thus scalable to large datasets; 4)\ncompetitive with state-of-the-arts even without any explicit regularization.\n","authors":["Xiang Li","Songcan Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07047v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2011.05885v2","updated":"2023-08-14T10:15:17Z","published":"2020-11-11T16:25:45Z","title":"Leveraged Matrix Completion with Noise","summary":" Completing low-rank matrices from subsampled measurements has received much\nattention in the past decade. Existing works indicate that\n$\\mathcal{O}(nr\\log^2(n))$ datums are required to theoretically secure the\ncompletion of an $n \\times n$ noisy matrix of rank $r$ with high probability,\nunder some quite restrictive assumptions: (1) the underlying matrix must be\nincoherent; (2) observations follow the uniform distribution. The\nrestrictiveness is partially due to ignoring the roles of the leverage score\nand the oracle information of each element. In this paper, we employ the\nleverage scores to characterize the importance of each element and\nsignificantly relax assumptions to: (1) not any other structure assumptions are\nimposed on the underlying low-rank matrix; (2) elements being observed are\nappropriately dependent on their importance via the leverage score. Under these\nassumptions, instead of uniform sampling, we devise an ununiform/biased\nsampling procedure that can reveal the ``importance'' of each observed element.\nOur proofs are supported by a novel approach that phrases sufficient optimality\nconditions based on the Golfing Scheme, which would be of independent interest\nto the wider areas. Theoretical findings show that we can provably recover an\nunknown $n\\times n$ matrix of rank $r$ from just about $\\mathcal{O}(nr\\log^2\n(n))$ entries, even when the observed entries are corrupted with a small amount\nof noisy information. The empirical results align precisely with our theories.\n","authors":["Xinjian Huang","Weiwei Liu","Bo Du","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2011.05885v2.pdf","comment":"This manuscript has been accepted for publication as a regular paper\n in the IEEE Transactions on Cybernetics"},{"id":"http://arxiv.org/abs/2308.07037v1","updated":"2023-08-14T09:56:35Z","published":"2023-08-14T09:56:35Z","title":"Bayesian Flow Networks","summary":" This paper introduces Bayesian Flow Networks (BFNs), a new class of\ngenerative model in which the parameters of a set of independent distributions\nare modified with Bayesian inference in the light of noisy data samples, then\npassed as input to a neural network that outputs a second, interdependent\ndistribution. Starting from a simple prior and iteratively updating the two\ndistributions yields a generative procedure similar to the reverse process of\ndiffusion models; however it is conceptually simpler in that no forward process\nis required. Discrete and continuous-time loss functions are derived for\ncontinuous, discretised and discrete data, along with sample generation\nprocedures. 
Notably, the network inputs for discrete data lie on the\nprobability simplex, and are therefore natively differentiable, paving the way\nfor gradient-based sample guidance and few-step generation in discrete domains\nsuch as language modelling. The loss function directly optimises data\ncompression and places no restrictions on the network architecture. In our\nexperiments BFNs achieve competitive log-likelihoods for image modelling on\ndynamically binarized MNIST and CIFAR-10, and outperform all known discrete\ndiffusion models on the text8 character-level language modelling task.\n","authors":["Alex Graves","Rupesh Kumar Srivastava","Timothy Atkinson","Faustino Gomez"],"pdf_url":"https://arxiv.org/pdf/2308.07037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07032v1","updated":"2023-08-14T09:45:28Z","published":"2023-08-14T09:45:28Z","title":"S3IM: Stochastic Structural SIMilarity and Its Unreasonable\n Effectiveness for Neural Fields","summary":" Recently, Neural Radiance Field (NeRF) has shown great success in rendering\nnovel-view images of a given scene by learning an implicit representation with\nonly posed RGB images. NeRF and relevant neural field methods (e.g., neural\nsurface representation) typically optimize a point-wise loss and make\npoint-wise predictions, where one data point corresponds to one pixel.\nUnfortunately, this line of research failed to use the collective supervision\nof distant pixels, although it is known that pixels in an image or scene can\nprovide rich structural information. To the best of our knowledge, we are the\nfirst to design a nonlocal multiplex training paradigm for NeRF and relevant\nneural field methods via a novel Stochastic Structural SIMilarity (S3IM) loss\nthat processes multiple data points as a whole set instead of process multiple\ninputs independently. Our extensive experiments demonstrate the unreasonable\neffectiveness of S3IM in improving NeRF and neural surface representation for\nnearly free. The improvements of quality metrics can be particularly\nsignificant for those relatively difficult tasks: e.g., the test MSE loss\nunexpectedly drops by more than 90% for TensoRF and DVGO over eight novel view\nsynthesis tasks; a 198% F-score gain and a 64% Chamfer $L_{1}$ distance\nreduction for NeuS over eight surface reconstruction tasks. Moreover, S3IM is\nconsistently robust even with sparse inputs, corrupted images, and dynamic\nscenes.\n","authors":["Zeke Xie","Xindi Yang","Yujie Yang","Qi Sun","Yixiang Jiang","Haoran Wang","Yunfeng Cai","Mingming Sun"],"pdf_url":"https://arxiv.org/pdf/2308.07032v1.pdf","comment":"ICCV 2023 main conference. Code: https://github.com/Madaoer/S3IM. 14\n pages, 5 figures, 17 tables"},{"id":"http://arxiv.org/abs/2303.01664v2","updated":"2023-08-14T09:22:18Z","published":"2023-03-03T01:57:16Z","title":"Miipher: A Robust Speech Restoration Model Integrating Self-Supervised\n Speech and Text Representations","summary":" Speech restoration (SR) is a task of converting degraded speech signals into\nhigh-quality ones. In this study, we propose a robust SR model called Miipher,\nand apply Miipher to a new SR application: increasing the amount of\nhigh-quality training data for speech generation by converting speech samples\ncollected from the Web to studio-quality. 
To make our SR model robust against\nvarious degradation, we use (i) a speech representation extracted from w2v-BERT\nfor the input feature, and (ii) a text representation extracted from\ntranscripts via PnG-BERT as a linguistic conditioning feature. Experiments show\nthat Miipher (i) is robust against various audio degradation and (ii) enable us\nto train a high-quality text-to-speech (TTS) model from restored speech samples\ncollected from the Web. Audio samples are available at our demo page:\ngoogle.github.io/df-conformer/miipher/\n","authors":["Yuma Koizumi","Heiga Zen","Shigeki Karita","Yifan Ding","Kohei Yatabe","Nobuyuki Morioka","Yu Zhang","Wei Han","Ankur Bapna","Michiel Bacchiani"],"pdf_url":"https://arxiv.org/pdf/2303.01664v2.pdf","comment":"Accepted to WASPAA 2023"},{"id":"http://arxiv.org/abs/2308.03330v2","updated":"2023-08-14T09:16:34Z","published":"2023-08-07T06:23:24Z","title":"Expediting Neural Network Verification via Network Reduction","summary":" A wide range of verification methods have been proposed to verify the safety\nproperties of deep neural networks ensuring that the networks function\ncorrectly in critical applications. However, many well-known verification tools\nstill struggle with complicated network architectures and large network sizes.\nIn this work, we propose a network reduction technique as a pre-processing\nmethod prior to verification. The proposed method reduces neural networks via\neliminating stable ReLU neurons, and transforming them into a sequential neural\nnetwork consisting of ReLU and Affine layers which can be handled by the most\nverification tools. We instantiate the reduction technique on the\nstate-of-the-art complete and incomplete verification tools, including\nalpha-beta-crown, VeriNet and PRIMA. Our experiments on a large set of\nbenchmarks indicate that the proposed technique can significantly reduce neural\nnetworks and speed up existing verification tools. Furthermore, the experiment\nresults also show that network reduction can improve the availability of\nexisting verification tools on many networks by reducing them into sequential\nneural networks.\n","authors":["Yuyi Zhong","Ruiwei Wang","Siau-Cheng Khoo"],"pdf_url":"https://arxiv.org/pdf/2308.03330v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13926v2","updated":"2023-08-14T09:04:14Z","published":"2023-06-24T10:21:11Z","title":"Graph Neural Networks Provably Benefit from Structural Information: A\n Feature Learning Perspective","summary":" Graph neural networks (GNNs) have pioneered advancements in graph\nrepresentation learning, exhibiting superior feature learning and performance\nover multilayer perceptrons (MLPs) when handling graph inputs. However,\nunderstanding the feature learning aspect of GNNs is still in its initial\nstage. This study aims to bridge this gap by investigating the role of graph\nconvolution within the context of feature learning theory in neural networks\nusing gradient descent training. We provide a distinct characterization of\nsignal learning and noise memorization in two-layer graph convolutional\nnetworks (GCNs), contrasting them with two-layer convolutional neural networks\n(CNNs). Our findings reveal that graph convolution significantly augments the\nbenign overfitting regime over the counterpart CNNs, where signal learning\nsurpasses noise memorization, by approximately factor $\\sqrt{D}^{q-2}$, with\n$D$ denoting a node's expected degree and $q$ being the power of the ReLU\nactivation function where $q > 2$. 
These findings highlight a substantial\ndiscrepancy between GNNs and MLPs in terms of feature learning and\ngeneralization capacity after gradient descent training, a conclusion further\nsubstantiated by our empirical simulations.\n","authors":["Wei Huang","Yuan Cao","Haonan Wang","Xin Cao","Taiji Suzuki"],"pdf_url":"https://arxiv.org/pdf/2306.13926v2.pdf","comment":"33 pages, 7 figures. We have provided a clearer roadmap"},{"id":"http://arxiv.org/abs/2308.07013v1","updated":"2023-08-14T09:00:58Z","published":"2023-08-14T09:00:58Z","title":"Learning to Optimize LSM-trees: Towards A Reinforcement Learning based\n Key-Value Store for Dynamic Workloads","summary":" LSM-trees are widely adopted as the storage backend of key-value stores.\nHowever, optimizing the system performance under dynamic workloads has not been\nsufficiently studied or evaluated in previous work. To fill the gap, we present\nRusKey, a key-value store with the following new features: (1) RusKey is a\nfirst attempt to orchestrate LSM-tree structures online to enable robust\nperformance under the context of dynamic workloads; (2) RusKey is the first\nstudy to use Reinforcement Learning (RL) to guide LSM-tree transformations; (3)\nRusKey includes a new LSM-tree design, named FLSM-tree, for an efficient\ntransition between different compaction policies -- the bottleneck of dynamic\nkey-value stores. We justify the superiority of the new design with theoretical\nanalysis; (4) RusKey requires no prior workload knowledge for system\nadjustment, in contrast to state-of-the-art techniques. Experiments show that\nRusKey exhibits strong performance robustness in diverse workloads, achieving\nup to 4x better end-to-end performance than the RocksDB system under various\nsettings.\n","authors":["Dingheng Mo","Fanchao Chen","Siqiang Luo","Caihua Shan"],"pdf_url":"https://arxiv.org/pdf/2308.07013v1.pdf","comment":"25 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.07012v1","updated":"2023-08-14T08:59:59Z","published":"2023-08-14T08:59:59Z","title":"Greedy online change point detection","summary":" Standard online change point detection (CPD) methods tend to have large false\ndiscovery rates as their detections are sensitive to outliers. To overcome this\ndrawback, we propose Greedy Online Change Point Detection (GOCPD), a\ncomputationally appealing method which finds change points by maximizing the\nprobability of the data coming from the (temporal) concatenation of two\nindependent models. We show that, for time series with a single change point,\nthis objective is unimodal and thus CPD can be accelerated via ternary search\nwith logarithmic complexity. 
We demonstrate the effectiveness of GOCPD on\nsynthetic data and validate our findings on real-world univariate and\nmultivariate settings.\n","authors":["Jou-Hui Ho","Felipe Tobar"],"pdf_url":"https://arxiv.org/pdf/2308.07012v1.pdf","comment":"Accepted at IEEE MLSP 2023"},{"id":"http://arxiv.org/abs/2105.10377v4","updated":"2023-08-14T08:26:31Z","published":"2021-05-21T14:36:39Z","title":"Adaptive Filters in Graph Convolutional Neural Networks","summary":" Over the last few years, we have witnessed the availability of an increasing\ndata generated from non-Euclidean domains, which are usually represented as\ngraphs with complex relationships, and Graph Neural Networks (GNN) have gained\na high interest because of their potential in processing graph-structured data.\nIn particular, there is a strong interest in exploring the possibilities in\nperforming convolution on graphs using an extension of the GNN architecture,\ngenerally referred to as Graph Convolutional Neural Networks (ConvGNN).\nConvolution on graphs has been achieved mainly in two forms: spectral and\nspatial convolutions. Due to the higher flexibility in exploring and exploiting\nthe graph structure of data, there is recently an increasing interest in\ninvestigating the possibilities that the spatial approach can offer. The idea\nof finding a way to adapt the network behaviour to the inputs they process to\nmaximize the total performances has aroused much interest in the neural\nnetworks literature over the years. This paper presents a novel method to adapt\nthe behaviour of a ConvGNN to the input proposing a method to perform spatial\nconvolution on graphs using input-specific filters, which are dynamically\ngenerated from nodes feature vectors. The experimental assessment confirms the\ncapabilities of the proposed approach, which achieves satisfying results using\na low number of filters.\n","authors":["Andrea Apicella","Francesco Isgrò","Andrea Pollastro","Roberto Prevete"],"pdf_url":"https://arxiv.org/pdf/2105.10377v4.pdf","comment":"This paper has been published in its final version on \\textit{Pattern\n Recognition} journal with DOI https://doi.org/10.1016/j.patcog.2023.109867 in\n Open Access mode. Please consider it as final and peer-reviewed version"},{"id":"http://arxiv.org/abs/2307.16680v3","updated":"2023-08-14T07:59:36Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, This paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. 
These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v3.pdf","comment":"draft version"},{"id":"http://arxiv.org/abs/2210.05674v4","updated":"2023-08-14T07:57:48Z","published":"2022-10-11T07:39:08Z","title":"Semi-supervised detection of structural damage using Variational\n Autoencoder and a One-Class Support Vector Machine","summary":" In recent years, Artificial Neural Networks (ANNs) have been introduced in\nStructural Health Monitoring (SHM) systems. A semi-supervised method with a\ndata-driven approach allows the ANN training on data acquired from an undamaged\nstructural condition to detect structural damages. In standard approaches,\nafter the training stage, a decision rule is manually defined to detect\nanomalous data. However, this process could be made automatic using machine\nlearning methods, whom performances are maximised using hyperparameter\noptimization techniques. The paper proposes a semi-supervised method with a\ndata-driven approach to detect structural anomalies. The methodology consists\nof: (i) a Variational Autoencoder (VAE) to approximate undamaged data\ndistribution and (ii) a One-Class Support Vector Machine (OC-SVM) to\ndiscriminate different health conditions using damage sensitive features\nextracted from VAE's signal reconstruction. The method is applied to a scale\nsteel structure that was tested in nine damage's scenarios by IASC-ASCE\nStructural Health Monitoring Task Group.\n","authors":["Andrea Pollastro","Giusiana Testa","Antonio Bilotta","Roberto Prevete"],"pdf_url":"https://arxiv.org/pdf/2210.05674v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06987v1","updated":"2023-08-14T07:51:15Z","published":"2023-08-14T07:51:15Z","title":"Deep convolutional neural networks for cyclic sensor data","summary":" Predictive maintenance plays a critical role in ensuring the uninterrupted\noperation of industrial systems and mitigating the potential risks associated\nwith system failures. This study focuses on sensor-based condition monitoring\nand explores the application of deep learning techniques using a hydraulic\nsystem testbed dataset. Our investigation involves comparing the performance of\nthree models: a baseline model employing conventional methods, a single CNN\nmodel with early sensor fusion, and a two-lane CNN model (2L-CNN) with late\nsensor fusion. The baseline model achieves an impressive test error rate of 1%\nby employing late sensor fusion, where feature extraction is performed\nindividually for each sensor. However, the CNN model encounters challenges due\nto the diverse sensor characteristics, resulting in an error rate of 20.5%. To\nfurther investigate this issue, we conduct separate training for each sensor\nand observe variations in accuracy. Additionally, we evaluate the performance\nof the 2L-CNN model, which demonstrates significant improvement by reducing the\nerror rate by 33% when considering the combination of the least and most\noptimal sensors. 
This study underscores the importance of effectively\naddressing the complexities posed by multi-sensor systems in sensor-based\ncondition monitoring.\n","authors":["Payman Goodarzi","Yannick Robin","Andreas Schütze","Tizian Schneider"],"pdf_url":"https://arxiv.org/pdf/2308.06987v1.pdf","comment":"4 pages, 3 figures, submitted to the IEEE Sensors Conference"},{"id":"http://arxiv.org/abs/2304.02849v2","updated":"2023-08-14T07:38:32Z","published":"2023-04-06T03:45:07Z","title":"Logistic-Normal Likelihoods for Heteroscedastic Label Noise","summary":" A natural way of estimating heteroscedastic label noise in regression is to\nmodel the observed (potentially noisy) target as a sample from a normal\ndistribution, whose parameters can be learned by minimizing the negative\nlog-likelihood. This formulation has desirable loss attenuation properties, as\nit reduces the contribution of high-error examples. Intuitively, this behavior\ncan improve robustness against label noise by reducing overfitting. We propose\nan extension of this simple and probabilistic approach to classification that\nhas the same desirable loss attenuation properties. Furthermore, we discuss and\naddress some practical challenges of this extension. We evaluate the\neffectiveness of the method by measuring its robustness against label noise in\nclassification. We perform enlightening experiments exploring the inner\nworkings of the method, including sensitivity to hyperparameters, ablation\nstudies, and other insightful analyses.\n","authors":["Erik Englesson","Amir Mehrpanah","Hossein Azizpour"],"pdf_url":"https://arxiv.org/pdf/2304.02849v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06983v1","updated":"2023-08-14T07:35:43Z","published":"2023-08-14T07:35:43Z","title":"pNNCLR: Stochastic Pseudo Neighborhoods for Contrastive Learning based\n Unsupervised Representation Learning Problems","summary":" Nearest neighbor (NN) sampling provides more semantic variations than\npre-defined transformations for self-supervised learning (SSL) based image\nrecognition problems. However, its performance is restricted by the quality of\nthe support set, which holds positive samples for the contrastive loss. In this\nwork, we show that the quality of the support set plays a crucial role in any\nnearest neighbor based method for SSL. We then provide a refined baseline\n(pNNCLR) to the nearest neighbor based SSL approach (NNCLR). To this end, we\nintroduce pseudo nearest neighbors (pNN) to control the quality of the support\nset, wherein, rather than sampling the nearest neighbors, we sample in the\nvicinity of hard nearest neighbors by varying the magnitude of the resultant\nvector and employing a stochastic sampling strategy to improve the performance.\nAdditionally, to stabilize the effects of uncertainty in NN-based learning, we\nemploy a smooth-weight-update approach for training the proposed network.\nEvaluation of the proposed method on multiple public image recognition and\nmedical image recognition datasets shows that it performs up to 8 percent\nbetter than the baseline nearest neighbor method, and is comparable to other\npreviously proposed SSL methods.\n","authors":["Momojit Biswas","Himanshu Buckchash","Dilip K. 
Prasad"],"pdf_url":"https://arxiv.org/pdf/2308.06983v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.15941v2","updated":"2023-08-14T07:15:21Z","published":"2023-07-29T09:29:09Z","title":"Continual Learning in Predictive Autoscaling","summary":" Predictive Autoscaling is used to forecast the workloads of servers and\nprepare the resources in advance to ensure service level objectives (SLOs) in\ndynamic cloud environments. However, in practice, its prediction task often\nsuffers from performance degradation under abnormal traffics caused by external\nevents (such as sales promotional activities and applications\nre-configurations), for which a common solution is to re-train the model with\ndata of a long historical period, but at the expense of high computational and\nstorage costs. To better address this problem, we propose a replay-based\ncontinual learning method, i.e., Density-based Memory Selection and Hint-based\nNetwork Learning Model (DMSHM), using only a small part of the historical log\nto achieve accurate predictions. First, we discover the phenomenon of sample\noverlap when applying replay-based continual learning in prediction tasks. In\norder to surmount this challenge and effectively integrate new sample\ndistribution, we propose a density-based sample selection strategy that\nutilizes kernel density estimation to calculate sample density as a reference\nto compute sample weight, and employs weight sampling to construct a new memory\nset. Then we implement hint-based network learning based on hint representation\nto optimize the parameters. Finally, we conduct experiments on public and\nindustrial datasets to demonstrate that our proposed method outperforms\nstate-of-the-art continual learning methods in terms of memory capacity and\nprediction accuracy. Furthermore, we demonstrate remarkable practicability of\nDMSHM in real industrial applications.\n","authors":["Hongyan Hao","Zhixuan Chu","Shiyi Zhu","Gangwei Jiang","Yan Wang","Caigao Jiang","James Zhang","Wei Jiang","Siqiao Xue","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.15941v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06973v1","updated":"2023-08-14T07:11:55Z","published":"2023-08-14T07:11:55Z","title":"Routing Recovery for UAV Networks with Deliberate Attacks: A\n Reinforcement Learning based Approach","summary":" The unmanned aerial vehicle (UAV) network is popular these years due to its\nvarious applications. In the UAV network, routing is significantly affected by\nthe distributed network topology, leading to the issue that UAVs are vulnerable\nto deliberate damage. Hence, this paper focuses on the routing plan and\nrecovery for UAV networks with attacks. In detail, a deliberate attack model\nbased on the importance of nodes is designed to represent enemy attacks. Then,\na node importance ranking mechanism is presented, considering the degree of\nnodes and link importance. However, it is intractable to handle the routing\nproblem by traditional methods for UAV networks, since link connections change\nwith the UAV availability. Hence, an intelligent algorithm based on\nreinforcement learning is proposed to recover the routing path when UAVs are\nattacked. 
Simulations are conducted and numerical results verify the proposed\nmechanism performs better than other referred methods.\n","authors":["Sijie He","Ziye Jia","Chao Dong","Wei Wang","Yilu Cao","Yang Yang","Qihui Wu"],"pdf_url":"https://arxiv.org/pdf/2308.06973v1.pdf","comment":"IEEE GLOBECOM 2023, 6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.06965v1","updated":"2023-08-14T06:43:59Z","published":"2023-08-14T06:43:59Z","title":"AutoAssign+: Automatic Shared Embedding Assignment in Streaming\n Recommendation","summary":" In the domain of streaming recommender systems, conventional methods for\naddressing new user IDs or item IDs typically involve assigning initial ID\nembeddings randomly. However, this practice results in two practical\nchallenges: (i) Items or users with limited interactive data may yield\nsuboptimal prediction performance. (ii) Embedding new IDs or low-frequency IDs\nnecessitates consistently expanding the embedding table, leading to unnecessary\nmemory consumption. In light of these concerns, we introduce a reinforcement\nlearning-driven framework, namely AutoAssign+, that facilitates Automatic\nShared Embedding Assignment Plus. To be specific, AutoAssign+ utilizes an\nIdentity Agent as an actor network, which plays a dual role: (i) Representing\nlow-frequency IDs field-wise with a small set of shared embeddings to enhance\nthe embedding initialization, and (ii) Dynamically determining which ID\nfeatures should be retained or eliminated in the embedding table. The policy of\nthe agent is optimized with the guidance of a critic network. To evaluate the\neffectiveness of our approach, we perform extensive experiments on three\ncommonly used benchmark datasets. Our experiment results demonstrate that\nAutoAssign+ is capable of significantly enhancing recommendation performance by\nmitigating the cold-start problem. Furthermore, our framework yields a\nreduction in memory usage of approximately 20-30%, verifying its practical\neffectiveness and efficiency for streaming recommender systems.\n","authors":["Ziru Liu","Kecheng Chen","Fengyi Song","Bo Chen","Xiangyu Zhao","Huifeng Guo","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2308.06965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06961v1","updated":"2023-08-14T06:32:52Z","published":"2023-08-14T06:32:52Z","title":"Graph Structural Residuals: A Learning Approach to Diagnosis","summary":" Traditional model-based diagnosis relies on constructing explicit system\nmodels, a process that can be laborious and expertise-demanding. In this paper,\nwe propose a novel framework that combines concepts of model-based diagnosis\nwith deep graph structure learning. This data-driven approach leverages data to\nlearn the system's underlying structure and provide dynamic observations,\nrepresented by two distinct graph adjacency matrices. 
Our work facilitates a\nseamless integration of graph structure learning with model-based diagnosis by\nmaking three main contributions: (i) redefining the constructs of system\nrepresentation, observations, and faults (ii) introducing two distinct versions\nof a self-supervised graph structure learning model architecture and (iii)\ndemonstrating the potential of our data-driven diagnostic method through\nexperiments on a system of coupled oscillators.\n","authors":["Jan Lukas Augustin","Oliver Niggemann"],"pdf_url":"https://arxiv.org/pdf/2308.06961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06960v1","updated":"2023-08-14T06:32:02Z","published":"2023-08-14T06:32:02Z","title":"Search to Fine-tune Pre-trained Graph Neural Networks for Graph-level\n Tasks","summary":" Recently, graph neural networks (GNNs) have shown its unprecedented success\nin many graph-related tasks. However, GNNs face the label scarcity issue as\nother neural networks do. Thus, recent efforts try to pre-train GNNs on a\nlarge-scale unlabeled graph and adapt the knowledge from the unlabeled graph to\nthe target downstream task. The adaptation is generally achieved by fine-tuning\nthe pre-trained GNNs with a limited number of labeled data. Despite the\nimportance of fine-tuning, current GNNs pre-training works often ignore\ndesigning a good fine-tuning strategy to better leverage transferred knowledge\nand improve the performance on downstream tasks. Only few works start to\ninvestigate a better fine-tuning strategy for pre-trained GNNs. But their\ndesigns either have strong assumptions or overlook the data-aware issue for\nvarious downstream datasets. Therefore, we aim to design a better fine-tuning\nstrategy for pre-trained GNNs to improve the model performance in this paper.\nGiven a pre-trained GNN, we propose to search to fine-tune pre-trained graph\nneural networks for graph-level tasks (S2PGNN), which adaptively design a\nsuitable fine-tuning framework for the given labeled data on the downstream\ntask. To ensure the improvement brought by searching fine-tuning strategy, we\ncarefully summarize a proper search space of fine-tuning framework that is\nsuitable for GNNs. The empirical studies show that S2PGNN can be implemented on\nthe top of 10 famous pre-trained GNNs and consistently improve their\nperformance. Besides, S2PGNN achieves better performance than existing\nfine-tuning strategies within and outside the GNN area. Our code is publicly\navailable at \\url{https://anonymous.4open.science/r/code_icde2024-A9CB/}.\n","authors":["Zhili Wang","Shimin Di","Lei Chen","Xiaofang Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.06960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06959v1","updated":"2023-08-14T06:29:09Z","published":"2023-08-14T06:29:09Z","title":"Data-Driven Allocation of Preventive Care With Application to Diabetes\n Mellitus Type II","summary":" Problem Definition. Increasing costs of healthcare highlight the importance\nof effective disease prevention. However, decision models for allocating\npreventive care are lacking.\n Methodology/Results. In this paper, we develop a data-driven decision model\nfor determining a cost-effective allocation of preventive treatments to\npatients at risk. Specifically, we combine counterfactual inference, machine\nlearning, and optimization techniques to build a scalable decision model that\ncan exploit high-dimensional medical data, such as the data found in modern\nelectronic health records. 
Our decision model is evaluated based on electronic\nhealth records from 89,191 prediabetic patients. We compare the allocation of\npreventive treatments (metformin) prescribed by our data-driven decision model\nwith that of current practice. We find that if our approach is applied to the\nU.S. population, it can yield annual savings of $1.1 billion. Finally, we\nanalyze the cost-effectiveness under varying budget levels.\n Managerial Implications. Our work supports decision-making in health\nmanagement, with the goal of achieving effective disease prevention at lower\ncosts. Importantly, our decision model is generic and can thus be used for\neffective allocation of preventive care for other preventable diseases.\n","authors":["Mathias Kraus","Stefan Feuerriegel","Maytal Saar-Tsechansky"],"pdf_url":"https://arxiv.org/pdf/2308.06959v1.pdf","comment":"Accepted by Manufacturing & Service Operations Management"},{"id":"http://arxiv.org/abs/2308.06957v1","updated":"2023-08-14T06:22:49Z","published":"2023-08-14T06:22:49Z","title":"CEmb-SAM: Segment Anything Model with Condition Embedding for Joint\n Learning from Heterogeneous Datasets","summary":" Automated segmentation of ultrasound images can assist medical experts with\ndiagnostic and therapeutic procedures. Although using the common modality of\nultrasound, one typically needs separate datasets in order to segment, for\nexample, different anatomical structures or lesions with different levels of\nmalignancy. In this paper, we consider the problem of jointly learning from\nheterogeneous datasets so that the model can improve generalization abilities\nby leveraging the inherent variability among datasets. We merge the\nheterogeneous datasets into one dataset and refer to each component dataset as\na subgroup. We propose to train a single segmentation model so that the model\ncan adapt to each sub-group. For robust segmentation, we leverage recently\nproposed Segment Anything model (SAM) in order to incorporate sub-group\ninformation into the model. We propose SAM with Condition Embedding block\n(CEmb-SAM) which encodes sub-group conditions and combines them with image\nembeddings from SAM. The conditional embedding block effectively adapts SAM to\neach image sub-group by incorporating dataset properties through learnable\nparameters for normalization. Experiments show that CEmb-SAM outperforms the\nbaseline methods on ultrasound image segmentation for peripheral nerves and\nbreast cancer. The experiments highlight the effectiveness of Cemb-SAM in\nlearning from heterogeneous datasets in medical image segmentation tasks.\n","authors":["Dongik Shin","Beomsuk Kim","Seungjun Baek"],"pdf_url":"https://arxiv.org/pdf/2308.06957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06952v1","updated":"2023-08-14T06:04:50Z","published":"2023-08-14T06:04:50Z","title":"Channel-Wise Contrastive Learning for Learning with Noisy Labels","summary":" In real-world datasets, noisy labels are pervasive. The challenge of learning\nwith noisy labels (LNL) is to train a classifier that discerns the actual\nclasses from given instances. For this, the model must identify features\nindicative of the authentic labels. While research indicates that genuine label\ninformation is embedded in the learned features of even inaccurately labeled\ndata, it's often intertwined with noise, complicating its direct application.\nAddressing this, we introduce channel-wise contrastive learning (CWCL). 
This\nmethod distinguishes authentic label information from noise by undertaking\ncontrastive learning across diverse channels. Unlike conventional instance-wise\ncontrastive learning (IWCL), CWCL tends to yield more nuanced and resilient\nfeatures aligned with the authentic labels. Our strategy is twofold: firstly,\nusing CWCL to extract pertinent features to identify cleanly labeled samples,\nand secondly, progressively fine-tuning using these samples. Evaluations on\nseveral benchmark datasets validate our method's superiority over existing\napproaches.\n","authors":["Hui Kang","Sheng Liu","Huaxi Huang","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.06952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06947v1","updated":"2023-08-14T05:54:32Z","published":"2023-08-14T05:54:32Z","title":"Knowing Where to Focus: Event-aware Transformer for Video Grounding","summary":" Recent DETR-based video grounding models have made the model directly predict\nmoment timestamps without any hand-crafted components, such as a pre-defined\nproposal or non-maximum suppression, by learning moment queries. However, their\ninput-agnostic moment queries inevitably overlook an intrinsic temporal\nstructure of a video, providing limited positional information. In this paper,\nwe formulate an event-aware dynamic moment query to enable the model to take\nthe input-specific content and positional information of the video into\naccount. To this end, we present two levels of reasoning: 1) Event reasoning\nthat captures distinctive event units constituting a given video using a slot\nattention mechanism; and 2) moment reasoning that fuses the moment queries with\na given sentence through a gated fusion transformer layer and learns\ninteractions between the moment queries and video-sentence representations to\npredict moment timestamps. Extensive experiments demonstrate the effectiveness\nand efficiency of the event-aware dynamic moment queries, outperforming\nstate-of-the-art approaches on several video grounding benchmarks.\n","authors":["Jinhyun Jang","Jungin Park","Jin Kim","Hyeongjun Kwon","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2308.06947v1.pdf","comment":"ICCV 2023. Code is available at https://github.com/jinhyunj/EaTR"},{"id":"http://arxiv.org/abs/2308.06945v1","updated":"2023-08-14T05:37:07Z","published":"2023-08-14T05:37:07Z","title":"Semantic-aware Network for Aerial-to-Ground Image Synthesis","summary":" Aerial-to-ground image synthesis is an emerging and challenging problem that\naims to synthesize a ground image from an aerial image. Due to the highly\ndifferent layout and object representation between the aerial and ground\nimages, existing approaches usually fail to transfer the components of the\naerial scene into the ground scene. In this paper, we propose a novel framework\nto explore the challenges by imposing enhanced structural alignment and\nsemantic awareness. We introduce a novel semantic-attentive feature\ntransformation module that allows to reconstruct the complex geographic\nstructures by aligning the aerial feature to the ground layout. Furthermore, we\npropose semantic-aware loss functions by leveraging a pre-trained segmentation\nnetwork. The network is enforced to synthesize realistic objects across various\nclasses by separately calculating losses for different classes and balancing\nthem. 
Extensive experiments including comparisons with previous methods and\nablation studies show the effectiveness of the proposed framework both\nqualitatively and quantitatively.\n","authors":["Jinhyun Jang","Taeyong Song","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2308.06945v1.pdf","comment":"ICIP 2021. Code is available at https://github.com/jinhyunj/SANet"},{"id":"http://arxiv.org/abs/2210.13869v4","updated":"2023-08-14T05:33:48Z","published":"2022-10-25T09:45:49Z","title":"A jet tagging algorithm of graph network with HaarPooling message\n passing","summary":" Recently methods of graph neural networks (GNNs) have been applied to solving\nthe problems in high energy physics (HEP) and have shown its great potential\nfor quark-gluon tagging with graph representation of jet events. In this paper,\nwe introduce an approach of GNNs combined with a HaarPooling operation to\nanalyze the events, called HaarPooling Message Passing neural network (HMPNet).\nIn HMPNet, HaarPooling not only extracts the features of graph, but embeds\nadditional information obtained by clustering of k-means of different particle\nfeatures. We construct Haarpooling from five different features: absolute\nenergy $\\log E$, transverse momentum $\\log p_T$, relative coordinates\n$(\\Delta\\eta,\\Delta\\phi)$, the mixed ones $(\\log E, \\log p_T)$ and $(\\log E,\n\\log p_T, \\Delta\\eta,\\Delta\\phi)$. The results show that an appropriate\nselection of information for HaarPooling enhances the accuracy of quark-gluon\ntagging, as adding extra information of $\\log P_T$ to the HMPNet outperforms\nall the others, whereas adding relative coordinates information\n$(\\Delta\\eta,\\Delta\\phi)$ is not very effective. This implies that by adding\neffective particle features from HaarPooling can achieve much better results\nthan solely pure message passing neutral network (MPNN) can do, which\ndemonstrates significant improvement of feature extraction via the pooling\nprocess. Finally we compare the HMPNet study, ordering by $p_T$, with other\nstudies and prove that the HMPNet is also a good choice of GNN algorithms for\njet tagging.\n","authors":["Fei Ma","Feiyi Liu","Wei Li"],"pdf_url":"https://arxiv.org/pdf/2210.13869v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.09833v3","updated":"2023-08-14T05:29:40Z","published":"2022-08-21T07:47:05Z","title":"Label-Noise Learning with Intrinsically Long-Tailed Data","summary":" Label noise is one of the key factors that lead to the poor generalization of\ndeep learning models. Existing label-noise learning methods usually assume that\nthe ground-truth classes of the training data are balanced. However, the\nreal-world data is often imbalanced, leading to the inconsistency between\nobserved and intrinsic class distribution with label noises. In this case, it\nis hard to distinguish clean samples from noisy samples on the intrinsic tail\nclasses with the unknown intrinsic class distribution. In this paper, we\npropose a learning framework for label-noise learning with intrinsically\nlong-tailed data. Specifically, we propose two-stage bi-dimensional sample\nselection (TABASCO) to better separate clean samples from noisy samples,\nespecially for the tail classes. TABASCO consists of two new separation metrics\nthat complement each other to compensate for the limitation of using a single\nmetric in sample separation. Extensive experiments on benchmarks demonstrate\nthe effectiveness of our method. 
Our code is available at\nhttps://github.com/Wakings/TABASCO.\n","authors":["Yang Lu","Yiliang Zhang","Bo Han","Yiu-ming Cheung","Hanzi Wang"],"pdf_url":"https://arxiv.org/pdf/2208.09833v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2302.12444v3","updated":"2023-08-14T05:22:10Z","published":"2023-02-24T04:10:54Z","title":"On the Training Instability of Shuffling SGD with Batch Normalization","summary":" We uncover how SGD interacts with batch normalization and can exhibit\nundesirable training dynamics such as divergence. More precisely, we study how\nSingle Shuffle (SS) and Random Reshuffle (RR) -- two widely used variants of\nSGD -- interact surprisingly differently in the presence of batch\nnormalization: RR leads to much more stable evolution of training loss than SS.\nAs a concrete example, for regression using a linear network with batch\nnormalization, we prove that SS and RR converge to distinct global optima that\nare \"distorted\" away from gradient descent. Thereafter, for classification we\ncharacterize conditions under which training divergence for SS and RR can, and\ncannot occur. We present explicit constructions to show how SS leads to\ndistorted optima in regression and divergence for classification, whereas RR\navoids both distortion and divergence. We validate our results by confirming\nthem empirically in realistic settings, and conclude that the separation\nbetween SS and RR used with batch normalization is relevant in practice.\n","authors":["David X. Wu","Chulhee Yun","Suvrit Sra"],"pdf_url":"https://arxiv.org/pdf/2302.12444v3.pdf","comment":"ICML 2023 camera-ready version, added references; 75 pages"},{"id":"http://arxiv.org/abs/2308.03669v2","updated":"2023-08-14T05:17:09Z","published":"2023-08-07T15:40:34Z","title":"Diffusion Model in Causal Inference with Unmeasured Confounders","summary":" We study how to extend the use of the diffusion model to answer the causal\nquestion from the observational data under the existence of unmeasured\nconfounders. In Pearl's framework of using a Directed Acyclic Graph (DAG) to\ncapture the causal intervention, a Diffusion-based Causal Model (DCM) was\nproposed incorporating the diffusion model to answer the causal questions more\naccurately, assuming that all of the confounders are observed. However,\nunmeasured confounders in practice exist, which hinders DCM from being\napplicable. To alleviate this limitation of DCM, we propose an extended model\ncalled Backdoor Criterion based DCM (BDCM), whose idea is rooted in the\nBackdoor criterion to find the variables in DAG to be included in the decoding\nprocess of the diffusion model so that we can extend DCM to the case with\nunmeasured confounders. Synthetic data experiment demonstrates that our\nproposed model captures the counterfactual distribution more precisely than DCM\nunder the unmeasured confounders.\n","authors":["Tatsuhiro Shimizu"],"pdf_url":"https://arxiv.org/pdf/2308.03669v2.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.02293v2","updated":"2023-08-14T05:06:09Z","published":"2023-08-04T12:57:13Z","title":"A stochastic optimization approach to train non-linear neural networks\n with a higher-order variation regularization","summary":" While highly expressive parametric models including deep neural networks have\nan advantage to model complicated concepts, training such highly non-linear\nmodels is known to yield a high risk of notorious overfitting. 
To address this\nissue, this study considers a $(k,q)$th order variation regularization\n($(k,q)$-VR), which is defined as the $q$th-powered integral of the absolute\n$k$th order derivative of the parametric models to be trained; penalizing the\n$(k,q)$-VR is expected to yield a smoother function, which is expected to avoid\noverfitting. Particularly, $(k,q)$-VR encompasses the conventional\n(general-order) total variation with $q=1$. While the $(k,q)$-VR terms applied\nto general parametric models are computationally intractable due to the\nintegration, this study provides a stochastic optimization algorithm, that can\nefficiently train general models with the $(k,q)$-VR without conducting\nexplicit numerical integration. The proposed approach can be applied to the\ntraining of even deep neural networks whose structure is arbitrary, as it can\nbe implemented by only a simple stochastic gradient descent algorithm and\nautomatic differentiation. Our numerical experiments demonstrate that the\nneural networks trained with the $(k,q)$-VR terms are more ``resilient'' than\nthose with the conventional parameter regularization. The proposed algorithm\nalso can be extended to the physics-informed training of neural networks\n(PINNs).\n","authors":["Akifumi Okuno"],"pdf_url":"https://arxiv.org/pdf/2308.02293v2.pdf","comment":"13 pages, 24 figures"},{"id":"http://arxiv.org/abs/2308.06935v1","updated":"2023-08-14T04:44:56Z","published":"2023-08-14T04:44:56Z","title":"Insurance pricing on price comparison websites via reinforcement\n learning","summary":" The emergence of price comparison websites (PCWs) has presented insurers with\nunique challenges in formulating effective pricing strategies. Operating on\nPCWs requires insurers to strike a delicate balance between competitive\npremiums and profitability, amidst obstacles such as low historical conversion\nrates, limited visibility of competitors' actions, and a dynamic market\nenvironment. In addition to this, the capital intensive nature of the business\nmeans pricing below the risk levels of customers can result in solvency issues\nfor the insurer. To address these challenges, this paper introduces\nreinforcement learning (RL) framework that learns the optimal pricing policy by\nintegrating model-based and model-free methods. The model-based component is\nused to train agents in an offline setting, avoiding cold-start issues, while\nmodel-free algorithms are then employed in a contextual bandit (CB) manner to\ndynamically update the pricing policy to maximise the expected revenue. This\nfacilitates quick adaptation to evolving market dynamics and enhances algorithm\nefficiency and decision interpretability. The paper also highlights the\nimportance of evaluating pricing policies using an offline dataset in a\nconsistent fashion and demonstrates the superiority of the proposed methodology\nover existing off-the-shelf RL/CB approaches. We validate our methodology using\nsynthetic data, generated to reflect private commercially available data within\nreal-world insurers, and compare against 6 other benchmark approaches. 
Our\nhybrid agent outperforms these benchmarks in terms of sample efficiency and\ncumulative reward with the exception of an agent that has access to perfect\nmarket information which would not be available in a real-world set-up.\n","authors":["Tanut Treetanthiploet","Yufei Zhang","Lukasz Szpruch","Isaac Bowers-Barnard","Henrietta Ridley","James Hickey","Chris Pearce"],"pdf_url":"https://arxiv.org/pdf/2308.06935v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06929v1","updated":"2023-08-14T04:15:09Z","published":"2023-08-14T04:15:09Z","title":"Predicting Listing Prices In Dynamic Short Term Rental Markets Using\n Machine Learning Models","summary":" Our research group wanted to take on the difficult task of predicting prices\nin a dynamic market. And short term rentals such as Airbnb listings seemed to\nbe the perfect proving ground to do such a thing. Airbnb has revolutionized the\ntravel industry by providing a platform for homeowners to rent out their\nproperties to travelers. The pricing of Airbnb rentals is prone to high\nfluctuations, with prices changing frequently based on demand, seasonality, and\nother factors. Accurate prediction of Airbnb rental prices is crucial for hosts\nto optimize their revenue and for travelers to make informed booking decisions.\nIn this project, we aim to predict the prices of Airbnb rentals using a machine\nlearning modeling approach.\n Our project expands on earlier research in the area of analyzing Airbnb\nrental prices by taking a methodical machine learning approach as well as\nincorporating sentiment analysis into our feature engineering. We intend to\ngain a deeper understanding on periodic changes of Airbnb rental prices. The\nprimary objective of this study is to construct an accurate machine learning\nmodel for predicting Airbnb rental prices specifically in Austin, Texas. Our\nproject's secondary objective is to identify the key factors that drive Airbnb\nrental prices and to investigate how these factors vary across different\nlocations and property types.\n","authors":["Sam Chapman","Seifey Mohammad","Kimberly Villegas"],"pdf_url":"https://arxiv.org/pdf/2308.06929v1.pdf","comment":"40 pages, 10 tables, 12 figures"},{"id":"http://arxiv.org/abs/2308.06925v1","updated":"2023-08-14T04:03:51Z","published":"2023-08-14T04:03:51Z","title":"CBA: Improving Online Continual Learning via Continual Bias Adaptor","summary":" Online continual learning (CL) aims to learn new knowledge and consolidate\npreviously learned knowledge from non-stationary data streams. Due to the\ntime-varying training setting, the model learned from a changing distribution\neasily forgets the previously learned knowledge and biases toward the newly\nreceived task. To address this problem, we propose a Continual Bias Adaptor\n(CBA) module to augment the classifier network to adapt to catastrophic\ndistribution change during training, such that the classifier network is able\nto learn a stable consolidation of previously learned tasks. In the testing\nstage, CBA can be removed which introduces no additional computation cost and\nmemory overhead. 
We theoretically reveal the reason why the proposed method can\neffectively alleviate catastrophic distribution shifts, and empirically\ndemonstrate its effectiveness through extensive experiments based on four\nrehearsal-based baselines and three public continual learning benchmarks.\n","authors":["Quanziang Wang","Renzhen Wang","Yichen Wu","Xixi Jia","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2308.06925v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2303.06937v2","updated":"2023-08-14T03:46:51Z","published":"2023-03-13T09:11:54Z","title":"TARGET: Federated Class-Continual Learning via Exemplar-Free\n Distillation","summary":" This paper focuses on an under-explored yet important problem: Federated\nClass-Continual Learning (FCCL), where new classes are dynamically added in\nfederated learning. Existing FCCL works suffer from various limitations, such\nas requiring additional datasets or storing the private data from previous\ntasks. In response, we first demonstrate that non-IID data exacerbates\ncatastrophic forgetting issue in FL. Then we propose a novel method called\nTARGET (federat\\textbf{T}ed cl\\textbf{A}ss-continual lea\\textbf{R}nin\\textbf{G}\nvia \\textbf{E}xemplar-free dis\\textbf{T}illation), which alleviates\ncatastrophic forgetting in FCCL while preserving client data privacy. Our\nproposed method leverages the previously trained global model to transfer\nknowledge of old tasks to the current task at the model level. Moreover, a\ngenerator is trained to produce synthetic data to simulate the global\ndistribution of data on each client at the data level. Compared to previous\nFCCL methods, TARGET does not require any additional datasets or storing real\ndata from previous tasks, which makes it ideal for data-sensitive scenarios.\n","authors":["Jie Zhang","Chen Chen","Weiming Zhuang","Lingjuan Lv"],"pdf_url":"https://arxiv.org/pdf/2303.06937v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.06053v2","updated":"2023-08-14T03:19:03Z","published":"2023-08-11T10:05:53Z","title":"Cost-effective On-device Continual Learning over Memory Hierarchy with\n Miro","summary":" Continual learning (CL) trains NN models incrementally from a continuous\nstream of tasks. To remember previously learned knowledge, prior studies store\nold samples over a memory hierarchy and replay them when new tasks arrive. Edge\ndevices that adopt CL to preserve data privacy are typically energy-sensitive\nand thus require high model accuracy while not compromising energy efficiency,\ni.e., cost-effectiveness. Our work is the first to explore the design space of\nhierarchical memory replay-based CL to gain insights into achieving\ncost-effectiveness on edge devices. We present Miro, a novel system runtime\nthat carefully integrates our insights into the CL framework by enabling it to\ndynamically configure the CL system based on resource states for the best\ncost-effectiveness. To reach this goal, Miro also performs online profiling on\nparameters with clear accuracy-energy trade-offs and adapts to optimal values\nwith low overhead. 
Extensive evaluations show that Miro significantly\noutperforms baseline systems we build for comparison, consistently achieving\nhigher cost-effectiveness.\n","authors":["Xinyue Ma","Suyeon Jeong","Minjia Zhang","Di Wang","Jonghyun Choi","Myeongjae Jeon"],"pdf_url":"https://arxiv.org/pdf/2308.06053v2.pdf","comment":"This paper is to be published in the 29th Annual International\n Conference on Mobile Computing and Networking (ACM MobiCom 23)"},{"id":"http://arxiv.org/abs/2308.06912v1","updated":"2023-08-14T03:14:38Z","published":"2023-08-14T03:14:38Z","title":"CausalLM is not optimal for in-context learning","summary":" Recent empirical evidence indicates that transformer based in-context\nlearning performs better when using a prefix language model (prefixLM), in\nwhich in-context samples can all attend to each other, compared to causal\nlanguage models (causalLM), which use auto-regressive attention that prohibits\nin-context samples to attend to future samples. While this result is intuitive,\nit is not understood from a theoretical perspective. In this paper we take a\ntheoretical approach and analyze the convergence behavior of prefixLM and\ncausalLM under a certain parameter construction. Our analysis shows that both\nLM types converge to their stationary points at a linear rate, but that while\nprefixLM converges to the optimal solution of linear regression, causalLM\nconvergence dynamics follows that of an online gradient descent algorithm,\nwhich is not guaranteed to be optimal even as the number of samples grows\ninfinitely. We supplement our theoretical claims with empirical experiments\nover synthetic and real tasks and using various types of transformers. Our\nexperiments verify that causalLM consistently underperforms prefixLM in all\nsettings.\n","authors":["Nan Ding","Tomer Levinboim","Jialin Wu","Sebastian Goodman","Radu Soricut"],"pdf_url":"https://arxiv.org/pdf/2308.06912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06911v1","updated":"2023-08-14T03:12:29Z","published":"2023-08-14T03:12:29Z","title":"GIT-Mol: A Multi-modal Large Language Model for Molecular Science with\n Graph, Image, and Text","summary":" Large language models have made significant strides in natural language\nprocessing, paving the way for innovative applications including molecular\nrepresentation and generation. However, most existing single-modality\napproaches cannot capture the abundant and complex information in molecular\ndata. Here, we introduce GIT-Mol, a multi-modal large language model that\nintegrates the structure Graph, Image, and Text information, including the\nSimplified Molecular Input Line Entry System (SMILES) and molecular captions.\nTo facilitate the integration of multi-modal molecular data, we propose\nGIT-Former, a novel architecture capable of mapping all modalities into a\nunified latent space. 
Our study develops an innovative any-to-language\nmolecular translation strategy and achieves a 10%-15% improvement in molecular\ncaptioning, a 5%-10% accuracy increase in property prediction, and a 20% boost\nin molecule generation validity compared to baseline or single-modality models.\n","authors":["Pengfei Liu","Yiming Ren","Zhixiang Ren"],"pdf_url":"https://arxiv.org/pdf/2308.06911v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.06907v1","updated":"2023-08-14T02:59:27Z","published":"2023-08-14T02:59:27Z","title":"Generative Interpretation","summary":" We introduce generative interpretation, a new approach to estimating\ncontractual meaning using large language models. As AI triumphalism is the\norder of the day, we proceed by way of grounded case studies, each illustrating\nthe capabilities of these novel tools in distinct ways. Taking well-known\ncontracts opinions, and sourcing the actual agreements that they adjudicated,\nwe show that AI models can help factfinders ascertain ordinary meaning in\ncontext, quantify ambiguity, and fill gaps in parties' agreements. We also\nillustrate how models can calculate the probative value of individual pieces of\nextrinsic evidence. After offering best practices for the use of these models\ngiven their limitations, we consider their implications for judicial practice\nand contract theory. Using LLMs permits courts to estimate what the parties\nintended cheaply and accurately, and as such generative interpretation\nunsettles the current interpretative stalemate. Their use responds to\nefficiency-minded textualists and justice-oriented contextualists, who argue\nabout whether parties will prefer cost and certainty or accuracy and fairness.\nParties--and courts--would prefer a middle path, in which adjudicators strive\nto predict what the contract really meant, admitting just enough context to\napproximate reality while avoiding unguided and biased assimilation of\nevidence. As generative interpretation offers this possibility, we argue it can\nbecome the new workhorse of contractual interpretation.\n","authors":["Yonathan A. Arbel","David Hoffman"],"pdf_url":"https://arxiv.org/pdf/2308.06907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04669v2","updated":"2023-08-14T02:52:02Z","published":"2023-08-09T02:27:23Z","title":"A General Implicit Framework for Fast NeRF Composition and Rendering","summary":" A variety of Neural Radiance Fields (NeRF) methods have recently achieved\nremarkable success in high render speed. However, current accelerating methods\nare specialized and incompatible with various implicit methods, preventing\nreal-time composition over various types of NeRF works. Because NeRF relies on\nsampling along rays, it is possible to provide general guidance for\nacceleration. To that end, we propose a general implicit pipeline for composing\nNeRF objects quickly. Our method enables the casting of dynamic shadows within\nor between objects using analytical light sources while allowing multiple NeRF\nobjects to be seamlessly placed and rendered together with any arbitrary rigid\ntransformations. Mainly, our work introduces a new surface representation known\nas Neural Depth Fields (NeDF) that quickly determines the spatial relationship\nbetween objects by allowing direct intersection computation between rays and\nimplicit surfaces. 
It leverages an intersection neural network to query NeRF\nfor acceleration instead of depending on an explicit spatial structure.Our\nproposed method is the first to enable both the progressive and interactive\ncomposition of NeRF objects. Additionally, it also serves as a previewing\nplugin for a range of existing NeRF works.\n","authors":["Xinyu Gao","Ziyi Yang","Yunlu Zhao","Yuxiang Sun","Xiaogang Jin","Changqing Zou"],"pdf_url":"https://arxiv.org/pdf/2308.04669v2.pdf","comment":"7 pages for main content"},{"id":"http://arxiv.org/abs/2308.06895v1","updated":"2023-08-14T02:25:48Z","published":"2023-08-14T02:25:48Z","title":"Federated Classification in Hyperbolic Spaces via Secure Aggregation of\n Convex Hulls","summary":" Hierarchical and tree-like data sets arise in many applications, including\nlanguage processing, graph data mining, phylogeny and genomics. It is known\nthat tree-like data cannot be embedded into Euclidean spaces of finite\ndimension with small distortion. This problem can be mitigated through the use\nof hyperbolic spaces. When such data also has to be processed in a distributed\nand privatized setting, it becomes necessary to work with new federated\nlearning methods tailored to hyperbolic spaces. As an initial step towards the\ndevelopment of the field of federated learning in hyperbolic spaces, we propose\nthe first known approach to federated classification in hyperbolic spaces. Our\ncontributions are as follows. First, we develop distributed versions of convex\nSVM classifiers for Poincar\\'e discs. In this setting, the information conveyed\nfrom clients to the global classifier are convex hulls of clusters present in\nindividual client data. Second, to avoid label switching issues, we introduce a\nnumber-theoretic approach for label recovery based on the so-called integer\n$B_h$ sequences. Third, we compute the complexity of the convex hulls in\nhyperbolic spaces to assess the extent of data leakage; at the same time, in\norder to limit the communication cost for the hulls, we propose a new\nquantization method for the Poincar\\'e disc coupled with Reed-Solomon-like\nencoding. Fourth, at server level, we introduce a new approach for aggregating\nconvex hulls of the clients based on balanced graph partitioning. We test our\nmethod on a collection of diverse data sets, including hierarchical single-cell\nRNA-seq data from different patients distributed across different repositories\nthat have stringent privacy constraints. The classification accuracy of our\nmethod is up to $\\sim 11\\%$ better than its Euclidean counterpart,\ndemonstrating the importance of privacy-preserving learning in hyperbolic\nspaces.\n","authors":["Saurav Prakash","Jin Sima","Chao Pan","Eli Chien","Olgica Milenkovic"],"pdf_url":"https://arxiv.org/pdf/2308.06895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06885v1","updated":"2023-08-14T01:37:02Z","published":"2023-08-14T01:37:02Z","title":"Bridging Offline-Online Evaluation with a Time-dependent and Popularity\n Bias-free Offline Metric for Recommenders","summary":" The evaluation of recommendation systems is a complex task. The offline and\nonline evaluation metrics for recommender systems are ambiguous in their true\nobjectives. The majority of recently published papers benchmark their methods\nusing ill-posed offline evaluation methodology that often fails to predict true\nonline performance. Because of this, the impact that academic research has on\nthe industry is reduced. 
The aim of our research is to investigate and compare\nthe online performance of offline evaluation metrics. We show that penalizing\npopular items and considering the time of transactions during the evaluation\nsignificantly improves our ability to choose the best recommendation model for\na live recommender system. Our results, averaged over five large-size\nreal-world live data procured from recommenders, aim to help the academic\ncommunity to understand better offline evaluation and optimization criteria\nthat are more relevant for real applications of recommender systems.\n","authors":["Petr Kasalický","Rodrigo Alves","Pavel Kordík"],"pdf_url":"https://arxiv.org/pdf/2308.06885v1.pdf","comment":"Accepted to evalRS 2023@KDD"},{"id":"http://arxiv.org/abs/2308.06884v1","updated":"2023-08-14T01:34:34Z","published":"2023-08-14T01:34:34Z","title":"Multi-Receiver Task-Oriented Communications via Multi-Task Deep Learning","summary":" This paper studies task-oriented, otherwise known as goal-oriented,\ncommunications, in a setting where a transmitter communicates with multiple\nreceivers, each with its own task to complete on a dataset, e.g., images,\navailable at the transmitter. A multi-task deep learning approach that involves\ntraining a common encoder at the transmitter and individual decoders at the\nreceivers is presented for joint optimization of completing multiple tasks and\ncommunicating with multiple receivers. By providing efficient resource\nallocation at the edge of 6G networks, the proposed approach allows the\ncommunications system to adapt to varying channel conditions and achieves\ntask-specific objectives while minimizing transmission overhead. Joint training\nof the encoder and decoders using multi-task learning captures shared\ninformation across tasks and optimizes the communication process accordingly.\nBy leveraging the broadcast nature of wireless communications, multi-receiver\ntask-oriented communications (MTOC) reduces the number of transmissions\nrequired to complete tasks at different receivers. Performance evaluation\nconducted on the MNIST, Fashion MNIST, and CIFAR-10 datasets (with image\nclassification considered for different tasks) demonstrates the effectiveness\nof MTOC in terms of classification accuracy and resource utilization compared\nto single-task-oriented communication systems.\n","authors":["Yalin E. Sagduyu","Tugba Erpek","Aylin Yener","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2308.06884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06882v1","updated":"2023-08-14T01:28:19Z","published":"2023-08-14T01:28:19Z","title":"Quantifying Outlierness of Funds from their Categories using Supervised\n Similarity","summary":" Mutual fund categorization has become a standard tool for the investment\nmanagement industry and is extensively used by allocators for portfolio\nconstruction and manager selection, as well as by fund managers for peer\nanalysis and competitive positioning. As a result, a (unintended)\nmiscategorization or lack of precision can significantly impact allocation\ndecisions and investment fund managers. Here, we aim to quantify the effect of\nmiscategorization of funds utilizing a machine learning based approach. We\nformulate the problem of miscategorization of funds as a distance-based outlier\ndetection problem, where the outliers are the data-points that are far from the\nrest of the data-points in the given feature space. 
We implement and employ a\nRandom Forest (RF) based method of distance metric learning, and compute the\nso-called class-wise outlier measures for each data-point to identify outliers\nin the data. We test our implementation on various publicly available data\nsets, and then apply it to mutual fund data. We show that there is a strong\nrelationship between the outlier measures of the funds and their future returns\nand discuss the implications of our findings.\n","authors":["Dhruv Desai","Ashmita Dhiman","Tushar Sharma","Deepika Sharma","Dhagash Mehta","Stefano Pasquali"],"pdf_url":"https://arxiv.org/pdf/2308.06882v1.pdf","comment":"8 pages, 5 tables, 8 figures"},{"id":"http://arxiv.org/abs/2308.06878v1","updated":"2023-08-14T01:23:37Z","published":"2023-08-14T01:23:37Z","title":"AutoSeqRec: Autoencoder for Efficient Sequential Recommendation","summary":" Sequential recommendation demonstrates the capability to recommend items by\nmodeling the sequential behavior of users. Traditional methods typically treat\nusers as sequences of items, overlooking the collaborative relationships among\nthem. Graph-based methods incorporate collaborative information by utilizing\nthe user-item interaction graph. However, these methods sometimes face\nchallenges in terms of time complexity and computational efficiency. To address\nthese limitations, this paper presents AutoSeqRec, an incremental\nrecommendation model specifically designed for sequential recommendation tasks.\nAutoSeqRec is based on autoencoders and consists of an encoder and three\ndecoders within the autoencoder architecture. These components consider both\nthe user-item interaction matrix and the rows and columns of the item\ntransition matrix. The reconstruction of the user-item interaction matrix\ncaptures user long-term preferences through collaborative filtering. In\naddition, the rows and columns of the item transition matrix represent the item\nout-degree and in-degree hopping behavior, which allows for modeling the user's\nshort-term interests. When making incremental recommendations, only the input\nmatrices need to be updated, without the need to update parameters, which makes\nAutoSeqRec very efficient. Comprehensive evaluations demonstrate that\nAutoSeqRec outperforms existing methods in terms of accuracy, while showcasing\nits robustness and efficiency.\n","authors":["Sijia Liu","Jiahao Liu","Hansu Gu","Dongsheng Li","Tun Lu","Peng Zhang","Ning Gu"],"pdf_url":"https://arxiv.org/pdf/2308.06878v1.pdf","comment":"10 pages, accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2210.07346v2","updated":"2023-08-14T01:07:38Z","published":"2022-10-13T20:39:21Z","title":"An Embarrassingly Simple Backdoor Attack on Self-supervised Learning","summary":" As a new paradigm in machine learning, self-supervised learning (SSL) is\ncapable of learning high-quality representations of complex data without\nrelying on labels. In addition to eliminating the need for labeled data,\nresearch has found that SSL improves the adversarial robustness over supervised\nlearning since lacking labels makes it more challenging for adversaries to\nmanipulate model predictions. However, the extent to which this robustness\nsuperiority generalizes to other types of attacks remains an open question.\n We explore this question in the context of backdoor attacks. Specifically, we\ndesign and evaluate CTRL, an embarrassingly simple yet highly effective\nself-supervised backdoor attack. 
By only polluting a tiny fraction of training\ndata (<= 1%) with indistinguishable poisoning samples, CTRL causes any\ntrigger-embedded input to be misclassified to the adversary's designated class\nwith a high probability (>= 99%) at inference time. Our findings suggest that\nSSL and supervised learning are comparably vulnerable to backdoor attacks. More\nimportantly, through the lens of CTRL, we study the inherent vulnerability of\nSSL to backdoor attacks. With both empirical and analytical evidence, we reveal\nthat the representation invariance property of SSL, which benefits adversarial\nrobustness, may also be the very reason making SSL highly susceptible to\nbackdoor attacks. Our findings also imply that the existing defenses against\nsupervised backdoor attacks are not easily retrofitted to the unique\nvulnerability of SSL.\n","authors":["Changjiang Li","Ren Pang","Zhaohan Xi","Tianyu Du","Shouling Ji","Yuan Yao","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2210.07346v2.pdf","comment":"The 2023 International Conference on Computer Vision (ICCV '23)"},{"id":"http://arxiv.org/abs/2308.06873v1","updated":"2023-08-14T01:01:19Z","published":"2023-08-14T01:01:19Z","title":"SpeechX: Neural Codec Language Model as a Versatile Speech Transformer","summary":" Recent advancements in generative speech models based on audio-text prompts\nhave enabled remarkable innovations like high-quality zero-shot text-to-speech.\nHowever, existing models still face limitations in handling diverse audio-text\nspeech generation tasks involving transforming input speech and processing\naudio captured in adverse acoustic conditions. This paper introduces SpeechX, a\nversatile speech generation model capable of zero-shot TTS and various speech\ntransformation tasks, dealing with both clean and noisy signals. SpeechX\ncombines neural codec language modeling with multi-task learning using\ntask-dependent prompting, enabling unified and extensible modeling and\nproviding a consistent way for leveraging textual input in speech enhancement\nand transformation tasks. Experimental results show SpeechX's efficacy in\nvarious tasks, including zero-shot TTS, noise suppression, target speaker\nextraction, speech removal, and speech editing with or without background\nnoise, achieving comparable or superior performance to specialized models\nacross tasks. See https://aka.ms/speechx for demo samples.\n","authors":["Xiaofei Wang","Manthan Thakker","Zhuo Chen","Naoyuki Kanda","Sefik Emre Eskimez","Sanyuan Chen","Min Tang","Shujie Liu","Jinyu Li","Takuya Yoshioka"],"pdf_url":"https://arxiv.org/pdf/2308.06873v1.pdf","comment":"See https://aka.ms/speechx for demo samples"},{"id":"http://arxiv.org/abs/2307.10644v2","updated":"2023-08-14T00:55:01Z","published":"2023-07-20T07:14:58Z","title":"Fisher-Rao distance and pullback SPD cone distances between multivariate\n normal distributions","summary":" Data sets of multivariate normal distributions abound in many scientific\nareas like diffusion tensor imaging, structure tensor computer vision, radar\nsignal processing, machine learning, just to name a few. In order to process\nthose normal data sets for downstream tasks like filtering, classification or\nclustering, one needs to define proper notions of dissimilarities between\nnormals and paths joining them. The Fisher-Rao distance defined as the\nRiemannian geodesic distance induced by the Fisher information metric is such a\nprincipled metric distance which however is not known in closed-form except\nfor a few particular cases. 
In this work, we first report a fast and robust\nmethod to approximate arbitrarily finely the Fisher-Rao distance between\nmultivariate normal distributions. Second, we introduce a class of distances\nbased on diffeomorphic embeddings of the normal manifold into a submanifold of\nthe higher-dimensional symmetric positive-definite cone corresponding to the\nmanifold of centered normal distributions. We show that the projective Hilbert\ndistance on the cone yields a metric on the embedded normal submanifold and we\npullback that cone distance with its associated straight line Hilbert cone\ngeodesics to obtain a distance and smooth paths between normal distributions.\nCompared to the Fisher-Rao distance approximation, the pullback Hilbert cone\ndistance is computationally light since it requires to compute only the extreme\nminimal and maximal eigenvalues of matrices. Finally, we show how to use those\ndistances in clustering tasks.\n","authors":["Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2307.10644v2.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2211.00164v2","updated":"2023-08-14T00:16:23Z","published":"2022-10-31T22:12:48Z","title":"Agent-Controller Representations: Principled Offline RL with Rich\n Exogenous Information","summary":" Learning to control an agent from data collected offline in a rich\npixel-based visual observation space is vital for real-world applications of\nreinforcement learning (RL). A major challenge in this setting is the presence\nof input information that is hard to model and irrelevant to controlling the\nagent. This problem has been approached by the theoretical RL community through\nthe lens of exogenous information, i.e, any control-irrelevant information\ncontained in observations. For example, a robot navigating in busy streets\nneeds to ignore irrelevant information, such as other people walking in the\nbackground, textures of objects, or birds in the sky. In this paper, we focus\non the setting with visually detailed exogenous information, and introduce new\noffline RL benchmarks offering the ability to study this problem. We find that\ncontemporary representation learning techniques can fail on datasets where the\nnoise is a complex and time dependent process, which is prevalent in practical\napplications. To address these, we propose to use multi-step inverse models,\nwhich have seen a great deal of interest in the RL theory community, to learn\nAgent-Controller Representations for Offline-RL (ACRO). Despite being simple\nand requiring no reward, we show theoretically and empirically that the\nrepresentation created by this objective greatly outperforms baselines.\n","authors":["Riashat Islam","Manan Tomar","Alex Lamb","Yonathan Efroni","Hongyu Zang","Aniket Didolkar","Dipendra Misra","Xin Li","Harm van Seijen","Remi Tachet des Combes","John Langford"],"pdf_url":"https://arxiv.org/pdf/2211.00164v2.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2308.07496v1","updated":"2023-08-14T23:34:59Z","published":"2023-08-14T23:34:59Z","title":"ST-MLP: A Cascaded Spatio-Temporal Linear Framework with\n Channel-Independence Strategy for Traffic Forecasting","summary":" The criticality of prompt and precise traffic forecasting in optimizing\ntraffic flow management in Intelligent Transportation Systems (ITS) has drawn\nsubstantial scholarly focus. Spatio-Temporal Graph Neural Networks (STGNNs)\nhave been lauded for their adaptability to road graph structures. 
Yet, current\nresearch on STGNNs architectures often prioritizes complex designs, leading to\nelevated computational burdens with only minor enhancements in accuracy. To\naddress this issue, we propose ST-MLP, a concise spatio-temporal model solely\nbased on cascaded Multi-Layer Perceptron (MLP) modules and linear layers.\nSpecifically, we incorporate temporal information, spatial information and\npredefined graph structure with a successful implementation of the\nchannel-independence strategy - an effective technique in time series\nforecasting. Empirical results demonstrate that ST-MLP outperforms\nstate-of-the-art STGNNs and other models in terms of accuracy and computational\nefficiency. Our finding encourages further exploration of more concise and\neffective neural network architectures in the field of traffic forecasting.\n","authors":["Zepu Wang","Yuqi Nie","Peng Sun","Nam H. Nguyen","John Mulvey","H. Vincent Poor"],"pdf_url":"https://arxiv.org/pdf/2308.07496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11582v4","updated":"2023-08-14T23:33:28Z","published":"2023-03-21T04:17:03Z","title":"Adaptive Experimentation at Scale: A Computational Framework for\n Flexible Batches","summary":" Standard bandit algorithms that assume continual reallocation of measurement\neffort are challenging to implement due to delayed feedback and\ninfrastructural/organizational difficulties. Motivated by practical instances\ninvolving a handful of reallocation epochs in which outcomes are measured in\nbatches, we develop a computation-driven adaptive experimentation framework\nthat can flexibly handle batching. Our main observation is that normal\napproximations, which are universal in statistical inference, can also guide\nthe design of adaptive algorithms. By deriving a Gaussian sequential\nexperiment, we formulate a dynamic program that can leverage prior information\non average rewards. Instead of the typical theory-driven paradigm, we leverage\ncomputational tools and empirical benchmarking for algorithm development. In\nparticular, our empirical analysis highlights a simple yet effective algorithm,\nResidual Horizon Optimization, which iteratively solves a planning problem\nusing stochastic gradient descent. Our approach significantly improves\nstatistical power over standard methods, even when compared to Bayesian bandit\nalgorithms (e.g., Thompson sampling) that require full distributional knowledge\nof individual rewards. Overall, we expand the scope of adaptive experimentation\nto settings that are difficult for standard methods, involving limited\nadaptivity, low signal-to-noise ratio, and unknown reward distributions.\n","authors":["Ethan Che","Hongseok Namkoong"],"pdf_url":"https://arxiv.org/pdf/2303.11582v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07491v1","updated":"2023-08-14T22:58:54Z","published":"2023-08-14T22:58:54Z","title":"Adaptive Tracking of a Single-Rigid-Body Character in Various\n Environments","summary":" Since the introduction of DeepMimic [Peng et al. 2018], subsequent research\nhas focused on expanding the repertoire of simulated motions across various\nscenarios. In this study, we propose an alternative approach for this goal, a\ndeep reinforcement learning method based on the simulation of a\nsingle-rigid-body character. 
Using the centroidal dynamics model (CDM) to\nexpress the full-body character as a single rigid body (SRB) and training a\npolicy to track a reference motion, we can obtain a policy that is capable of\nadapting to various unobserved environmental changes and controller transitions\nwithout requiring any additional learning. Due to the reduced dimension of\nstate and action space, the learning process is sample-efficient. The final\nfull-body motion is kinematically generated in a physically plausible way,\nbased on the state of the simulated SRB character. The SRB simulation is\nformulated as a quadratic programming (QP) problem, and the policy outputs an\naction that allows the SRB character to follow the reference motion. We\ndemonstrate that our policy, efficiently trained within 30 minutes on an\nultraportable laptop, has the ability to cope with environments that have not\nbeen experienced during learning, such as running on uneven terrain or pushing\na box, and transitions between learned policies, without any additional\nlearning.\n","authors":["Taesoo Kwon","Taehong Gu","Jaewon Ahn","Yoonsang Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.11317v2","updated":"2023-08-14T22:45:57Z","published":"2022-12-21T19:27:51Z","title":"End-to-end AI framework for interpretable prediction of molecular and\n crystal properties","summary":" We introduce an end-to-end computational framework that allows for\nhyperparameter optimization using the DeepHyper library, accelerated model\ntraining, and interpretable AI inference. The framework is based on\nstate-of-the-art AI models including CGCNN, PhysNet, SchNet, MPNN,\nMPNN-transformer, and TorchMD-NET. We employ these AI models along with the\nbenchmark QM9, hMOF, and MD17 datasets to showcase how the models can predict\nuser-specified material properties within modern computing environments. We\ndemonstrate transferable applications in the modeling of small molecules,\ninorganic crystals and nanoporous metal organic frameworks with a unified,\nstandalone framework. We have deployed and tested this framework in the\nThetaGPU supercomputer at the Argonne Leadership Computing Facility, and in the\nDelta supercomputer at the National Center for Supercomputing Applications to\nprovide researchers with modern tools to conduct accelerated AI-driven\ndiscovery in leadership-class computing environments. We release these digital\nassets as open source scientific software in GitLab, and ready-to-use Jupyter\nnotebooks in Google Colab.\n","authors":["Hyun Park","Ruijie Zhu","E. A. Huerta","Santanu Chaudhuri","Emad Tajkhorshid","Donny Cooper"],"pdf_url":"https://arxiv.org/pdf/2212.11317v2.pdf","comment":"20 pages, 10 images, 6 tables; v2: accepted to Machine Learning:\n Science and Technology"},{"id":"http://arxiv.org/abs/2308.07486v1","updated":"2023-08-14T22:36:27Z","published":"2023-08-14T22:36:27Z","title":"O-1: Self-training with Oracle and 1-best Hypothesis","summary":" We introduce O-1, a new self-training objective to reduce training bias and\nunify training and evaluation metrics for speech recognition. O-1 is a faster\nvariant of Expected Minimum Bayes Risk (EMBR), that boosts the oracle\nhypothesis and can accommodate both supervised and unsupervised data. We\ndemonstrate the effectiveness of our approach in terms of recognition on\npublicly available SpeechStew datasets and a large-scale, in-house data set. 
On\nSpeechstew, the O-1 objective closes the gap between the actual and oracle\nperformance by 80\\% relative compared to EMBR which bridges the gap by 43\\%\nrelative. O-1 achieves 13\\% to 25\\% relative improvement over EMBR on the\nvarious datasets that SpeechStew comprises of, and a 12\\% relative gap\nreduction with respect to the oracle WER over EMBR training on the in-house\ndataset. Overall, O-1 results in a 9\\% relative improvement in WER over EMBR,\nthereby speaking to the scalability of the proposed objective for large-scale\ndatasets.\n","authors":["Murali Karthick Baskar","Andrew Rosenberg","Bhuvana Ramabhadran","Kartik Audhkhasi"],"pdf_url":"https://arxiv.org/pdf/2308.07486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07480v1","updated":"2023-08-14T22:17:33Z","published":"2023-08-14T22:17:33Z","title":"OCDaf: Ordered Causal Discovery with Autoregressive Flows","summary":" We propose OCDaf, a novel order-based method for learning causal graphs from\nobservational data. We establish the identifiability of causal graphs within\nmultivariate heteroscedastic noise models, a generalization of additive noise\nmodels that allow for non-constant noise variances. Drawing upon the structural\nsimilarities between these models and affine autoregressive normalizing flows,\nwe introduce a continuous search algorithm to find causal structures. Our\nexperiments demonstrate state-of-the-art performance across the Sachs and\nSynTReN benchmarks in Structural Hamming Distance (SHD) and Structural\nIntervention Distance (SID). Furthermore, we validate our identifiability\ntheory across various parametric and nonparametric synthetic datasets and\nshowcase superior performance compared to existing baselines.\n","authors":["Hamidreza Kamkari","Vahid Zehtab","Vahid Balazadeh","Rahul G. Krishnan"],"pdf_url":"https://arxiv.org/pdf/2308.07480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07470v1","updated":"2023-08-14T21:46:37Z","published":"2023-08-14T21:46:37Z","title":"Symphony: Optimized Model Serving using Centralized Orchestration","summary":" The orchestration of deep neural network (DNN) model inference on GPU\nclusters presents two significant challenges: achieving high accelerator\nefficiency given the batching properties of model inference while meeting\nlatency service level objectives (SLOs), and adapting to workload changes both\nin terms of short-term fluctuations and long-term resource allocation. To\naddress these challenges, we propose Symphony, a centralized scheduling system\nthat can scale to millions of requests per second and coordinate tens of\nthousands of GPUs. Our system utilizes a non-work-conserving scheduling\nalgorithm capable of achieving high batch efficiency while also enabling robust\nautoscaling. 
Additionally, we developed an epoch-scale algorithm that allocates\nmodels to sub-clusters based on the compute and memory needs of the models.\nThrough extensive experiments, we demonstrate that Symphony outperforms prior\nsystems by up to 4.7x higher goodput.\n","authors":["Lequn Chen","Weixin Deng","Anirudh Canumalla","Yu Xin","Matthai Philipose","Arvind Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2308.07470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07469v1","updated":"2023-08-14T21:40:23Z","published":"2023-08-14T21:40:23Z","title":"Omega-Regular Reward Machines","summary":" Reinforcement learning (RL) is a powerful approach for training agents to\nperform tasks, but designing an appropriate reward mechanism is critical to its\nsuccess. However, in many cases, the complexity of the learning objectives goes\nbeyond the capabilities of the Markovian assumption, necessitating a more\nsophisticated reward mechanism. Reward machines and omega-regular languages are\ntwo formalisms used to express non-Markovian rewards for quantitative and\nqualitative objectives, respectively. This paper introduces omega-regular\nreward machines, which integrate reward machines with omega-regular languages\nto enable an expressive and effective reward mechanism for RL. We present a\nmodel-free RL algorithm to compute epsilon-optimal strategies against\nomega-regular reward machines and evaluate the effectiveness of the proposed\nalgorithm through experiments.\n","authors":["Ernst Moritz Hahn","Mateo Perez","Sven Schewe","Fabio Somenzi","Ashutosh Trivedi","Dominik Wojtczak"],"pdf_url":"https://arxiv.org/pdf/2308.07469v1.pdf","comment":"To appear in ECAI-2023"},{"id":"http://arxiv.org/abs/2308.07464v1","updated":"2023-08-14T21:21:03Z","published":"2023-08-14T21:21:03Z","title":"There Is a Digital Art History","summary":" In this paper, we revisit Johanna Drucker's question, \"Is there a digital art\nhistory?\" -- posed exactly a decade ago -- in the light of the emergence of\nlarge-scale, transformer-based vision models. While more traditional types of\nneural networks have long been part of digital art history, and digital\nhumanities projects have recently begun to use transformer models, their\nepistemic implications and methodological affordances have not yet been\nsystematically analyzed. We focus our analysis on two main aspects that,\ntogether, seem to suggest a coming paradigm shift towards a \"digital\" art\nhistory in Drucker's sense. On the one hand, the visual-cultural repertoire\nnewly encoded in large-scale vision models has an outsized effect on digital\nart history. The inclusion of significant numbers of non-photographic images\nallows for the extraction and automation of different forms of visual logics.\nLarge-scale vision models have \"seen\" large parts of the Western visual canon\nmediated by Net visual culture, and they continuously solidify and concretize\nthis canon through their already widespread application in all aspects of\ndigital life. On the other hand, based on two technical case studies of\nutilizing a contemporary large-scale visual model to investigate basic\nquestions from the fields of art history and urbanism, we suggest that such\nsystems require a new critical methodology that takes into account the\nepistemic entanglement of a model and its applications. 
This new methodology\nreads its corpora through a neural model's training data, and vice versa: the\nvisual ideologies of research datasets and training datasets become entangled.\n","authors":["Leonardo Impett","Fabian Offert"],"pdf_url":"https://arxiv.org/pdf/2308.07464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08612v2","updated":"2023-08-14T21:19:08Z","published":"2023-02-16T22:34:28Z","title":"Robust expected improvement for Bayesian optimization","summary":" Bayesian Optimization (BO) links Gaussian Process (GP) surrogates with\nsequential design toward optimizing expensive-to-evaluate black-box functions.\nExample design heuristics, or so-called acquisition functions, like expected\nimprovement (EI), balance exploration and exploitation to furnish global\nsolutions under stringent evaluation budgets. However, they fall short when\nsolving for robust optima, meaning a preference for solutions in a wider domain\nof attraction. Robust solutions are useful when inputs are imprecisely\nspecified, or where a series of solutions is desired. A common mathematical\nprogramming technique in such settings involves an adversarial objective,\nbiasing a local solver away from ``sharp'' troughs. Here we propose a surrogate\nmodeling and active learning technique called robust expected improvement (REI)\nthat ports adversarial methodology into the BO/GP framework. After describing\nthe methods, we illustrate and draw comparisons to several competitors on\nbenchmark synthetic exercises and real problems of varying complexity.\n","authors":["Ryan B. Christianson","Robert B. Gramacy"],"pdf_url":"https://arxiv.org/pdf/2302.08612v2.pdf","comment":"27 pages, 17 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.07452v1","updated":"2023-08-14T20:46:16Z","published":"2023-08-14T20:46:16Z","title":"GRU-D-Weibull: A Novel Real-Time Individualized Endpoint Prediction","summary":" Accurate prediction models for individual-level endpoints and\ntime-to-endpoints are crucial in clinical practice. In this study, we propose a\nnovel approach, GRU-D-Weibull, which combines gated recurrent units with decay\n(GRU-D) to model the Weibull distribution. Our method enables real-time\nindividualized endpoint prediction and population-level risk management. Using\na cohort of 6,879 patients with stage 4 chronic kidney disease (CKD4), we\nevaluated the performance of GRU-D-Weibull in endpoint prediction. The C-index\nof GRU-D-Weibull was ~0.7 at the index date and increased to ~0.77 after 4.3\nyears of follow-up, similar to random survival forest. Our approach achieved an\nabsolute L1-loss of ~1.1 years (SD 0.95) at the CKD4 index date and a minimum\nof ~0.45 years (SD0.3) at 4 years of follow-up, outperforming competing methods\nsignificantly. GRU-D-Weibull consistently constrained the predicted survival\nprobability at the time of an event within a smaller and more fixed range\ncompared to other models throughout the follow-up period. We observed\nsignificant correlations between the error in point estimates and missing\nproportions of input features at the index date (correlations from ~0.1 to\n~0.3), which diminished within 1 year as more data became available. By\npost-training recalibration, we successfully aligned the predicted and observed\nsurvival probabilities across multiple prediction horizons at different time\npoints during follow-up. 
Our findings demonstrate the considerable potential of\nGRU-D-Weibull as the next-generation architecture for endpoint risk management,\ncapable of generating various endpoint estimates for real-time monitoring using\nclinical data.\n","authors":["Xiaoyang Ruan","Liwei Wang","Charat Thongprayoon","Wisit Cheungpasitporn","Hongfang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.07452v1.pdf","comment":"30 pages, 7 figures, 4 supplementary figures"},{"id":"http://arxiv.org/abs/2211.01134v2","updated":"2023-08-14T20:40:23Z","published":"2022-11-02T14:11:25Z","title":"Faster variational quantum algorithms with quantum kernel-based\n surrogate models","summary":" We present a new optimization method for small-to-intermediate scale\nvariational algorithms on noisy near-term quantum processors which uses a\nGaussian process surrogate model equipped with a classically-evaluated quantum\nkernel. Variational algorithms are typically optimized using gradient-based\napproaches however these are difficult to implement on current noisy devices,\nrequiring large numbers of objective function evaluations. Our scheme shifts\nthis computational burden onto the classical optimizer component of these\nhybrid algorithms, greatly reducing the number of queries to the quantum\nprocessor. We focus on the variational quantum eigensolver (VQE) algorithm and\ndemonstrate numerically that such surrogate models are particularly well suited\nto the algorithm's objective function. Next, we apply these models to both\nnoiseless and noisy VQE simulations and show that they exhibit better\nperformance than widely-used classical kernels in terms of final accuracy and\nconvergence speed. Compared to the typically-used stochastic gradient-descent\napproach for VQAs, our quantum kernel-based approach is found to consistently\nachieve significantly higher accuracy while requiring less than an order of\nmagnitude fewer quantum circuit evaluations. We analyse the performance of the\nquantum kernel-based models in terms of the kernels' induced feature spaces and\nexplicitly construct their feature maps. Finally, we describe a scheme for\napproximating the best-performing quantum kernel using a classically-efficient\ntensor network representation of its input state and so provide a pathway for\nscaling these methods to larger systems.\n","authors":["Alistair W. R. Smith","A. J. Paige","M. S. Kim"],"pdf_url":"https://arxiv.org/pdf/2211.01134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07445v1","updated":"2023-08-14T20:34:54Z","published":"2023-08-14T20:34:54Z","title":"Open-set Face Recognition using Ensembles trained on Clustered Data","summary":" Open-set face recognition describes a scenario where unknown subjects, unseen\nduring the training stage, appear on test time. Not only it requires methods\nthat accurately identify individuals of interest, but also demands approaches\nthat effectively deal with unfamiliar faces. This work details a scalable\nopen-set face identification approach to galleries composed of hundreds and\nthousands of subjects. It is composed of clustering and an ensemble of binary\nlearning algorithms that estimates when query face samples belong to the face\ngallery and then retrieves their correct identity. The approach selects the\nmost suitable gallery subjects and uses the ensemble to improve prediction\nperformance. 
We carry out experiments on well-known LFW and YTF benchmarks.\nResults show that competitive performance can be achieved even when targeting\nscalability.\n","authors":["Rafael Henrique Vareto","William Robson Schwartz"],"pdf_url":"https://arxiv.org/pdf/2308.07445v1.pdf","comment":"[Original paper title: Unconstrained Face Identification using\n Ensembles trained on Clustered Data] [2020 IEEE International Joint\n Conference on Biometrics (IJCB)]\n [https://ieeexplore.ieee.org/document/9304882]"},{"id":"http://arxiv.org/abs/2308.07441v1","updated":"2023-08-14T20:26:23Z","published":"2023-08-14T20:26:23Z","title":"Physics-Informed Deep Learning to Reduce the Bias in Joint Prediction of\n Nitrogen Oxides","summary":" Atmospheric nitrogen oxides (NOx) primarily from fuel combustion have\nrecognized acute and chronic health and environmental effects. Machine learning\n(ML) methods have significantly enhanced our capacity to predict NOx\nconcentrations at ground-level with high spatiotemporal resolution but may\nsuffer from high estimation bias since they lack physical and chemical\nknowledge about air pollution dynamics. Chemical transport models (CTMs)\nleverage this knowledge; however, accurate predictions of ground-level\nconcentrations typically necessitate extensive post-calibration. Here, we\npresent a physics-informed deep learning framework that encodes\nadvection-diffusion mechanisms and fluid dynamics constraints to jointly\npredict NO2 and NOx and reduce ML model bias by 21-42%. Our approach captures\nfine-scale transport of NO2 and NOx, generates robust spatial extrapolation,\nand provides explicit uncertainty estimation. The framework fuses\nknowledge-driven physicochemical principles of CTMs with the predictive power\nof ML for air quality exposure, health, and policy applications. Our approach\noffers significant improvements over purely data-driven ML methods and has\nunprecedented bias reduction in joint NO2 and NOx prediction.\n","authors":["Lianfa Li","Roxana Khalili","Frederick Lurmann","Nathan Pavlovic","Jun Wu","Yan Xu","Yisi Liu","Karl O'Sharkey","Beate Ritz","Luke Oman","Meredith Franklin","Theresa Bastain","Shohreh F. Farzan","Carrie Breton","Rima Habre"],"pdf_url":"https://arxiv.org/pdf/2308.07441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16740v2","updated":"2023-08-14T20:24:50Z","published":"2023-06-29T07:31:43Z","title":"Principles and Guidelines for Evaluating Social Robot Navigation\n Algorithms","summary":" A major challenge to deploying robots widely is navigation in human-populated\nenvironments, commonly referred to as social robot navigation. While the field\nof social navigation has advanced tremendously in recent years, the fair\nevaluation of algorithms that tackle social navigation remains hard because it\ninvolves not just robotic agents moving in static environments but also dynamic\nhuman agents and their perceptions of the appropriateness of robot behavior. In\ncontrast, clear, repeatable, and accessible benchmarks have accelerated\nprogress in fields like computer vision, natural language processing and\ntraditional robot navigation by enabling researchers to fairly compare\nalgorithms, revealing limitations of existing solutions and illuminating\npromising new directions. We believe the same approach can benefit social\nnavigation. In this paper, we pave the road towards common, widely accessible,\nand repeatable benchmarking criteria to evaluate social robot navigation. 
Our\ncontributions include (a) a definition of a socially navigating robot as one\nthat respects the principles of safety, comfort, legibility, politeness, social\ncompetency, agent understanding, proactivity, and responsiveness to context,\n(b) guidelines for the use of metrics, development of scenarios, benchmarks,\ndatasets, and simulators to evaluate social navigation, and (c) a design of a\nsocial navigation metrics framework to make it easier to compare results from\ndifferent simulators, robots and datasets.\n","authors":["Anthony Francis","Claudia Perez-D'Arpino","Chengshu Li","Fei Xia","Alexandre Alahi","Rachid Alami","Aniket Bera","Abhijat Biswas","Joydeep Biswas","Rohan Chandra","Hao-Tien Lewis Chiang","Michael Everett","Sehoon Ha","Justin Hart","Jonathan P. How","Haresh Karnan","Tsang-Wei Edward Lee","Luis J. Manso","Reuth Mirksy","Soeren Pirk","Phani Teja Singamaneni","Peter Stone","Ada V. Taylor","Peter Trautman","Nathan Tsoi","Marynel Vazquez","Xuesu Xiao","Peng Xu","Naoki Yokoyama","Alexander Toshev","Roberto Martin-Martin"],"pdf_url":"https://arxiv.org/pdf/2306.16740v2.pdf","comment":"42 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2210.09887v5","updated":"2023-08-14T20:24:24Z","published":"2022-10-18T14:23:05Z","title":"MotionDeltaCNN: Sparse CNN Inference of Frame Differences in Moving\n Camera Videos","summary":" Convolutional neural network inference on video input is computationally\nexpensive and requires high memory bandwidth. Recently, DeltaCNN managed to\nreduce the cost by only processing pixels with significant updates over the\nprevious frame. However, DeltaCNN relies on static camera input. Moving cameras\nadd new challenges in how to fuse newly unveiled image regions with already\nprocessed regions efficiently to minimize the update rate - without increasing\nmemory overhead and without knowing the camera extrinsics of future frames. In\nthis work, we propose MotionDeltaCNN, a sparse CNN inference framework that\nsupports moving cameras. We introduce spherical buffers and padded convolutions\nto enable seamless fusion of newly unveiled regions and previously processed\nregions -- without increasing memory footprint. Our evaluation shows that we\noutperform DeltaCNN by up to 90% for moving camera videos.\n","authors":["Mathias Parger","Chengcheng Tang","Thomas Neff","Christopher D. Twigg","Cem Keskin","Robert Wang","Markus Steinberger"],"pdf_url":"https://arxiv.org/pdf/2210.09887v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07439v1","updated":"2023-08-14T20:20:26Z","published":"2023-08-14T20:20:26Z","title":"Interaction-Aware Personalized Vehicle Trajectory Prediction Using\n Temporal Graph Neural Networks","summary":" Accurate prediction of vehicle trajectories is vital for advanced driver\nassistance systems and autonomous vehicles. Existing methods mainly rely on\ngeneric trajectory predictions derived from large datasets, overlooking the\npersonalized driving patterns of individual drivers. To address this gap, we\npropose an approach for interaction-aware personalized vehicle trajectory\nprediction that incorporates temporal graph neural networks. Our method\nutilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to\nmodel the spatio-temporal interactions between target vehicles and their\nsurrounding traffic. 
To personalize the predictions, we establish a pipeline\nthat leverages transfer learning: the model is initially pre-trained on a\nlarge-scale trajectory dataset and then fine-tuned for each driver using their\nspecific driving data. We employ human-in-the-loop simulation to collect\npersonalized naturalistic driving trajectories and corresponding surrounding\nvehicle trajectories. Experimental results demonstrate the superior performance\nof our personalized GCN-LSTM model, particularly for longer prediction\nhorizons, compared to its generic counterpart. Moreover, the personalized model\noutperforms individual models created without pre-training, emphasizing the\nsignificance of pre-training on a large dataset to avoid overfitting. By\nincorporating personalization, our approach enhances trajectory prediction\naccuracy.\n","authors":["Amr Abdelraouf","Rohit Gupta","Kyungtae Han"],"pdf_url":"https://arxiv.org/pdf/2308.07439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07436v1","updated":"2023-08-14T20:06:19Z","published":"2023-08-14T20:06:19Z","title":"A Hybrid Deep Spatio-Temporal Attention-Based Model for Parkinson's\n Disease Diagnosis Using Resting State EEG Signals","summary":" Parkinson's disease (PD), a severe and progressive neurological illness,\naffects millions of individuals worldwide. For effective treatment and\nmanagement of PD, an accurate and early diagnosis is crucial. This study\npresents a deep learning-based model for the diagnosis of PD using resting\nstate electroencephalogram (EEG) signal. The objective of the study is to\ndevelop an automated model that can extract complex hidden nonlinear features\nfrom EEG and demonstrate its generalizability on unseen data. The model is\ndesigned using a hybrid model, consists of convolutional neural network (CNN),\nbidirectional gated recurrent unit (Bi-GRU), and attention mechanism. The\nproposed method is evaluated on three public datasets (Uc San Diego Dataset,\nPRED-CT, and University of Iowa (UI) dataset), with one dataset used for\ntraining and the other two for evaluation. The results show that the proposed\nmodel can accurately diagnose PD with high performance on both the training and\nhold-out datasets. The model also performs well even when some part of the\ninput information is missing. The results of this work have significant\nimplications for patient treatment and for ongoing investigations into the\nearly detection of Parkinson's disease. The suggested model holds promise as a\nnon-invasive and reliable technique for PD early detection utilizing resting\nstate EEG.\n","authors":["Niloufar Delfan","Mohammadreza Shahsavari","Sadiq Hussain","Robertas Damaševičius","U. Rajendra Acharya"],"pdf_url":"https://arxiv.org/pdf/2308.07436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07424v1","updated":"2023-08-14T19:31:58Z","published":"2023-08-14T19:31:58Z","title":"Addressing Distribution Shift in RTB Markets via Exponential Tilting","summary":" Distribution shift in machine learning models can be a primary cause of\nperformance degradation. This paper delves into the characteristics of these\nshifts, primarily motivated by Real-Time Bidding (RTB) market models. We\nemphasize the challenges posed by class imbalance and sample selection bias,\nboth potent instigators of distribution shifts. This paper introduces the\nExponential Tilt Reweighting Alignment (ExTRA) algorithm, as proposed by Marty\net al. (2023), to address distribution shifts in data. 
The ExTRA method is\ndesigned to determine the importance weights on the source data, aiming to\nminimize the KL divergence between the weighted source and target datasets. A\nnotable advantage of this method is its ability to operate using labeled source\ndata and unlabeled target data. Through simulated real-world data, we\ninvestigate the nature of distribution shift and evaluate the applicability of the\nproposed model.\n","authors":["Minji Kim","Seong Jin Lee","Bumsik Kim"],"pdf_url":"https://arxiv.org/pdf/2308.07424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06239v2","updated":"2023-08-14T19:26:43Z","published":"2023-08-11T17:15:12Z","title":"Private Distribution Learning with Public Data: The View from Sample\n Compression","summary":" We study the problem of private distribution learning with access to public\ndata. In this setup, which we refer to as public-private learning, the learner\nis given public and private samples drawn from an unknown distribution $p$\nbelonging to a class $\\mathcal Q$, with the goal of outputting an estimate of\n$p$ while adhering to privacy constraints (here, pure differential privacy)\nonly with respect to the private samples.\n We show that the public-private learnability of a class $\\mathcal Q$ is\nconnected to the existence of a sample compression scheme for $\\mathcal Q$, as\nwell as to an intermediate notion we refer to as list learning. Leveraging this\nconnection: (1) approximately recovers previous results on Gaussians over\n$\\mathbb R^d$; and (2) leads to new ones, including sample complexity upper\nbounds for arbitrary $k$-mixtures of Gaussians over $\\mathbb R^d$, results for\nagnostic and distribution-shift resistant learners, as well as closure\nproperties for public-private learnability under taking mixtures and products\nof distributions. Finally, via the connection to list learning, we show that\nfor Gaussians in $\\mathbb R^d$, at least $d$ public samples are necessary for\nprivate learnability, which is close to the known upper bound of $d+1$ public\nsamples.\n","authors":["Shai Ben-David","Alex Bie","Clément L. Canonne","Gautam Kamath","Vikrant Singhal"],"pdf_url":"https://arxiv.org/pdf/2308.06239v2.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2308.07421v1","updated":"2023-08-14T19:21:28Z","published":"2023-08-14T19:21:28Z","title":"U-Turn Diffusion","summary":" We present a comprehensive examination of score-based diffusion models of AI\nfor generating synthetic images. These models hinge upon a dynamic auxiliary\ntime mechanism driven by stochastic differential equations, wherein the score\nfunction is acquired from input images. Our investigation unveils a criterion\nfor evaluating efficiency of the score-based diffusion models: the power of the\ngenerative process depends on the ability to de-construct fast correlations\nduring the reverse/de-noising phase. To improve the quality of the produced\nsynthetic images, we introduce an approach coined \"U-Turn Diffusion\". The\nU-Turn Diffusion technique starts with the standard forward diffusion process,\nalbeit with a condensed duration compared to conventional settings.\nSubsequently, we execute the standard reverse dynamics, initialized with the\nconcluding configuration from the forward process. This U-Turn Diffusion\nprocedure, combining forward, U-turn, and reverse processes, creates a\nsynthetic image approximating an independent and identically distributed\n(i.i.d.) sample from the probability distribution implicitly described via\ninput samples. 
To analyze relevant time scales we employ various analytical\ntools, including auto-correlation analysis, weighted norm of the score-function\nanalysis, and Kolmogorov-Smirnov Gaussianity test. The tools guide us to\nestablishing that the Kernel Intersection Distance, a metric comparing the\nquality of synthetic samples with real data samples, is minimized at the\noptimal U-turn time.\n","authors":["Hamidreza Behjoo","Michael Chertkov"],"pdf_url":"https://arxiv.org/pdf/2308.07421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07418v1","updated":"2023-08-14T19:12:40Z","published":"2023-08-14T19:12:40Z","title":"Locally Adaptive and Differentiable Regression","summary":" Over-parameterized models like deep nets and random forests have become very\npopular in machine learning. However, the natural goals of continuity and\ndifferentiability, common in regression models, are now often ignored in modern\noverparametrized, locally-adaptive models. We propose a general framework to\nconstruct a global continuous and differentiable model based on a weighted\naverage of locally learned models in corresponding local regions. This model is\ncompetitive in dealing with data with different densities or scales of function\nvalues in different local regions. We demonstrate that when we mix kernel ridge\nand polynomial regression terms in the local models, and stitch them together\ncontinuously, we achieve faster statistical convergence in theory and improved\nperformance in various practical settings.\n","authors":["Mingxuan Han","Varun Shankar","Jeff M Phillips","Chenglong Ye"],"pdf_url":"https://arxiv.org/pdf/2308.07418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07395v1","updated":"2023-08-14T18:28:04Z","published":"2023-08-14T18:28:04Z","title":"Text Injection for Capitalization and Turn-Taking Prediction in Speech\n Models","summary":" Text injection for automatic speech recognition (ASR), wherein unpaired\ntext-only data is used to supplement paired audio-text data, has shown\npromising improvements for word error rate. This study examines the use of text\ninjection for auxiliary tasks, which are the non-ASR tasks often performed by\nan E2E model. In this work, we use joint end-to-end and internal language model\ntraining (JEIT) as our text injection algorithm to train an ASR model which\nperforms two auxiliary tasks. The first is capitalization, which is a\nde-normalization task. The second is turn-taking prediction, which attempts to\nidentify whether a user has completed their conversation turn in a digital\nassistant interaction. We show results demonstrating that our text injection\nmethod boosts capitalization performance for long-tail data, and improves\nturn-taking detection recall.\n","authors":["Shaan Bijwadia","Shuo-yiin Chang","Weiran Wang","Zhong Meng","Hao Zhang","Tara N. Sainath"],"pdf_url":"https://arxiv.org/pdf/2308.07395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.17550v2","updated":"2023-08-14T18:18:05Z","published":"2022-10-31T17:59:29Z","title":"Nesterov Meets Optimism: Rate-Optimal Separable Minimax Optimization","summary":" We propose a new first-order optimization algorithm --\nAcceleratedGradient-OptimisticGradient (AG-OG) Descent Ascent -- for separable\nconvex-concave minimax optimization. The main idea of our algorithm is to\ncarefully leverage the structure of the minimax problem, performing Nesterov\nacceleration on the individual component and optimistic gradient on the\ncoupling component. 
Equipped with proper restarting, we show that AG-OG\nachieves the optimal convergence rate (up to a constant) for a variety of\nsettings, including bilinearly coupled strongly convex-strongly concave minimax\noptimization (bi-SC-SC), bilinearly coupled convex-strongly concave minimax\noptimization (bi-C-SC), and bilinear games. We also extend our algorithm to the\nstochastic setting and achieve the optimal convergence rate in both bi-SC-SC\nand bi-C-SC settings. AG-OG is the first single-call algorithm with optimal\nconvergence rates in both deterministic and stochastic settings for bilinearly\ncoupled minimax optimization problems.\n","authors":["Chris Junchi Li","Angela Yuan","Gauthier Gidel","Quanquan Gu","Michael I. Jordan"],"pdf_url":"https://arxiv.org/pdf/2210.17550v2.pdf","comment":"44 pages. This version matches the camera-ready that appeared at ICML\n 2023 under the same title"},{"id":"http://arxiv.org/abs/2308.00924v2","updated":"2023-08-14T18:10:29Z","published":"2023-08-02T03:47:19Z","title":"Continual Domain Adaptation on Aerial Images under Gradually Degrading\n Weather","summary":" Domain adaptation (DA) strives to mitigate the domain gap between the source\ndomain where a model is trained, and the target domain where the model is\ndeployed. When a deep learning model is deployed on an aerial platform, it may\nface gradually degrading weather conditions during operation, leading to\nwidening domain gaps between the training data and the encountered evaluation\ndata. We synthesize two such gradually worsening weather conditions on real\nimages from two existing aerial imagery datasets, generating a total of four\nbenchmark datasets. Under the continual, or test-time adaptation setting, we\nevaluate three DA models on our datasets: a baseline standard DA model and two\ncontinual DA models. In such setting, the models can access only one small\nportion, or one batch of the target data at a time, and adaptation takes place\ncontinually, and over only one epoch of the data. The combination of the\nconstraints of continual adaptation, and gradually deteriorating weather\nconditions provide the practical DA scenario for aerial deployment. Among the\nevaluated models, we consider both convolutional and transformer architectures\nfor comparison. We discover stability issues during adaptation for existing\nbuffer-fed continual DA methods, and offer gradient normalization as a simple\nsolution to curb training instability.\n","authors":["Chowdhury Sadman Jahan","Andreas Savakis"],"pdf_url":"https://arxiv.org/pdf/2308.00924v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07387v1","updated":"2023-08-14T18:09:58Z","published":"2023-08-14T18:09:58Z","title":"DISBELIEVE: Distance Between Client Models is Very Essential for\n Effective Local Model Poisoning Attacks","summary":" Federated learning is a promising direction to tackle the privacy issues\nrelated to sharing patients' sensitive data. Often, federated systems in the\nmedical image analysis domain assume that the participating local clients are\n\\textit{honest}. Several studies report mechanisms through which a set of\nmalicious clients can be introduced that can poison the federated setup,\nhampering the performance of the global model. To overcome this, robust\naggregation methods have been proposed that defend against those attacks. 
We\nobserve that most of the state-of-the-art robust aggregation methods are\nheavily dependent on the distance between the parameters or gradients of\nmalicious clients and benign clients, which makes them prone to local model\npoisoning attacks when the parameters or gradients of malicious and benign\nclients are close. Leveraging this, we introduce DISBELIEVE, a local model\npoisoning attack that creates malicious parameters or gradients such that their\ndistance to benign clients' parameters or gradients is low respectively but at\nthe same time their adverse effect on the global model's performance is high.\nExperiments on three publicly available medical image datasets demonstrate the\nefficacy of the proposed DISBELIEVE attack as it significantly lowers the\nperformance of the state-of-the-art \\textit{robust aggregation} methods for\nmedical image analysis. Furthermore, compared to state-of-the-art local model\npoisoning attacks, DISBELIEVE attack is also effective on natural images where\nwe observe a severe drop in classification performance of the global model for\nmulti-class classification on benchmark dataset CIFAR-10.\n","authors":["Indu Joshi","Priyank Upadhya","Gaurav Kumar Nayak","Peter Schüffler","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2308.07387v1.pdf","comment":"Accepted by MICCAI 2023 - DeCaF"},{"id":"http://arxiv.org/abs/2302.01538v4","updated":"2023-08-14T18:03:24Z","published":"2023-02-03T04:24:49Z","title":"A deep complementary energy method for solid mechanics using minimum\n complementary energy principle","summary":" In recent years, the rapid advancement of deep learning has significantly\nimpacted various fields, particularly in solving partial differential equations\n(PDEs) in solid mechanics, benefiting greatly from the remarkable approximation\ncapabilities of neural networks. In solving PDEs, Physics-Informed Neural\nNetworks (PINNs) and the Deep Energy Method (DEM) have garnered substantial\nattention. The principle of minimum potential energy and complementary energy\nare two important variational principles in solid mechanics. However, DEM is\nbased on the principle of minimum potential energy, but it lacks the important\nform of minimum complementary energy. To bridge this gap, we propose the deep\ncomplementary energy method (DCEM) based on the principle of minimum\ncomplementary energy. The output function of DCEM is the stress function. We\nextend DCEM to DCEM-Plus (DCEM-P), adding terms that satisfy partial\ndifferential equations. Furthermore, we propose a deep complementary energy\noperator method (DCEM-O) by combining operator learning with physical\nequations. We train DCEM-O using existing high-fidelity numerical results and\nthe complementary energy together. We present numerical results using the\nPrandtl and Airy stress functions and compare DCEM with existing PINNs and DEM\nwhen modeling representative mechanical problems. The results demonstrate that\nDCEM outperforms DEM in terms of stress accuracy and efficiency and has an\nadvantage in dealing with complex displacement boundary conditions. DCEM-P and\nDCEM-O further enhance the accuracy and efficiency of DCEM. 
In summary, our\nproposed DCEM marks the first time that complementary energy is extended to the\nenergy-based physics-informed neural network and provides an essential\nsupplementary energy form to the DEM in solid mechanics, offering promising\nresearch prospects in computational mechanics.\n","authors":["Yizheng Wang","Jia Sun","Timon Rabczuk","Pipi Hu","Yinghua Liu"],"pdf_url":"https://arxiv.org/pdf/2302.01538v4.pdf","comment":"58 pages, 30 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.07316v1","updated":"2023-08-14T17:59:31Z","published":"2023-08-14T17:59:31Z","title":"Jurassic World Remake: Bringing Ancient Fossils Back to Life via\n Zero-Shot Long Image-to-Image Translation","summary":" With a strong understanding of the target domain from natural language, we\nproduce promising results in translating across large domain gaps and bringing\nskeletons back to life. In this work, we use text-guided latent diffusion\nmodels for zero-shot image-to-image translation (I2I) across large domain gaps\n(longI2I), where large amounts of new visual features and new geometry need to\nbe generated to enter the target domain. Being able to perform translations\nacross large domain gaps has a wide variety of real-world applications in\ncriminology, astrology, environmental conservation, and paleontology. In this\nwork, we introduce a new task Skull2Animal for translating between skulls and\nliving animals. On this task, we find that unguided Generative Adversarial\nNetworks (GANs) are not capable of translating across large domain gaps.\nInstead of these traditional I2I methods, we explore the use of guided\ndiffusion and image editing models and provide a new benchmark model,\nRevive-2I, capable of performing zero-shot I2I via text-prompting latent\ndiffusion models. We find that guidance is necessary for longI2I because, to\nbridge the large domain gap, prior knowledge about the target domain is needed.\nIn addition, we find that prompting provides the best and most scalable\ninformation about the target domain as classifier-guided diffusion models\nrequire retraining for specific use cases and lack stronger constraints on the\ntarget domain because of the wide variety of images they are trained on.\n","authors":["Alexander Martin","Haitian Zheng","Jie An","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2308.07316v1.pdf","comment":"9 pages, 10 figures, ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2304.09571v2","updated":"2023-08-14T15:15:46Z","published":"2023-04-19T11:19:10Z","title":"SLIC: Large Receptive Field Learning with Self-Conditioned Adaptability\n for Learned Image Compression","summary":" Recently, transformers are trending as replacements for CNNs in vision tasks,\nincluding compression. This trend compels us to question the inherent\nlimitations of CNNs compared to transformers and to explore if CNNs can be\nenhanced to achieve the same or even better performance than transformers. We\nwant to design a pure CNN based model for compression as most devices are\noptimized for CNNs well. In our analysis, we find that the key strengths of\ntransformers lie in their dynamic weights and large receptive fields. To enable\nCNNs with such properties, we propose a novel transform module with large\nreceptive filed learning and self-conditioned adaptability for learned image\ncompression, named SLIC. Specifically, we enlarge the receptive field of\ndepth-wise convolution with suitable complexity and generate the weights\naccording to given conditions. 
In addition, we also investigate the\nself-conditioned factor for channels. To prove the effectiveness of our\nproposed transform module, we equip it with existing entropy models ChARM,\nSCCTX, and SWAtten and we obtain models SLIC-ChARM, SLIC-SCCTX, and\nSLIC-SWAtten. Extensive experiments demonstrate our SLIC-ChARM, SLIC-SCCTX, and\nSLIC-SWAtten have significant improvements over corresponding baselines and\nachieve SOTA performances with suitable complexity on 5 test datasets (Kodak,\nTecnick, CLIC 20, CLIC 21, JPEGAI). Code will be available at\nhttps://github.com/JiangWeibeta/SLIC.\n","authors":["Wei Jiang","Peirong Ning","Jiayu Yang","Yongqi Zhai","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2304.09571v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2308.07146v1","updated":"2023-08-14T13:53:18Z","published":"2023-08-14T13:53:18Z","title":"CTP: Towards Vision-Language Continual Pretraining via Compatible\n Momentum Contrast and Topology Preservation","summary":" Vision-Language Pretraining (VLP) has shown impressive results on diverse\ndownstream tasks by offline training on large-scale datasets. Regarding the\ngrowing nature of real-world data, such an offline training paradigm on\never-expanding data is unsustainable, because models lack the continual\nlearning ability to accumulate knowledge constantly. However, most continual\nlearning studies are limited to uni-modal classification and existing\nmulti-modal datasets cannot simulate continual non-stationary data stream\nscenarios. To support the study of Vision-Language Continual Pretraining\n(VLCP), we first contribute a comprehensive and unified benchmark dataset P9D\nwhich contains over one million product image-text pairs from 9 industries. The\ndata from each industry as an independent task supports continual learning and\nconforms to the real-world long-tail nature to simulate pretraining on web\ndata. We comprehensively study the characteristics and challenges of VLCP, and\npropose a new algorithm: Compatible momentum contrast with Topology\nPreservation, dubbed CTP. The compatible momentum model absorbs the knowledge\nof the current and previous-task models to flexibly update the modal feature.\nMoreover, Topology Preservation transfers the knowledge of embedding across\ntasks while preserving the flexibility of feature adjustment. The experimental\nresults demonstrate our method not only achieves superior performance compared\nwith other baselines but also does not bring an expensive training burden.\nDataset and codes are available at https://github.com/KevinLight831/CTP.\n","authors":["Hongguang Zhu","Yunchao Wei","Xiaodan Liang","Chunjie Zhang","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.07146v1.pdf","comment":"Accepted by ICCV 2023. Code: https://github.com/KevinLight831/CTP"},{"id":"http://arxiv.org/abs/2308.07102v1","updated":"2023-08-14T12:30:58Z","published":"2023-08-14T12:30:58Z","title":"Temporal Sentence Grounding in Streaming Videos","summary":" This paper aims to tackle a novel task - Temporal Sentence Grounding in\nStreaming Videos (TSGSV). The goal of TSGSV is to evaluate the relevance\nbetween a video stream and a given sentence query. Unlike regular videos,\nstreaming videos are acquired continuously from a particular source, and are\nalways desired to be processed on-the-fly in many applications such as\nsurveillance and live-stream analysis. 
Thus, TSGSV is challenging since it\nrequires the model to infer without future frames and process long historical\nframes effectively, which is untouched in the early methods. To specifically\naddress the above challenges, we propose two novel methods: (1) a TwinNet\nstructure that enables the model to learn about upcoming events; and (2) a\nlanguage-guided feature compressor that eliminates redundant visual frames and\nreinforces the frames that are relevant to the query. We conduct extensive\nexperiments using ActivityNet Captions, TACoS, and MAD datasets. The results\ndemonstrate the superiority of our proposed methods. A systematic ablation\nstudy also confirms their effectiveness.\n","authors":["Tian Gan","Xiao Wang","Yan Sun","Jianlong Wu","Qingpei Guo","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2308.07102v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.07056v1","updated":"2023-08-14T10:31:29Z","published":"2023-08-14T10:31:29Z","title":"VoxSnap: X-Large Speaker Verification Dataset on Camera","summary":" In this paper, we contribute a novel and extensive dataset for speaker\nverification, which contains noisy 38k identities/1.45M utterances (VoxSnap)\nand relatively cleaned 18k identities/1.02M (VoxSnap-Clean) utterances for\ntraining. Firstly, we collect a 60K+ users' list as well as their avatar and\ndownload their SHORT videos on the YouTube. Then, an automatically pipeline is\ndevised to extract target user's speech segments and videos, which is efficient\nand scalable. To the best of our knowledge, the VoxSnap dataset is the largest\nspeaker recognition dataset. Secondly, we develop a series of experiments based\non VoxSnap-clean together with VoxCeleb2. Our findings highlight a notable\nimprovement in performance, ranging from 15% to 30%, across different backbone\narchitectures, upon integrating our dataset for training. The dataset will be\nreleased SOON~.\n","authors":["Yuke Lin","Xiaoyi Qin","Ming Cheng","Ning Jiang","Guoqing Zhao","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2308.07056v1.pdf","comment":"submit to ICASSP2023"},{"id":"http://arxiv.org/abs/2306.02898v4","updated":"2023-08-14T07:37:27Z","published":"2023-06-05T14:06:24Z","title":"Towards Unified Text-based Person Retrieval: A Large-scale\n Multi-Attribute and Language Search Benchmark","summary":" In this paper, we introduce a large Multi-Attribute and Language Search\ndataset for text-based person retrieval, called MALS, and explore the\nfeasibility of performing pre-training on both attribute recognition and\nimage-text matching tasks in one stone. In particular, MALS contains 1,510,330\nimage-text pairs, which is about 37.5 times larger than prevailing CUHK-PEDES,\nand all images are annotated with 27 attributes. Considering the privacy\nconcerns and annotation costs, we leverage the off-the-shelf diffusion models\nto generate the dataset. To verify the feasibility of learning from the\ngenerated data, we develop a new joint Attribute Prompt Learning and Text\nMatching Learning (APTM) framework, considering the shared knowledge between\nattribute and text. As the name implies, APTM contains an attribute prompt\nlearning stream and a text matching learning stream. (1) The attribute prompt\nlearning leverages the attribute prompts for image-attribute alignment, which\nenhances the text matching learning. (2) The text matching learning facilitates\nthe representation learning on fine-grained details, and in turn, boosts the\nattribute prompt learning. 
Extensive experiments validate the effectiveness of\nthe pre-training on MALS, achieving state-of-the-art retrieval performance via\nAPTM on three challenging real-world benchmarks. In particular, APTM achieves a\nconsistent improvement of +6.96%, +7.68%, and +16.95% Recall@1 accuracy on\nCUHK-PEDES, ICFG-PEDES, and RSTPReid datasets by a clear margin, respectively.\n","authors":["Shuyu Yang","Yinan Zhou","Yaxiong Wang","Yujiao Wu","Li Zhu","Zhedong Zheng"],"pdf_url":"https://arxiv.org/pdf/2306.02898v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05995v2","updated":"2023-08-14T03:27:40Z","published":"2023-08-11T08:03:28Z","title":"Audio is all in one: speech-driven gesture synthetics using WavLM\n pre-trained model","summary":" The generation of co-speech gestures for digital humans is an emerging area\nin the field of virtual human creation. Prior research has made progress by\nusing acoustic and semantic information as input and adopting classify method\nto identify the person's ID and emotion for driving co-speech gesture\ngeneration. However, this endeavour still faces significant challenges. These\nchallenges go beyond the intricate interplay between co-speech gestures, speech\nacoustic, and semantics; they also encompass the complexities associated with\npersonality, emotion, and other obscure but important factors. This paper\nintroduces \"diffmotion-v2,\" a speech-conditional diffusion-based and\nnon-autoregressive transformer-based generative model with WavLM pre-trained\nmodel. It can produce individual and stylized full-body co-speech gestures only\nusing raw speech audio, eliminating the need for complex multimodal processing\nand manually annotated. Firstly, considering that speech audio not only\ncontains acoustic and semantic features but also conveys personality traits,\nemotions, and more subtle information related to accompanying gestures, we\npioneer the adaptation of WavLM, a large-scale pre-trained model, to extract\nlow-level and high-level audio information. Secondly, we introduce an adaptive\nlayer norm architecture in the transformer-based layer to learn the\nrelationship between speech information and accompanying gestures. Extensive\nsubjective evaluation experiments are conducted on the Trinity, ZEGGS, and BEAT\ndatasets to confirm the WavLM and the model's ability to synthesize natural\nco-speech gestures with various styles.\n","authors":["Fan Zhang","Naye Ji","Fuxing Gao","Siyuan Zhao","Zhaohan Wang","Shunman Li"],"pdf_url":"https://arxiv.org/pdf/2308.05995v2.pdf","comment":"10 pages, 5 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.06897v1","updated":"2023-08-14T02:26:49Z","published":"2023-08-14T02:26:49Z","title":"Orthogonal Temporal Interpolation for Zero-Shot Video Recognition","summary":" Zero-shot video recognition (ZSVR) is a task that aims to recognize video\ncategories that have not been seen during the model training process. Recently,\nvision-language models (VLMs) pre-trained on large-scale image-text pairs have\ndemonstrated impressive transferability for ZSVR. To make VLMs applicable to\nthe video domain, existing methods often use an additional temporal learning\nmodule after the image-level encoder to learn the temporal relationships among\nvideo frames. Unfortunately, for video from unseen categories, we observe an\nabnormal phenomenon where the model that uses spatial-temporal feature performs\nmuch worse than the model that removes temporal learning module and uses only\nspatial feature. 
We conjecture that improper temporal modeling on video\ndisrupts the spatial feature of the video. To verify our hypothesis, we propose\nFeature Factorization to retain the orthogonal temporal feature of the video\nand use interpolation to construct refined spatial-temporal feature. The model\nusing appropriately refined spatial-temporal feature performs better than the\none using only spatial feature, which verifies the effectiveness of the\northogonal temporal feature for the ZSVR task. Therefore, an Orthogonal\nTemporal Interpolation module is designed to learn a better refined\nspatial-temporal video feature during training. Additionally, a Matching Loss\nis introduced to improve the quality of the orthogonal temporal feature. We\npropose a model called OTI for ZSVR by employing orthogonal temporal\ninterpolation and the matching loss based on VLMs. The ZSVR accuracies on\npopular video datasets (i.e., Kinetics-600, UCF101 and HMDB51) show that OTI\noutperforms the previous state-of-the-art method by a clear margin.\n","authors":["Yan Zhu","Junbao Zhuo","Bin Ma","Jiajia Geng","Xiaoming Wei","Xiaolin Wei","Shuhui Wang"],"pdf_url":"https://arxiv.org/pdf/2308.06897v1.pdf","comment":null}]},"2023-08-13T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2206.07920v2","updated":"2023-08-13T20:53:43Z","published":"2022-06-16T04:50:00Z","title":"PInKS: Preconditioned Commonsense Inference with Minimal Supervision","summary":" Reasoning with preconditions such as \"glass can be used for drinking water\nunless the glass is shattered\" remains an open problem for language models. The\nmain challenge lies in the scarcity of preconditions data and the model's lack\nof support for such reasoning. We present PInKS, Preconditioned Commonsense\nInference with WeaK Supervision, an improved model for reasoning with\npreconditions through minimum supervision. We show, both empirically and\ntheoretically, that PInKS improves the results on benchmarks focused on\nreasoning with the preconditions of commonsense knowledge (up to 40% Macro-F1\nscores). We further investigate PInKS through PAC-Bayesian informativeness\nanalysis, precision measures, and ablation study.\n","authors":["Ehsan Qasemi","Piyush Khanna","Qiang Ning","Muhao Chen"],"pdf_url":"https://arxiv.org/pdf/2206.07920v2.pdf","comment":"AACL 2022"},{"id":"http://arxiv.org/abs/2104.08712v3","updated":"2023-08-13T20:32:28Z","published":"2021-04-18T04:37:54Z","title":"PaCo: Preconditions Attributed to Commonsense Knowledge","summary":" Humans can seamlessly reason with circumstantial preconditions of commonsense\nknowledge. We understand that a glass is used for drinking water, unless the\nglass is broken or the water is toxic. Despite state-of-the-art (SOTA) language\nmodels' (LMs) impressive performance on inferring commonsense knowledge, it is\nunclear whether they understand the circumstantial preconditions. To address\nthis gap, we propose a novel challenge of reasoning with circumstantial\npreconditions. We collect a dataset, called PaCo, consisting of 12.4 thousand\npreconditions of commonsense statements expressed in natural language. 
Based on\nthis dataset, we create three canonical evaluation tasks and use them to\nexamine the capability of existing LMs to understand situational preconditions.\nOur results reveal a 10-30% gap between machine and human performance on our\ntasks, which shows that reasoning with preconditions is an open challenge.\n","authors":["Ehsan Qasemi","Filip Ilievski","Muhao Chen","Pedro Szekely"],"pdf_url":"https://arxiv.org/pdf/2104.08712v3.pdf","comment":"EMNLP 2022 (Findings)"},{"id":"http://arxiv.org/abs/2308.06834v1","updated":"2023-08-13T19:04:07Z","published":"2023-08-13T19:04:07Z","title":"Diagnostic Reasoning Prompts Reveal the Potential for Large Language\n Model Interpretability in Medicine","summary":" One of the major barriers to using large language models (LLMs) in medicine\nis the perception they use uninterpretable methods to make clinical decisions\nthat are inherently different from the cognitive processes of clinicians. In\nthis manuscript we develop novel diagnostic reasoning prompts to study whether\nLLMs can perform clinical reasoning to accurately form a diagnosis. We find\nthat GPT4 can be prompted to mimic the common clinical reasoning processes of\nclinicians without sacrificing diagnostic accuracy. This is significant because\nan LLM that can use clinical reasoning to provide an interpretable rationale\noffers physicians a means to evaluate whether LLMs can be trusted for patient\ncare. Novel prompting methods have the potential to expose the black box of\nLLMs, bringing them one step closer to safe and effective use in medicine.\n","authors":["Thomas Savage","Ashwin Nayak","Robert Gallo","Ekanath Rangan","Jonathan H Chen"],"pdf_url":"https://arxiv.org/pdf/2308.06834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06828v1","updated":"2023-08-13T18:14:10Z","published":"2023-08-13T18:14:10Z","title":"An Ensemble Approach to Question Classification: Integrating Electra\n Transformer, GloVe, and LSTM","summary":" This paper introduces a novel ensemble approach for question classification\nusing state-of-the-art models -- Electra, GloVe, and LSTM. The proposed model\nis trained and evaluated on the TREC dataset, a well-established benchmark for\nquestion classification tasks. The ensemble model combines the strengths of\nElectra, a transformer-based model for language understanding, GloVe, a global\nvectors for word representation, and LSTM, a recurrent neural network variant,\nproviding a robust and efficient solution for question classification.\nExtensive experiments were carried out to compare the performance of the\nproposed ensemble approach with other cutting-edge models, such as BERT,\nRoBERTa, and DistilBERT. Our results demonstrate that the ensemble model\noutperforms these models across all evaluation metrics, achieving an accuracy\nof 0.8 on the test set. These findings underscore the effectiveness of the\nensemble approach in enhancing the performance of question classification\ntasks, and invite further exploration of ensemble methods in natural language\nprocessing.\n","authors":["Sanad Aburass","Osama Dorgham"],"pdf_url":"https://arxiv.org/pdf/2308.06828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06795v1","updated":"2023-08-13T15:44:39Z","published":"2023-08-13T15:44:39Z","title":"Faithful to Whom? 
Questioning Interpretability Measures in NLP","summary":" A common approach to quantifying model interpretability is to calculate\nfaithfulness metrics based on iteratively masking input tokens and measuring\nhow much the predicted label changes as a result. However, we show that such\nmetrics are generally not suitable for comparing the interpretability of\ndifferent neural text classifiers as the response to masked inputs is highly\nmodel-specific. We demonstrate that iterative masking can produce large\nvariation in faithfulness scores between comparable models, and show that\nmasked samples are frequently outside the distribution seen during training. We\nfurther investigate the impact of adversarial attacks and adversarial training\non faithfulness scores, and demonstrate the relevance of faithfulness measures\nfor analyzing feature salience in text adversarial attacks. Our findings\nprovide new insights into the limitations of current faithfulness metrics and\nkey considerations to utilize them appropriately.\n","authors":["Evan Crothers","Herna Viktor","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2308.06795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13099v3","updated":"2023-08-13T15:06:43Z","published":"2023-03-23T08:30:35Z","title":"Multi-View Zero-Shot Open Intent Induction from Dialogues: Multi Domain\n Batch and Proxy Gradient Transfer","summary":" In Task Oriented Dialogue (TOD) system, detecting and inducing new intents\nare two main challenges to apply the system in the real world. In this paper,\nwe suggest the semantic multi-view model to resolve these two challenges: (1)\nSBERT for General Embedding (GE), (2) Multi Domain Batch (MDB) for dialogue\ndomain knowledge, and (3) Proxy Gradient Transfer (PGT) for cluster-specialized\nsemantic. MDB feeds diverse dialogue datasets to the model at once to tackle\nthe multi-domain problem by learning the multiple domain knowledge. We\nintroduce a novel method PGT, which employs the Siamese network to fine-tune\nthe model with a clustering method directly.Our model can learn how to cluster\ndialogue utterances by using PGT. Experimental results demonstrate that our\nmulti-view model with MDB and PGT significantly improves the Open Intent\nInduction performance compared to baseline systems.\n","authors":["Hyukhun Koh","Haesung Pyun","Nakyeong Yang","Kyomin Jung"],"pdf_url":"https://arxiv.org/pdf/2303.13099v3.pdf","comment":"8 pages, 3 figures, SIGDIAL DSTC 2023 workshop"},{"id":"http://arxiv.org/abs/2308.06788v1","updated":"2023-08-13T15:03:31Z","published":"2023-08-13T15:03:31Z","title":"Modeling the Dashboard Provenance","summary":" Organizations of all kinds, whether public or private, profit-driven or\nnon-profit, and across various industries and sectors, rely on dashboards for\neffective data visualization. However, the reliability and efficacy of these\ndashboards rely on the quality of the visual and data they present. Studies\nshow that less than a quarter of dashboards provide information about their\nsources, which is just one of the expected metadata when provenance is\nseriously considered. Provenance is a record that describes people,\norganizations, entities, and activities that had a role in the production,\ninfluence, or delivery of a piece of data or an object. This paper aims to\nprovide a provenance representation model, that entitles standardization,\nmodeling, generation, capture, and visualization, specifically designed for\ndashboards and its visual and data components. 
The proposed model will offer a\ncomprehensive set of essential provenance metadata that enables users to\nevaluate the quality, consistency, and reliability of the information presented\non dashboards. This will allow a clear and precise understanding of the context\nin which a specific dashboard was developed, ultimately leading to better\ndecision-making.\n","authors":["Johne Jarske","Jorge Rady","Lucia V. L. Filgueiras","Leandro M. Velloso","Tania L. Santos"],"pdf_url":"https://arxiv.org/pdf/2308.06788v1.pdf","comment":"8 pages, 4 figures, one table, to be published in VIS 2023 (Vis +\n Prov) x Domain"},{"id":"http://arxiv.org/abs/2307.06281v3","updated":"2023-08-13T13:12:47Z","published":"2023-07-12T16:23:09Z","title":"MMBench: Is Your Multi-modal Model an All-around Player?","summary":" Large vision-language models have recently achieved remarkable progress,\nexhibiting great perception and reasoning abilities concerning visual\ninformation. However, how to effectively evaluate these large vision-language\nmodels remains a major obstacle, hindering future model development.\nTraditional benchmarks like VQAv2 or COCO Caption provide quantitative\nperformance measurements but suffer from a lack of fine-grained ability\nassessment and non-robust evaluation metrics. Recent subjective benchmarks,\nsuch as OwlEval, offer comprehensive evaluations of a model's abilities by\nincorporating human labor, but they are not scalable and display significant\nbias. In response to these challenges, we propose MMBench, a novel\nmulti-modality benchmark. MMBench methodically develops a comprehensive\nevaluation pipeline, primarily comprised of two elements. The first element is\na meticulously curated dataset that surpasses existing similar benchmarks in\nterms of the number and variety of evaluation questions and abilities. The\nsecond element introduces a novel CircularEval strategy and incorporates the\nuse of ChatGPT. This implementation is designed to convert free-form\npredictions into pre-defined choices, thereby facilitating a more robust\nevaluation of the model's predictions. MMBench is a systematically-designed\nobjective benchmark for robustly evaluating the various abilities of\nvision-language models. We hope MMBench will assist the research community in\nbetter evaluating their models and encourage future advancements in this\ndomain. Project page: https://opencompass.org.cn/mmbench.\n","authors":["Yuan Liu","Haodong Duan","Yuanhan Zhang","Bo Li","Songyang Zhang","Wangbo Zhao","Yike Yuan","Jiaqi Wang","Conghui He","Ziwei Liu","Kai Chen","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2307.06281v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14104v3","updated":"2023-08-13T12:17:51Z","published":"2023-04-27T11:32:48Z","title":"Learning Human-Human Interactions in Images from Weak Textual\n Supervision","summary":" Interactions between humans are diverse and context-dependent, but previous\nworks have treated them as categorical, disregarding the heavy tail of possible\ninteractions. We propose a new paradigm of learning human-human interactions as\nfree text from a single still image, allowing for flexibility in modeling the\nunlimited space of situations and relationships between people. To overcome the\nabsence of data labelled specifically for this task, we use knowledge\ndistillation applied to synthetic caption data produced by a large language\nmodel without explicit supervision. 
We show that the pseudo-labels produced by\nthis procedure can be used to train a captioning model to effectively\nunderstand human-human interactions in images, as measured by a variety of\nmetrics that measure textual and semantic faithfulness and factual groundedness\nof our predictions. We further show that our approach outperforms SOTA image\ncaptioning and situation recognition models on this task. We will release our\ncode and pseudo-labels along with Waldo and Wenda, a manually-curated test set\nfor still image human-human interaction understanding.\n","authors":["Morris Alper","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2304.14104v3.pdf","comment":"To be presented at ICCV 2023. Project webpage:\n https://learning-interactions.github.io"},{"id":"http://arxiv.org/abs/2308.06744v1","updated":"2023-08-13T11:07:55Z","published":"2023-08-13T11:07:55Z","title":"Token-Scaled Logit Distillation for Ternary Weight Generative Language\n Models","summary":" Generative Language Models (GLMs) have shown impressive performance in tasks\nsuch as text generation, understanding, and reasoning. However, the large model\nsize poses challenges for practical deployment. To solve this problem,\nQuantization-Aware Training (QAT) has become increasingly popular. However,\ncurrent QAT methods for generative models have resulted in a noticeable loss of\naccuracy. To counteract this issue, we propose a novel knowledge distillation\nmethod specifically designed for GLMs. Our method, called token-scaled logit\ndistillation, prevents overfitting and provides superior learning from the\nteacher model and ground truth. This research marks the first evaluation of\nternary weight quantization-aware training of large-scale GLMs with less than\n1.0 degradation in perplexity and no loss of accuracy in a reasoning task.\n","authors":["Minsoo Kim","Sihwa Lee","Janghwan Lee","Sukjin Hong","Du-Seong Chang","Wonyong Sung","Jungwook Choi"],"pdf_url":"https://arxiv.org/pdf/2308.06744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03365v2","updated":"2023-08-13T07:02:16Z","published":"2023-08-07T07:39:43Z","title":"Improving Few-shot and Zero-shot Entity Linking with Coarse-to-Fine\n Lexicon-based Retriever","summary":" Few-shot and zero-shot entity linking focus on the tail and emerging\nentities, which are more challenging but closer to real-world scenarios. The\nmainstream method is the ''retrieve and rerank'' two-stage framework. In this\npaper, we propose a coarse-to-fine lexicon-based retriever to retrieve entity\ncandidates in an effective manner, which operates in two layers. The first\nlayer retrieves coarse-grained candidates by leveraging entity names, while the\nsecond layer narrows down the search to fine-grained candidates within the\ncoarse-grained ones. In addition, this second layer utilizes entity\ndescriptions to effectively disambiguate tail or new entities that share names\nwith existing popular entities. Experimental results indicate that our approach\ncan obtain superior performance without requiring extensive finetuning in the\nretrieval stage. 
Notably, our approach ranks the 1st in NLPCC 2023 Shared Task\n6 on Chinese Few-shot and Zero-shot Entity Linking.\n","authors":["Shijue Huang","Bingbing Wang","Libo Qin","Qin Zhao","Ruifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2308.03365v2.pdf","comment":"Accepted to NLPCC2023"},{"id":"http://arxiv.org/abs/2308.06696v1","updated":"2023-08-13T06:29:38Z","published":"2023-08-13T06:29:38Z","title":"MACO: A Modality Adversarial and Contrastive Framework for\n Modality-missing Multi-modal Knowledge Graph Completion","summary":" Recent years have seen significant advancements in multi-modal knowledge\ngraph completion (MMKGC). MMKGC enhances knowledge graph completion (KGC) by\nintegrating multi-modal entity information, thereby facilitating the discovery\nof unobserved triples in the large-scale knowledge graphs (KGs). Nevertheless,\nexisting methods emphasize the design of elegant KGC models to facilitate\nmodality interaction, neglecting the real-life problem of missing modalities in\nKGs. The missing modality information impedes modal interaction, consequently\nundermining the model's performance. In this paper, we propose a modality\nadversarial and contrastive framework (MACO) to solve the modality-missing\nproblem in MMKGC. MACO trains a generator and discriminator adversarially to\ngenerate missing modality features that can be incorporated into the MMKGC\nmodel. Meanwhile, we design a cross-modal contrastive loss to improve the\nperformance of the generator. Experiments on public benchmarks with further\nexplorations demonstrate that MACO could achieve state-of-the-art results and\nserve as a versatile framework to bolster various MMKGC models. Our code and\nbenchmark data are available at https://github.com/zjukg/MACO.\n","authors":["Yichi Zhang","Zhuo Chen","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.06696v1.pdf","comment":"This is the ArXiv version of our paper accepted by NLPCC 2023. The\n code will be released soon"},{"id":"http://arxiv.org/abs/2302.08068v2","updated":"2023-08-13T03:04:08Z","published":"2023-02-16T04:06:25Z","title":"LabelPrompt: Effective Prompt-based Learning for Relation Classification","summary":" Recently, prompt-based learning has gained popularity across many natural\nlanguage processing (NLP) tasks by reformulating them into a cloze-style format\nto better align pre-trained language models (PLMs) with downstream tasks.\nHowever, applying this approach to relation classification poses unique\nchallenges. Specifically, associating natural language words that fill the\nmasked token with semantic relation labels (\\textit{e.g.}\n\\textit{``org:founded\\_by}'') is difficult. To address this challenge, this\npaper presents a novel prompt-based learning method, namely LabelPrompt, for\nthe relation classification task. Motivated by the intuition to ``GIVE MODEL\nCHOICES!'', we first define additional tokens to represent relation labels,\nwhich regard these tokens as the verbaliser with semantic initialisation and\nexplicitly construct them with a prompt template method. Then, to mitigate\ninconsistency between predicted relations and given entities, we implement an\nentity-aware module with contrastive learning. Last, we conduct an attention\nquery strategy within the self-attention layer to differentiates prompt tokens\nand sequence tokens. Together, these strategies enhance the adaptability of\nprompt-based learning, especially when only small labelled datasets is\navailable. 
Comprehensive experiments on benchmark datasets demonstrate the\nsuperiority of our method, particularly in the few-shot scenario.\n","authors":["Wenjie Zhang","Xiaoning Song","Zhenhua Feng","Tianyang Xu","Xiaojun Wu"],"pdf_url":"https://arxiv.org/pdf/2302.08068v2.pdf","comment":"20 pages, 5 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.06866v1","updated":"2023-08-13T23:52:15Z","published":"2023-08-13T23:52:15Z","title":"Improving Face Recognition from Caption Supervision with Multi-Granular\n Contextual Feature Aggregation","summary":" We introduce caption-guided face recognition (CGFR) as a new framework to\nimprove the performance of commercial-off-the-shelf (COTS) face recognition\n(FR) systems. In contrast to combining soft biometrics (eg., facial marks,\ngender, and age) with face images, in this work, we use facial descriptions\nprovided by face examiners as a piece of auxiliary information. However, due to\nthe heterogeneity of the modalities, improving the performance by directly\nfusing the textual and facial features is very challenging, as both lie in\ndifferent embedding spaces. In this paper, we propose a contextual feature\naggregation module (CFAM) that addresses this issue by effectively exploiting\nthe fine-grained word-region interaction and global image-caption association.\nSpecifically, CFAM adopts a self-attention and a cross-attention scheme for\nimproving the intra-modality and inter-modality relationship between the image\nand textual features, respectively. Additionally, we design a textual feature\nrefinement module (TFRM) that refines the textual features of the pre-trained\nBERT encoder by updating the contextual embeddings. This module enhances the\ndiscriminative power of textual features with a cross-modal projection loss and\nrealigns the word and caption embeddings with visual features by incorporating\na visual-semantic alignment loss. We implemented the proposed CGFR framework on\ntwo face recognition models (ArcFace and AdaFace) and evaluated its performance\non the Multi-Modal CelebA-HQ dataset. Our framework significantly improves the\nperformance of ArcFace in both 1:1 verification and 1:N identification\nprotocol.\n","authors":["Md Mahedi Hasan","Nasser Nasrabadi"],"pdf_url":"https://arxiv.org/pdf/2308.06866v1.pdf","comment":"This article has been accepted for publication in the IEEE\n International Joint Conference on Biometrics (IJCB), 2023"},{"id":"http://arxiv.org/abs/2308.06861v1","updated":"2023-08-13T23:33:33Z","published":"2023-08-13T23:33:33Z","title":"Manifold DivideMix: A Semi-Supervised Contrastive Learning Framework for\n Severe Label Noise","summary":" Deep neural networks have proven to be highly effective when large amounts of\ndata with clean labels are available. However, their performance degrades when\ntraining data contains noisy labels, leading to poor generalization on the test\nset. Real-world datasets contain noisy label samples that either have similar\nvisual semantics to other classes (in-distribution) or have no semantic\nrelevance to any class (out-of-distribution) in the dataset. Most\nstate-of-the-art methods leverage ID labeled noisy samples as unlabeled data\nfor semi-supervised learning, but OOD labeled noisy samples cannot be used in\nthis way because they do not belong to any class within the dataset. Hence, in\nthis paper, we propose incorporating the information from all the training data\nby leveraging the benefits of self-supervised training. 
Our method aims to\nextract a meaningful and generalizable embedding space for each sample\nregardless of its label. Then, we employ a simple yet effective K-nearest\nneighbor method to remove portions of out-of-distribution samples. By\ndiscarding these samples, we propose an iterative \"Manifold DivideMix\"\nalgorithm to find clean and noisy samples, and train our model in a\nsemi-supervised way. In addition, we propose \"MixEMatch\", a new algorithm for\nthe semi-supervised step that involves mixup augmentation at the input and\nfinal hidden representations of the model. This will extract better\nrepresentations by interpolating both in the input and manifold spaces.\nExtensive experiments on multiple synthetic-noise image benchmarks and\nreal-world web-crawled datasets demonstrate the effectiveness of our proposed\nframework. Code is available at https://github.com/Fahim-F/ManifoldDivideMix.\n","authors":["Fahimeh Fooladgar","Minh Nguyen Nhat To","Parvin Mousavi","Purang Abolmaesumi"],"pdf_url":"https://arxiv.org/pdf/2308.06861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06853v1","updated":"2023-08-13T22:14:01Z","published":"2023-08-13T22:14:01Z","title":"UGC Quality Assessment: Exploring the Impact of Saliency in Deep\n Feature-Based Quality Assessment","summary":" The volume of User Generated Content (UGC) has increased in recent years. The\nchallenge with this type of content is assessing its quality. So far, the\nstate-of-the-art metrics are not exhibiting a very high correlation with\nperceptual quality. In this paper, we explore state-of-the-art metrics that\nextract/combine natural scene statistics and deep neural network features. We\nexperiment with these by introducing saliency maps to improve perceptibility.\nWe train and test our models using public datasets, namely, YouTube-UGC and\nKoNViD-1k. Preliminary results indicate that high correlations are achieved by\nusing only deep features while adding saliency is not always boosting the\nperformance. Our results and code will be made publicly available to serve as a\nbenchmark for the research community and can be found on our project page:\nhttps://github.com/xinyiW915/SPIE-2023-Supplementary.\n","authors":["Xinyi Wang","Angeliki Katsenou","David Bull"],"pdf_url":"https://arxiv.org/pdf/2308.06853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.04083v2","updated":"2023-08-13T20:49:39Z","published":"2022-04-08T14:01:41Z","title":"POSTER: A Pyramid Cross-Fusion Transformer Network for Facial Expression\n Recognition","summary":" Facial expression recognition (FER) is an important task in computer vision,\nhaving practical applications in areas such as human-computer interaction,\neducation, healthcare, and online monitoring. In this challenging FER task,\nthere are three key issues especially prevalent: inter-class similarity,\nintra-class discrepancy, and scale sensitivity. While existing works typically\naddress some of these issues, none have fully addressed all three challenges in\na unified framework. In this paper, we propose a two-stream Pyramid\ncrOss-fuSion TransformER network (POSTER), that aims to holistically solve all\nthree issues. Specifically, we design a transformer-based cross-fusion method\nthat enables effective collaboration of facial landmark features and image\nfeatures to maximize proper attention to salient facial regions. Furthermore,\nPOSTER employs a pyramid structure to promote scale invariance. 
Extensive\nexperimental results demonstrate that our POSTER achieves new state-of-the-art\nresults on RAF-DB (92.05%), FERPlus (91.62%), as well as AffectNet 7 class\n(67.31%) and 8 class (63.34%). The code is available at\nhttps://github.com/zczcwh/POSTER.\n","authors":["Ce Zheng","Matias Mendieta","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2204.04083v2.pdf","comment":"ICCV Workshop (AMFG) 2023"},{"id":"http://arxiv.org/abs/2303.16874v2","updated":"2023-08-13T20:11:23Z","published":"2023-03-29T17:30:53Z","title":"CheckerPose: Progressive Dense Keypoint Localization for Object Pose\n Estimation with Graph Neural Network","summary":" Estimating the 6-DoF pose of a rigid object from a single RGB image is a\ncrucial yet challenging task. Recent studies have shown the great potential of\ndense correspondence-based solutions, yet improvements are still needed to\nreach practical deployment. In this paper, we propose a novel pose estimation\nalgorithm named CheckerPose, which improves on three main aspects. Firstly,\nCheckerPose densely samples 3D keypoints from the surface of the 3D object and\nfinds their 2D correspondences progressively in the 2D image. Compared to\nprevious solutions that conduct dense sampling in the image space, our strategy\nenables the correspondence searching in a 2D grid (i.e., pixel coordinate).\nSecondly, for our 3D-to-2D correspondence, we design a compact binary code\nrepresentation for 2D image locations. This representation not only allows for\nprogressive correspondence refinement but also converts the correspondence\nregression to a more efficient classification problem. Thirdly, we adopt a\ngraph neural network to explicitly model the interactions among the sampled 3D\nkeypoints, further boosting the reliability and accuracy of the\ncorrespondences. Together, these novel components make CheckerPose a strong\npose estimation algorithm. When evaluated on the popular Linemod, Linemod-O,\nand YCB-V object pose estimation benchmarks, CheckerPose clearly boosts the\naccuracy of correspondence-based methods and achieves state-of-the-art\nperformances. Code is available at https://github.com/RuyiLian/CheckerPose.\n","authors":["Ruyi Lian","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2303.16874v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.06821v1","updated":"2023-08-13T17:30:32Z","published":"2023-08-13T17:30:32Z","title":"Optimizing Brain Tumor Classification: A Comprehensive Study on Transfer\n Learning and Imbalance Handling in Deep Learning Models","summary":" Deep learning has emerged as a prominent field in recent literature,\nshowcasing the introduction of models that utilize transfer learning to achieve\nremarkable accuracies in the classification of brain tumor MRI images. However,\nthe majority of these proposals primarily focus on balanced datasets,\nneglecting the inherent data imbalance present in real-world scenarios.\nConsequently, there is a pressing need for approaches that not only address the\ndata imbalance but also prioritize precise classification of brain cancer. In\nthis work, we present a novel deep learning-based approach, called Transfer\nLearning-CNN, for brain tumor classification using MRI data. 
The proposed model\nleverages the predictive capabilities of existing publicly available models by\nutilizing their pre-trained weights and transferring those weights to the CNN.\nBy leveraging a publicly available Brain MRI dataset, the experiment evaluated\nvarious transfer learning models for classifying different tumor types,\nincluding meningioma, glioma, and pituitary tumors. We investigate the impact\nof different loss functions, including focal loss, and oversampling methods,\nsuch as SMOTE and ADASYN, in addressing the data imbalance issue. Notably, the\nproposed strategy, which combines VGG-16 and CNN, achieved an impressive\naccuracy rate of 96%, surpassing alternative approaches significantly.\n","authors":["Raza Imam","Mohammed Talha Alam"],"pdf_url":"https://arxiv.org/pdf/2308.06821v1.pdf","comment":"Our code is available at\n https://github.com/Razaimam45/AI701-Project-Transfer-Learning-approach-for-imbalance-classification-of-Brain-Tumor-MRI-"},{"id":"http://arxiv.org/abs/2301.06719v5","updated":"2023-08-13T17:25:45Z","published":"2023-01-17T06:24:08Z","title":"FemtoDet: An Object Detection Baseline for Energy Versus Performance\n Tradeoffs","summary":" Efficient detectors for edge devices are often optimized for parameters or\nspeed count metrics, which remain in weak correlation with the energy of\ndetectors.\n However, some vision applications of convolutional neural networks, such as\nalways-on surveillance cameras, are critical for energy constraints.\n This paper aims to serve as a baseline by designing detectors to reach\ntradeoffs between energy and performance from two perspectives:\n 1) We extensively analyze various CNNs to identify low-energy architectures,\nincluding selecting activation functions, convolutions operators, and feature\nfusion structures on necks. These underappreciated details in past work\nseriously affect the energy consumption of detectors;\n 2) To break through the dilemmatic energy-performance problem, we propose a\nbalanced detector driven by energy using discovered low-energy components named\n\\textit{FemtoDet}.\n In addition to the novel construction, we improve FemtoDet by considering\nconvolutions and training strategy optimizations.\n Specifically, we develop a new instance boundary enhancement (IBE) module for\nconvolution optimization to overcome the contradiction between the limited\ncapacity of CNNs and detection tasks in diverse spatial representations, and\npropose a recursive warm-restart (RecWR) for optimizing training strategy to\nescape the sub-optimization of light-weight detectors by considering the data\nshift produced in popular augmentations.\n As a result, FemtoDet with only 68.77k parameters achieves a competitive\nscore of 46.3 AP50 on PASCAL VOC and 1.11 W $\\&$ 64.47 FPS on Qualcomm\nSnapdragon 865 CPU platforms.\n Extensive experiments on COCO and TJU-DHD datasets indicate that the proposed\nmethod achieves competitive results in diverse scenes.\n","authors":["Peng Tu","Xu Xie","Guo AI","Yuexiang Li","Yawen Huang","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2301.06719v5.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2211.14860v3","updated":"2023-08-13T16:37:17Z","published":"2022-11-27T15:29:39Z","title":"Foiling Explanations in Deep Neural Networks","summary":" Deep neural networks (DNNs) have greatly impacted numerous fields over the\npast decade. 
Yet despite exhibiting superb performance over many problems,\ntheir black-box nature still poses a significant challenge with respect to\nexplainability. Indeed, explainable artificial intelligence (XAI) is crucial in\nseveral fields, wherein the answer alone -- sans a reasoning of how said answer\nwas derived -- is of little value. This paper uncovers a troubling property of\nexplanation methods for image-based DNNs: by making small visual changes to the\ninput image -- hardly influencing the network's output -- we demonstrate how\nexplanations may be arbitrarily manipulated through the use of evolution\nstrategies. Our novel algorithm, AttaXAI, a model-agnostic, adversarial attack\non XAI algorithms, only requires access to the output logits of a classifier\nand to the explanation map; these weak assumptions render our approach highly\nuseful where real-world models and data are concerned. We compare our method's\nperformance on two benchmark datasets -- CIFAR100 and ImageNet -- using four\ndifferent pretrained deep-learning models: VGG16-CIFAR100, VGG16-ImageNet,\nMobileNet-CIFAR100, and Inception-v3-ImageNet. We find that the XAI methods can\nbe manipulated without the use of gradients or other model internals. Our novel\nalgorithm is successfully able to manipulate an image in a manner imperceptible\nto the human eye, such that the XAI method outputs a specific explanation map.\nTo our knowledge, this is the first such method in a black-box setting, and we\nbelieve it has significant value where explainability is desired, required, or\nlegally mandatory.\n","authors":["Snir Vitrack Tamam","Raz Lapid","Moshe Sipper"],"pdf_url":"https://arxiv.org/pdf/2211.14860v3.pdf","comment":"Snir Vitrack Tamam and Raz Lapid contributed equally"},{"id":"http://arxiv.org/abs/2208.12697v5","updated":"2023-08-13T15:52:52Z","published":"2022-08-26T14:48:02Z","title":"Voxurf: Voxel-based Efficient and Accurate Neural Surface Reconstruction","summary":" Neural surface reconstruction aims to reconstruct accurate 3D surfaces based\non multi-view images. Previous methods based on neural volume rendering mostly\ntrain a fully implicit model with MLPs, which typically require hours of\ntraining for a single scene. Recent efforts explore the explicit volumetric\nrepresentation to accelerate the optimization via memorizing significant\ninformation with learnable voxel grids. However, existing voxel-based methods\noften struggle in reconstructing fine-grained geometry, even when combined with\nan SDF-based volume rendering scheme. We reveal that this is because 1) the\nvoxel grids tend to break the color-geometry dependency that facilitates\nfine-geometry learning, and 2) the under-constrained voxel grids lack spatial\ncoherence and are vulnerable to local minima. In this work, we present Voxurf,\na voxel-based surface reconstruction approach that is both efficient and\naccurate. Voxurf addresses the aforementioned issues via several key designs,\nincluding 1) a two-stage training procedure that attains a coherent coarse\nshape and recovers fine details successively, 2) a dual color network that\nmaintains color-geometry dependency, and 3) a hierarchical geometry feature to\nencourage information propagation across voxels. Extensive experiments show\nthat Voxurf achieves high efficiency and high quality at the same time. On the\nDTU benchmark, Voxurf achieves higher reconstruction quality with a 20x\ntraining speedup compared to previous fully implicit methods. 
Our code is\navailable at https://github.com/wutong16/Voxurf.\n","authors":["Tong Wu","Jiaqi Wang","Xingang Pan","Xudong Xu","Christian Theobalt","Ziwei Liu","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2208.12697v5.pdf","comment":"ICLR 2023 Spotlight. Our code is available at\n https://github.com/wutong16/Voxurf"},{"id":"http://arxiv.org/abs/2308.06796v1","updated":"2023-08-13T15:45:08Z","published":"2023-08-13T15:45:08Z","title":"Modified Topological Image Preprocessing for Skin Lesion Classifications","summary":" This paper proposes a modified Topological Data Analysis model for skin\nimages preprocessing and enhancements. The skin lesion dataset HAM10000 used\nwith the intention of identifying the important objects in relevant regions of\nthe images. In order to evaluate both the original dataset and the preprocessed\ndataset, Deep Convolutional Neural Network and Vision Transformer models were\nutilized to train both models. After training, the experimental results\ndemonstrate that the images preprocessed using the Modified Topological Data\nAnalysis consistently perform better.\n","authors":["Hong Cheng","Rebekah Leamons","Ahmad Al Shami"],"pdf_url":"https://arxiv.org/pdf/2308.06796v1.pdf","comment":"Presented at CSCE 2022, The 2022 World Congress in Computer Science,\n Computer Engineering & Applied Computing, July 25-28, 2022, Las Vegas, USA"},{"id":"http://arxiv.org/abs/2305.09533v3","updated":"2023-08-13T15:31:58Z","published":"2023-05-16T15:26:09Z","title":"NightHazeFormer: Single Nighttime Haze Removal Using Prior Query\n Transformer","summary":" Nighttime image dehazing is a challenging task due to the presence of\nmultiple types of adverse degrading effects including glow, haze, blurry,\nnoise, color distortion, and so on. However, most previous studies mainly focus\non daytime image dehazing or partial degradations presented in nighttime hazy\nscenes, which may lead to unsatisfactory restoration results. In this paper, we\npropose an end-to-end transformer-based framework for nighttime haze removal,\ncalled NightHazeFormer. Our proposed approach consists of two stages:\nsupervised pre-training and semi-supervised fine-tuning. During the\npre-training stage, we introduce two powerful priors into the transformer\ndecoder to generate the non-learnable prior queries, which guide the model to\nextract specific degradations. For the fine-tuning, we combine the generated\npseudo ground truths with input real-world nighttime hazy images as paired\nimages and feed into the synthetic domain to fine-tune the pre-trained model.\nThis semi-supervised fine-tuning paradigm helps improve the generalization to\nreal domain. In addition, we also propose a large-scale synthetic dataset\ncalled UNREAL-NH, to simulate the real-world nighttime haze scenarios\ncomprehensively. Extensive experiments on several synthetic and real-world\ndatasets demonstrate the superiority of our NightHazeFormer over\nstate-of-the-art nighttime haze removal methods in terms of both visually and\nquantitatively.\n","authors":["Yun Liu","Zhongsheng Yan","Sixiang Chen","Tian Ye","Wenqi Ren","Erkang Chen"],"pdf_url":"https://arxiv.org/pdf/2305.09533v3.pdf","comment":"10 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.06791v1","updated":"2023-08-13T15:30:02Z","published":"2023-08-13T15:30:02Z","title":"PV-SSD: A Projection and Voxel-based Double Branch Single-Stage 3D\n Object Detector","summary":" LIDAR-based 3D object detection and classification is crucial for autonomous\ndriving. 
However, inference in real-time from extremely sparse 3D data poses a\nformidable challenge. To address this issue, a common approach is to project\npoint clouds onto a bird's-eye or perspective view, effectively converting them\ninto an image-like data format. However, this excessive compression of point\ncloud data often leads to the loss of information. This paper proposes a 3D\nobject detector based on voxel and projection double branch feature extraction\n(PV-SSD) to address the problem of information loss. We add voxel features\ninput containing rich local semantic information, which is fully fused with the\nprojected features in the feature extraction stage to reduce the local\ninformation loss caused by projection. A good performance is achieved compared\nto the previous work. In addition, this paper makes the following\ncontributions: 1) a voxel feature extraction method with variable receptive\nfields is proposed; 2) a feature point sampling method by weight sampling is\nused to filter out the feature points that are more conducive to the detection\ntask; 3) the MSSFA module is proposed based on the SSFA module. To verify the\neffectiveness of our method, we designed comparison experiments.\n","authors":["Yongxin Shao","Aihong Tan","Zhetao Sun","Enhui Zheng","Tianhong Yan"],"pdf_url":"https://arxiv.org/pdf/2308.06791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11077v2","updated":"2023-08-13T15:23:43Z","published":"2023-07-20T17:55:14Z","title":"AlignDet: Aligning Pre-training and Fine-tuning in Object Detection","summary":" The paradigm of large-scale pre-training followed by downstream fine-tuning\nhas been widely employed in various object detection algorithms. In this paper,\nwe reveal discrepancies in data, model, and task between the pre-training and\nfine-tuning procedure in existing practices, which implicitly limit the\ndetector's performance, generalization ability, and convergence speed. To this\nend, we propose AlignDet, a unified pre-training framework that can be adapted\nto various existing detectors to alleviate the discrepancies. AlignDet\ndecouples the pre-training process into two stages, i.e., image-domain and\nbox-domain pre-training. The image-domain pre-training optimizes the detection\nbackbone to capture holistic visual abstraction, and box-domain pre-training\nlearns instance-level semantics and task-aware concepts to initialize the parts\nout of the backbone. By incorporating the self-supervised pre-trained\nbackbones, we can pre-train all modules for various detectors in an\nunsupervised paradigm. As depicted in Figure 1, extensive experiments\ndemonstrate that AlignDet can achieve significant improvements across diverse\nprotocols, such as detection algorithm, model backbone, data setting, and\ntraining schedule. For example, AlignDet improves FCOS by 5.3 mAP, RetinaNet by\n2.1 mAP, Faster R-CNN by 3.3 mAP, and DETR by 2.3 mAP under fewer epochs.\n","authors":["Ming Li","Jie Wu","Xionghui Wang","Chen Chen","Jie Qin","Xuefeng Xiao","Rui Wang","Min Zheng","Xin Pan"],"pdf_url":"https://arxiv.org/pdf/2307.11077v2.pdf","comment":"Camera Ready Version on ICCV 2023. Code and Models are publicly\n available. 
Project Page: https://liming-ai.github.io/AlignDet"},{"id":"http://arxiv.org/abs/2308.06787v1","updated":"2023-08-13T14:59:27Z","published":"2023-08-13T14:59:27Z","title":"RMP-Loss: Regularizing Membrane Potential Distribution for Spiking\n Neural Networks","summary":" Spiking Neural Networks (SNNs) as one of the biology-inspired models have\nreceived much attention recently. It can significantly reduce energy\nconsumption since they quantize the real-valued membrane potentials to 0/1\nspikes to transmit information thus the multiplications of activations and\nweights can be replaced by additions when implemented on hardware. However,\nthis quantization mechanism will inevitably introduce quantization error, thus\ncausing catastrophic information loss. To address the quantization error\nproblem, we propose a regularizing membrane potential loss (RMP-Loss) to adjust\nthe distribution which is directly related to quantization error to a range\nclose to the spikes. Our method is extremely simple to implement and\nstraightforward to train an SNN. Furthermore, it is shown to consistently\noutperform previous state-of-the-art methods over different network\narchitectures and datasets.\n","authors":["Yufei Guo","Xiaode Liu","Yuanpei Chen","Liwen Zhang","Weihang Peng","Yuhan Zhang","Xuhui Huang","Zhe Ma"],"pdf_url":"https://arxiv.org/pdf/2308.06787v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.06781v1","updated":"2023-08-13T14:27:28Z","published":"2023-08-13T14:27:28Z","title":"Shape-guided Conditional Latent Diffusion Models for Synthesising Brain\n Vasculature","summary":" The Circle of Willis (CoW) is the part of cerebral vasculature responsible\nfor delivering blood to the brain. Understanding the diverse anatomical\nvariations and configurations of the CoW is paramount to advance research on\ncerebrovascular diseases and refine clinical interventions. However,\ncomprehensive investigation of less prevalent CoW variations remains\nchallenging because of the dominance of a few commonly occurring\nconfigurations. We propose a novel generative approach utilising a conditional\nlatent diffusion model with shape and anatomical guidance to generate realistic\n3D CoW segmentations, including different phenotypical variations. Our\nconditional latent diffusion model incorporates shape guidance to better\npreserve vessel continuity and demonstrates superior performance when compared\nto alternative generative models, including conditional variants of 3D GAN and\n3D VAE. We observed that our model generated CoW variants that are more\nrealistic and demonstrate higher visual fidelity than competing approaches with\nan FID score 53\\% better than the best-performing GAN-based model.\n","authors":["Yash Deo","Haoran Dou","Nishant Ravikumar","Alejandro F. Frangi","Toni Lassila"],"pdf_url":"https://arxiv.org/pdf/2308.06781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06780v1","updated":"2023-08-13T14:25:54Z","published":"2023-08-13T14:25:54Z","title":"Neural Networks at a Fraction with Pruned Quaternions","summary":" Contemporary state-of-the-art neural networks have increasingly large numbers\nof parameters, which prevents their deployment on devices with limited\ncomputational power. Pruning is one technique to remove unnecessary weights and\nreduce resource requirements for training and inference. 
In addition, for ML\ntasks where the input data is multi-dimensional, using higher-dimensional data\nembeddings such as complex numbers or quaternions has been shown to reduce the\nparameter count while maintaining accuracy. In this work, we conduct pruning on\nreal and quaternion-valued implementations of different architectures on\nclassification tasks. We find that for some architectures, at very high\nsparsity levels, quaternion models provide higher accuracies than their real\ncounterparts. For example, at the task of image classification on CIFAR-10\nusing Conv-4, at $3\\%$ of the number of parameters as the original model, the\npruned quaternion version outperforms the pruned real by more than $10\\%$.\nExperiments on various network architectures and datasets show that for\ndeployment in extremely resource-constrained environments, a sparse quaternion\nnetwork might be a better candidate than a real sparse model of similar\narchitecture.\n","authors":["Sahel Mohammad Iqbal","Subhankar Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.06780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06777v1","updated":"2023-08-13T14:05:24Z","published":"2023-08-13T14:05:24Z","title":"Shrinking Class Space for Enhanced Certainty in Semi-Supervised Learning","summary":" Semi-supervised learning is attracting blooming attention, due to its success\nin combining unlabeled data. To mitigate potentially incorrect pseudo labels,\nrecent frameworks mostly set a fixed confidence threshold to discard uncertain\nsamples. This practice ensures high-quality pseudo labels, but incurs a\nrelatively low utilization of the whole unlabeled set. In this work, our key\ninsight is that these uncertain samples can be turned into certain ones, as\nlong as the confusion classes for the top-1 class are detected and removed.\nInvoked by this, we propose a novel method dubbed ShrinkMatch to learn\nuncertain samples. For each uncertain sample, it adaptively seeks a shrunk\nclass space, which merely contains the original top-1 class, as well as\nremaining less likely classes. Since the confusion ones are removed in this\nspace, the re-calculated top-1 confidence can satisfy the pre-defined\nthreshold. We then impose a consistency regularization between a pair of\nstrongly and weakly augmented samples in the shrunk space to strive for\ndiscriminative representations. Furthermore, considering the varied reliability\namong uncertain samples and the gradually improved model during training, we\ncorrespondingly design two reweighting principles for our uncertain loss. Our\nmethod exhibits impressive performance on widely adopted benchmarks. Code is\navailable at https://github.com/LiheYoung/ShrinkMatch.\n","authors":["Lihe Yang","Zhen Zhao","Lei Qi","Yu Qiao","Yinghuan Shi","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.06777v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.06776v1","updated":"2023-08-13T14:04:46Z","published":"2023-08-13T14:04:46Z","title":"Unsupervised Image Denoising in Real-World Scenarios via\n Self-Collaboration Parallel Generative Adversarial Branches","summary":" Deep learning methods have shown remarkable performance in image denoising,\nparticularly when trained on large-scale paired datasets. 
However, acquiring\nsuch paired datasets for real-world scenarios poses a significant challenge.\nAlthough unsupervised approaches based on generative adversarial networks offer\na promising solution for denoising without paired datasets, they are difficult\nin surpassing the performance limitations of conventional GAN-based\nunsupervised frameworks without significantly modifying existing structures or\nincreasing the computational complexity of denoisers. To address this problem,\nwe propose a SC strategy for multiple denoisers. This strategy can achieve\nsignificant performance improvement without increasing the inference complexity\nof the GAN-based denoising framework. Its basic idea is to iteratively replace\nthe previous less powerful denoiser in the filter-guided noise extraction\nmodule with the current powerful denoiser. This process generates better\nsynthetic clean-noisy image pairs, leading to a more powerful denoiser for the\nnext iteration. This baseline ensures the stability and effectiveness of the\ntraining network. The experimental results demonstrate the superiority of our\nmethod over state-of-the-art unsupervised methods.\n","authors":["Xin Lin","Chao Ren","Xiao Liu","Jie Huang","Yinjie Lei"],"pdf_url":"https://arxiv.org/pdf/2308.06776v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.06774v1","updated":"2023-08-13T14:02:27Z","published":"2023-08-13T14:02:27Z","title":"Dual Meta-Learning with Longitudinally Generalized Regularization for\n One-Shot Brain Tissue Segmentation Across the Human Lifespan","summary":" Brain tissue segmentation is essential for neuroscience and clinical studies.\nHowever, segmentation on longitudinal data is challenging due to dynamic brain\nchanges across the lifespan. Previous researches mainly focus on\nself-supervision with regularizations and will lose longitudinal generalization\nwhen fine-tuning on a specific age group. In this paper, we propose a dual\nmeta-learning paradigm to learn longitudinally consistent representations and\npersist when fine-tuning. Specifically, we learn a plug-and-play feature\nextractor to extract longitudinal-consistent anatomical representations by\nmeta-feature learning and a well-initialized task head for fine-tuning by\nmeta-initialization learning. Besides, two class-aware regularizations are\nproposed to encourage longitudinal consistency. Experimental results on the\niSeg2019 and ADNI datasets demonstrate the effectiveness of our method. Our\ncode is available at https://github.com/ladderlab-xjtu/DuMeta.\n","authors":["Yongheng Sun","Fan Wang","Jun Shu","Haifeng Wang","Li Wang","Deyu Meng","Chunfeng Lian"],"pdf_url":"https://arxiv.org/pdf/2308.06774v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.06767v1","updated":"2023-08-13T13:34:04Z","published":"2023-08-13T13:34:04Z","title":"A Survey on Deep Neural Network Pruning-Taxonomy, Comparison, Analysis,\n and Recommendations","summary":" Modern deep neural networks, particularly recent large language models, come\nwith massive model sizes that require significant computational and storage\nresources. To enable the deployment of modern models on resource-constrained\nenvironments and accelerate inference time, researchers have increasingly\nexplored pruning techniques as a popular research direction in neural network\ncompression. However, there is a dearth of up-to-date comprehensive review\npapers on pruning. 
To address this issue, in this survey, we provide a\ncomprehensive review of existing research works on deep neural network pruning\nin a taxonomy of 1) universal/specific speedup, 2) when to prune, 3) how to\nprune, and 4) fusion of pruning and other compression techniques. We then\nprovide a thorough comparative analysis of seven pairs of contrast settings for\npruning (e.g., unstructured/structured) and explore emerging topics, including\npost-training pruning, different levels of supervision for pruning, and broader\napplications (e.g., adversarial robustness) to shed light on the commonalities\nand differences of existing methods and lay the foundation for further method\ndevelopment. To facilitate future research, we build a curated collection of\ndatasets, networks, and evaluations on different applications. Finally, we\nprovide some valuable recommendations on selecting pruning methods and prospect\npromising research directions. We build a repository at\nhttps://github.com/hrcheng1066/awesome-pruning.\n","authors":["Hongrong Cheng","Miao Zhang","Javen Qinfeng Shi"],"pdf_url":"https://arxiv.org/pdf/2308.06767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06281v3","updated":"2023-08-13T13:12:47Z","published":"2023-07-12T16:23:09Z","title":"MMBench: Is Your Multi-modal Model an All-around Player?","summary":" Large vision-language models have recently achieved remarkable progress,\nexhibiting great perception and reasoning abilities concerning visual\ninformation. However, how to effectively evaluate these large vision-language\nmodels remains a major obstacle, hindering future model development.\nTraditional benchmarks like VQAv2 or COCO Caption provide quantitative\nperformance measurements but suffer from a lack of fine-grained ability\nassessment and non-robust evaluation metrics. Recent subjective benchmarks,\nsuch as OwlEval, offer comprehensive evaluations of a model's abilities by\nincorporating human labor, but they are not scalable and display significant\nbias. In response to these challenges, we propose MMBench, a novel\nmulti-modality benchmark. MMBench methodically develops a comprehensive\nevaluation pipeline, primarily comprised of two elements. The first element is\na meticulously curated dataset that surpasses existing similar benchmarks in\nterms of the number and variety of evaluation questions and abilities. The\nsecond element introduces a novel CircularEval strategy and incorporates the\nuse of ChatGPT. This implementation is designed to convert free-form\npredictions into pre-defined choices, thereby facilitating a more robust\nevaluation of the model's predictions. MMBench is a systematically-designed\nobjective benchmark for robustly evaluating the various abilities of\nvision-language models. We hope MMBench will assist the research community in\nbetter evaluating their models and encourage future advancements in this\ndomain. 
Project page: https://opencompass.org.cn/mmbench.\n","authors":["Yuan Liu","Haodong Duan","Yuanhan Zhang","Bo Li","Songyang Zhang","Wangbo Zhao","Yike Yuan","Jiaqi Wang","Conghui He","Ziwei Liu","Kai Chen","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2307.06281v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06762v1","updated":"2023-08-13T12:51:15Z","published":"2023-08-13T12:51:15Z","title":"Tissue Segmentation of Thick-Slice Fetal Brain MR Scans with Guidance\n from High-Quality Isotropic Volumes","summary":" Accurate tissue segmentation of thick-slice fetal brain magnetic resonance\n(MR) scans is crucial for both reconstruction of isotropic brain MR volumes and\nthe quantification of fetal brain development. However, this task is\nchallenging due to the use of thick-slice scans in clinically-acquired fetal\nbrain data. To address this issue, we propose to leverage high-quality\nisotropic fetal brain MR volumes (and also their corresponding annotations) as\nguidance for segmentation of thick-slice scans. Due to existence of significant\ndomain gap between high-quality isotropic volume (i.e., source data) and\nthick-slice scans (i.e., target data), we employ a domain adaptation technique\nto achieve the associated knowledge transfer (from high-quality \nvolumes to thick-slice scans). Specifically, we first register the\navailable high-quality isotropic fetal brain MR volumes across different\ngestational weeks to construct longitudinally-complete source data. To capture\ndomain-invariant information, we then perform Fourier decomposition to extract\nimage content and style codes. Finally, we propose a novel Cycle-Consistent\nDomain Adaptation Network (C2DA-Net) to efficiently transfer the knowledge\nlearned from high-quality isotropic volumes for accurate tissue segmentation of\nthick-slice scans. Our C2DA-Net can fully utilize a small set of annotated\nisotropic volumes to guide tissue segmentation on unannotated thick-slice\nscans. Extensive experiments on a large-scale dataset of 372 clinically\nacquired thick-slice MR scans demonstrate that our C2DA-Net achieves much\nbetter performance than cutting-edge methods quantitatively and qualitatively.\n","authors":["Shijie Huang","Xukun Zhang","Zhiming Cui","He Zhang","Geng Chen","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2308.06762v1.pdf","comment":"10 pages, 9 figures, 5 tables, Fetal MRI, Brain tissue segmentation,\n Unsupervised domain adaptation, Cycle-consistency"},{"id":"http://arxiv.org/abs/2308.06755v1","updated":"2023-08-13T12:23:06Z","published":"2023-08-13T12:23:06Z","title":"Influence Function Based Second-Order Channel Pruning-Evaluating True\n Loss Changes For Pruning Is Possible Without Retraining","summary":" A challenge of channel pruning is designing efficient and effective criteria\nto select channels to prune. A widely used criterion is minimal performance\ndegeneration. To accurately evaluate the truth performance degeneration\nrequires retraining the survived weights to convergence, which is prohibitively\nslow. Hence existing pruning methods use previous weights (without retraining)\nto evaluate the performance degeneration. However, we observe the loss changes\ndiffer significantly with and without retraining. It motivates us to develop a\ntechnique to evaluate true loss changes without retraining, with which channels\nto prune can be selected more reliably and confidently. 
We first derive a\nclosed-form estimator of the true loss change per pruning mask change, using\ninfluence functions without retraining. Influence function which is from robust\nstatistics reveals the impacts of a training sample on the model's prediction\nand is repurposed by us to assess impacts on true loss changes. We then show\nhow to assess the importance of all channels simultaneously and develop a novel\nglobal channel pruning algorithm accordingly. We conduct extensive experiments\nto verify the effectiveness of the proposed algorithm. To the best of our\nknowledge, we are the first that shows evaluating true loss changes for pruning\nwithout retraining is possible. This finding will open up opportunities for a\nseries of new paradigms to emerge that differ from existing pruning methods.\nThe code is available at https://github.com/hrcheng1066/IFSO.\n","authors":["Hongrong Cheng","Miao Zhang","Javen Qinfeng Shi"],"pdf_url":"https://arxiv.org/pdf/2308.06755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14104v3","updated":"2023-08-13T12:17:51Z","published":"2023-04-27T11:32:48Z","title":"Learning Human-Human Interactions in Images from Weak Textual\n Supervision","summary":" Interactions between humans are diverse and context-dependent, but previous\nworks have treated them as categorical, disregarding the heavy tail of possible\ninteractions. We propose a new paradigm of learning human-human interactions as\nfree text from a single still image, allowing for flexibility in modeling the\nunlimited space of situations and relationships between people. To overcome the\nabsence of data labelled specifically for this task, we use knowledge\ndistillation applied to synthetic caption data produced by a large language\nmodel without explicit supervision. We show that the pseudo-labels produced by\nthis procedure can be used to train a captioning model to effectively\nunderstand human-human interactions in images, as measured by a variety of\nmetrics that measure textual and semantic faithfulness and factual groundedness\nof our predictions. We further show that our approach outperforms SOTA image\ncaptioning and situation recognition models on this task. We will release our\ncode and pseudo-labels along with Waldo and Wenda, a manually-curated test set\nfor still image human-human interaction understanding.\n","authors":["Morris Alper","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2304.14104v3.pdf","comment":"To be presented at ICCV 2023. Project webpage:\n https://learning-interactions.github.io"},{"id":"http://arxiv.org/abs/2308.06749v1","updated":"2023-08-13T11:54:14Z","published":"2023-08-13T11:54:14Z","title":"FastLLVE: Real-Time Low-Light Video Enhancement with Intensity-Aware\n Lookup Table","summary":" Low-Light Video Enhancement (LLVE) has received considerable attention in\nrecent years. One of the critical requirements of LLVE is inter-frame\nbrightness consistency, which is essential for maintaining the temporal\ncoherence of the enhanced video. However, most existing single-image-based\nmethods fail to address this issue, resulting in flickering effect that\ndegrades the overall quality after enhancement. Moreover, 3D Convolution Neural\nNetwork (CNN)-based methods, which are designed for video to maintain\ninter-frame consistency, are computationally expensive, making them impractical\nfor real-time applications. 
To address these issues, we propose an efficient\npipeline named FastLLVE that leverages the Look-Up-Table (LUT) technique to\nmaintain inter-frame brightness consistency effectively. Specifically, we\ndesign a learnable Intensity-Aware LUT (IA-LUT) module for adaptive\nenhancement, which addresses the low-dynamic problem in low-light scenarios.\nThis enables FastLLVE to perform low-latency and low-complexity enhancement\noperations while maintaining high-quality results. Experimental results on\nbenchmark datasets demonstrate that our method achieves the State-Of-The-Art\n(SOTA) performance in terms of both image quality and inter-frame brightness\nconsistency. More importantly, our FastLLVE can process 1,080p videos at\n$\\mathit{50+}$ Frames Per Second (FPS), which is $\\mathit{2 \\times}$ faster\nthan SOTA CNN-based methods in inference time, making it a promising solution\nfor real-time applications. The code is available at\nhttps://github.com/Wenhao-Li-777/FastLLVE.\n","authors":["Wenhao Li","Guangyang Wu","Wenyi Wang","Peiran Ren","Xiaohong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.06749v1.pdf","comment":"11pages, 9 Figures, and 6 Tables. Accepted by ACMMM 2023"},{"id":"http://arxiv.org/abs/2308.06748v1","updated":"2023-08-13T11:49:05Z","published":"2023-08-13T11:49:05Z","title":"Target before Shooting: Accurate Anomaly Detection and Localization\n under One Millisecond via Cascade Patch Retrieval","summary":" In this work, by re-examining the \"matching\" nature of Anomaly Detection\n(AD), we propose a new AD framework that simultaneously enjoys new records of\nAD accuracy and dramatically high running speed. In this framework, the anomaly\ndetection problem is solved via a cascade patch retrieval procedure that\nretrieves the nearest neighbors for each test image patch in a coarse-to-fine\nfashion. Given a test sample, the top-K most similar training images are first\nselected based on a robust histogram matching process. Secondly, the nearest\nneighbor of each test patch is retrieved over the similar geometrical locations\non those \"global nearest neighbors\", by using a carefully trained local metric.\nFinally, the anomaly score of each test image patch is calculated based on the\ndistance to its \"local nearest neighbor\" and the \"non-background\" probability.\nThe proposed method is termed \"Cascade Patch Retrieval\" (CPR) in this work.\nDifferent from the conventional patch-matching-based AD algorithms, CPR selects\nproper \"targets\" (reference images and locations) before \"shooting\"\n(patch-matching). On the well-acknowledged MVTec AD, BTAD and MVTec-3D AD\ndatasets, the proposed algorithm consistently outperforms all the comparing\nSOTA methods by remarkable margins, measured by various AD metrics.\nFurthermore, CPR is extremely efficient. It runs at the speed of 113 FPS with\nthe standard setting while its simplified version only requires less than 1 ms\nto process an image at the cost of a trivial accuracy drop. 
The code of CPR is\navailable at https://github.com/flyinghu123/CPR.\n","authors":["Hanxi Li","Jianfei Hu","Bo Li","Hao Chen","Yongbin Zheng","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2308.06748v1.pdf","comment":"13 pages,8 figures"},{"id":"http://arxiv.org/abs/2304.08842v2","updated":"2023-08-13T11:31:34Z","published":"2023-04-18T09:13:52Z","title":"UDTIRI: An Open-Source Intelligent Road Inspection Benchmark Suite","summary":" It is seen that there is enormous potential to leverage powerful deep\nlearning methods in the emerging field of urban digital twins. It is\nparticularly in the area of intelligent road inspection where there is\ncurrently limited research and data available. To facilitate progress in this\nfield, we have developed a well-labeled road pothole dataset named Urban\nDigital Twins Intelligent Road Inspection (UDTIRI) dataset. We hope this\ndataset will enable the use of powerful deep learning methods in urban road\ninspection, providing algorithms with a more comprehensive understanding of the\nscene and maximizing their potential. Our dataset comprises 1000 images of\npotholes, captured in various scenarios with different lighting and humidity\nconditions. Our intention is to employ this dataset for object detection,\nsemantic segmentation, and instance segmentation tasks. Our team has devoted\nsignificant effort to conducting a detailed statistical analysis, and\nbenchmarking a selection of representative algorithms from recent years. We\nalso provide a multi-task platform for researchers to fully exploit the\nperformance of various algorithms with the support of UDTIRI dataset.\n","authors":["Sicen Guo","Jiahang Li","Shuai Su","Yi Feng","Dacheng Zhou","Chen Chen","Denghuang Zhang","Xingyi Zhu","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2304.08842v2.pdf","comment":"Database webpage: https://www.udtiri.com/, Kaggle webpage:\n https://www.kaggle.com/datasets/jiahangli617/udtiri"},{"id":"http://arxiv.org/abs/2308.06746v1","updated":"2023-08-13T11:26:56Z","published":"2023-08-13T11:26:56Z","title":"Self-supervised Noise2noise Method Utilizing Corrupted Images with a\n Modular Network for LDCT Denoising","summary":" Deep learning is a very promising technique for low-dose computed tomography\n(LDCT) image denoising. However, traditional deep learning methods require\npaired noisy and clean datasets, which are often difficult to obtain. This\npaper proposes a new method for performing LDCT image denoising with only LDCT\ndata, which means that normal-dose CT (NDCT) is not needed. We adopt a\ncombination including the self-supervised noise2noise model and the\nnoisy-as-clean strategy. First, we add a second yet similar type of noise to\nLDCT images multiple times. Note that we use LDCT images based on the\nnoisy-as-clean strategy for corruption instead of NDCT images. Then, the\nnoise2noise model is executed with only the secondary corrupted images for\ntraining. We select a modular U-Net structure from several candidates with\nshared parameters to perform the task, which increases the receptive field\nwithout increasing the parameter size. The experimental results obtained on the\nMayo LDCT dataset show the effectiveness of the proposed method compared with\nthat of state-of-the-art deep learning methods. 
The developed code is available\nat https://github.com/XYuan01/Self-supervised-Noise2Noise-for-LDCT.\n","authors":["Yuting Zhu","Qiang He","Yudong Yao","Yueyang Teng"],"pdf_url":"https://arxiv.org/pdf/2308.06746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06743v1","updated":"2023-08-13T11:02:16Z","published":"2023-08-13T11:02:16Z","title":"TextDiff: Mask-Guided Residual Diffusion Models for Scene Text Image\n Super-Resolution","summary":" The goal of scene text image super-resolution is to reconstruct\nhigh-resolution text-line images from unrecognizable low-resolution inputs. The\nexisting methods relying on the optimization of pixel-level loss tend to yield\ntext edges that exhibit a notable degree of blurring, thereby exerting a\nsubstantial impact on both the readability and recognizability of the text. To\naddress these issues, we propose TextDiff, the first diffusion-based framework\ntailored for scene text image super-resolution. It contains two modules: the\nText Enhancement Module (TEM) and the Mask-Guided Residual Diffusion Module\n(MRD). The TEM generates an initial deblurred text image and a mask that\nencodes the spatial location of the text. The MRD is responsible for\neffectively sharpening the text edge by modeling the residuals between the\nground-truth images and the initial deblurred images. Extensive experiments\ndemonstrate that our TextDiff achieves state-of-the-art (SOTA) performance on\npublic benchmark datasets and can improve the readability of scene text images.\nMoreover, our proposed MRD module is plug-and-play that effectively sharpens\nthe text edges produced by SOTA methods. This enhancement not only improves the\nreadability and recognizability of the results generated by SOTA methods but\nalso does not require any additional joint training. Available\nCodes:https://github.com/Lenubolim/TextDiff.\n","authors":["Baolin Liu","Zongyuan Yang","Pengfei Wang","Junjie Zhou","Ziqi Liu","Ziyi Song","Yan Liu","Yongping Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.06743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06739v1","updated":"2023-08-13T10:07:46Z","published":"2023-08-13T10:07:46Z","title":"Free-ATM: Exploring Unsupervised Learning on Diffusion-Generated Images\n with Free Attention Masks","summary":" Despite the rapid advancement of unsupervised learning in visual\nrepresentation, it requires training on large-scale datasets that demand costly\ndata collection, and pose additional challenges due to concerns regarding data\nprivacy. Recently, synthetic images generated by text-to-image diffusion\nmodels, have shown great potential for benefiting image recognition. Although\npromising, there has been inadequate exploration dedicated to unsupervised\nlearning on diffusion-generated images. To address this, we start by uncovering\nthat diffusion models' cross-attention layers inherently provide\nannotation-free attention masks aligned with corresponding text inputs on\ngenerated images. We then investigate the problems of three prevalent\nunsupervised learning techniques ( i.e., contrastive learning, masked modeling,\nand vision-language pretraining) and introduce customized solutions by fully\nexploiting the aforementioned free attention masks. Our approach is validated\nthrough extensive experiments that show consistent improvements in baseline\nmodels across various downstream tasks, including image classification,\ndetection, segmentation, and image-text retrieval. 
By utilizing our method, it\nis possible to close the performance gap between unsupervised pretraining on\nsynthetic data and real-world scenarios.\n","authors":["David Junhao Zhang","Mutian Xu","Chuhui Xue","Wenqing Zhang","Xiaoguang Han","Song Bai","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2308.06739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06735v1","updated":"2023-08-13T09:55:04Z","published":"2023-08-13T09:55:04Z","title":"AerialVLN: Vision-and-Language Navigation for UAVs","summary":" Recently emerged Vision-and-Language Navigation (VLN) tasks have drawn\nsignificant attention in both computer vision and natural language processing\ncommunities. Existing VLN tasks are built for agents that navigate on the\nground, either indoors or outdoors. However, many tasks require intelligent\nagents to carry out in the sky, such as UAV-based goods delivery,\ntraffic/security patrol, and scenery tour, to name a few. Navigating in the sky\nis more complicated than on the ground because agents need to consider the\nflying height and more complex spatial relationship reasoning. To fill this gap\nand facilitate research in this field, we propose a new task named AerialVLN,\nwhich is UAV-based and towards outdoor environments. We develop a 3D simulator\nrendered by near-realistic pictures of 25 city-level scenarios. Our simulator\nsupports continuous navigation, environment extension and configuration. We\nalso proposed an extended baseline model based on the widely-used\ncross-modal-alignment (CMA) navigation methods. We find that there is still a\nsignificant gap between the baseline model and human performance, which\nsuggests AerialVLN is a new challenging task. Dataset and code is available at\nhttps://github.com/AirVLN/AirVLN.\n","authors":["Shubo Liu","Hongsheng Zhang","Yuankai Qi","Peng Wang","Yaning Zhang","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.06735v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2303.16196v2","updated":"2023-08-13T09:35:26Z","published":"2023-03-28T17:58:05Z","title":"SparseNeRF: Distilling Depth Ranking for Few-shot Novel View Synthesis","summary":" Neural Radiance Field (NeRF) significantly degrades when only a limited\nnumber of views are available. To complement the lack of 3D information,\ndepth-based models, such as DSNeRF and MonoSDF, explicitly assume the\navailability of accurate depth maps of multiple views. They linearly scale the\naccurate depth maps as supervision to guide the predicted depth of few-shot\nNeRFs. However, accurate depth maps are difficult and expensive to capture due\nto wide-range depth distances in the wild.\n In this work, we present a new Sparse-view NeRF (SparseNeRF) framework that\nexploits depth priors from real-world inaccurate observations. The inaccurate\ndepth observations are either from pre-trained depth models or coarse depth\nmaps of consumer-level depth sensors. Since coarse depth maps are not strictly\nscaled to the ground-truth depth maps, we propose a simple yet effective\nconstraint, a local depth ranking method, on NeRFs such that the expected depth\nranking of the NeRF is consistent with that of the coarse depth maps in local\npatches. To preserve the spatial continuity of the estimated depth of NeRF, we\nfurther propose a spatial continuity constraint to encourage the consistency of\nthe expected depth continuity of NeRF with coarse depth maps. 
Surprisingly,\nwith simple depth ranking constraints, SparseNeRF outperforms all\nstate-of-the-art few-shot NeRF methods (including depth-based models) on\nstandard LLFF and DTU datasets. Moreover, we collect a new dataset NVS-RGBD\nthat contains real-world depth maps from Azure Kinect, ZED 2, and iPhone 13\nPro. Extensive experiments on NVS-RGBD dataset also validate the superiority\nand generalizability of SparseNeRF. Code and dataset are available at\nhttps://sparsenerf.github.io/.\n","authors":["Guangcong Wang","Zhaoxi Chen","Chen Change Loy","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2303.16196v2.pdf","comment":"Accepted by ICCV 2023, Project page: https://sparsenerf.github.io/"},{"id":"http://arxiv.org/abs/2305.01643v2","updated":"2023-08-13T09:25:18Z","published":"2023-05-02T17:55:38Z","title":"Neural LiDAR Fields for Novel View Synthesis","summary":" We present Neural Fields for LiDAR (NFL), a method to optimise a neural field\nscene representation from LiDAR measurements, with the goal of synthesizing\nrealistic LiDAR scans from novel viewpoints. NFL combines the rendering power\nof neural fields with a detailed, physically motivated model of the LiDAR\nsensing process, thus enabling it to accurately reproduce key sensor behaviors\nlike beam divergence, secondary returns, and ray dropping. We evaluate NFL on\nsynthetic and real LiDAR scans and show that it outperforms explicit\nreconstruct-then-simulate methods as well as other NeRF-style methods on LiDAR\nnovel view synthesis task. Moreover, we show that the improved realism of the\nsynthesized views narrows the domain gap to real scans and translates to better\nregistration and semantic segmentation performance.\n","authors":["Shengyu Huang","Zan Gojcic","Zian Wang","Francis Williams","Yoni Kasten","Sanja Fidler","Konrad Schindler","Or Litany"],"pdf_url":"https://arxiv.org/pdf/2305.01643v2.pdf","comment":"ICCV 2023 - camera ready. Project page:\n https://research.nvidia.com/labs/toronto-ai/nfl/"},{"id":"http://arxiv.org/abs/2308.06725v1","updated":"2023-08-13T09:05:56Z","published":"2023-08-13T09:05:56Z","title":"CLE Diffusion: Controllable Light Enhancement Diffusion Model","summary":" Low light enhancement has gained increasing importance with the rapid\ndevelopment of visual creation and editing. However, most existing enhancement\nalgorithms are designed to homogeneously increase the brightness of images to a\npre-defined extent, limiting the user experience. To address this issue, we\npropose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a\nnovel diffusion framework to provide users with rich controllability. Built\nwith a conditional diffusion model, we introduce an illumination embedding to\nlet users control their desired brightness level. Additionally, we incorporate\nthe Segment-Anything Model (SAM) to enable user-friendly region\ncontrollability, where users can click on objects to specify the regions they\nwish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves\ncompetitive performance regarding quantitative metrics, qualitative results,\nand versatile controllability. 
Project page:\n\\url{https://yuyangyin.github.io/CLEDiffusion/}\n","authors":["Yuyang Yin","Dejia Xu","Chuangchuang Tan","Ping Liu","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2308.06725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04512v2","updated":"2023-08-13T08:41:40Z","published":"2023-04-10T11:05:20Z","title":"Defense-Prefix for Preventing Typographic Attacks on CLIP","summary":" Vision-language pre-training models (VLPs) have exhibited revolutionary\nimprovements in various vision-language tasks. In VLP, some adversarial attacks\nfool a model into false or absurd classifications. Previous studies addressed\nthese attacks by fine-tuning the model or changing its architecture. However,\nthese methods risk losing the original model's performance and are difficult to\napply to downstream tasks. In particular, their applicability to other tasks\nhas not been considered. In this study, we addressed the reduction of the\nimpact of typographic attacks on CLIP without changing the model parameters. To\nachieve this, we expand the idea of ``prefix learning'' and introduce our\nsimple yet effective method: Defense-Prefix (DP), which inserts the DP token\nbefore a class name to make words ``robust'' against typographic attacks. Our\nmethod can be easily applied to downstream tasks, such as object detection,\nbecause the proposed method is independent of the model parameters. Our method\nsignificantly improves the accuracy of classification tasks for typographic\nattack datasets, while maintaining the zero-shot capabilities of the model. In\naddition, we leverage our proposed method for object detection, demonstrating\nits high applicability and effectiveness. The codes and datasets are available\nat https://github.com/azuma164/Defense-Prefix.\n","authors":["Hiroki Azuma","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2304.04512v2.pdf","comment":"ICCV2023 Workshop"},{"id":"http://arxiv.org/abs/2308.06721v1","updated":"2023-08-13T08:34:51Z","published":"2023-08-13T08:34:51Z","title":"IP-Adapter: Text Compatible Image Prompt Adapter for Text-to-Image\n Diffusion Models","summary":" Recent years have witnessed the strong power of large text-to-image diffusion\nmodels for the impressive generative capability to create high-fidelity images.\nHowever, it is very tricky to generate desired images using only text prompt as\nit often involves complex prompt engineering. An alternative to text prompt is\nimage prompt, as the saying goes: \"an image is worth a thousand words\".\nAlthough existing methods of direct fine-tuning from pretrained models are\neffective, they require large computing resources and are not compatible with\nother base models, text prompt, and structural controls. In this paper, we\npresent IP-Adapter, an effective and lightweight adapter to achieve image\nprompt capability for the pretrained text-to-image diffusion models. The key\ndesign of our IP-Adapter is decoupled cross-attention mechanism that separates\ncross-attention layers for text features and image features. 
Despite the\nsimplicity of our method, an IP-Adapter with only 22M parameters can achieve\ncomparable or even better performance to a fully fine-tuned image prompt model.\nAs we freeze the pretrained diffusion model, the proposed IP-Adapter can be\ngeneralized not only to other custom models fine-tuned from the same base\nmodel, but also to controllable generation using existing controllable tools.\nWith the benefit of the decoupled cross-attention strategy, the image prompt\ncan also work well with the text prompt to achieve multimodal image generation.\nThe project page is available at \\url{https://ip-adapter.github.io}.\n","authors":["Hu Ye","Jun Zhang","Sibo Liu","Xiao Han","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2308.06721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06719v1","updated":"2023-08-13T08:20:17Z","published":"2023-08-13T08:20:17Z","title":"3D Scene Graph Prediction on Point Clouds Using Knowledge Graphs","summary":" 3D scene graph prediction is a task that aims to concurrently predict object\nclasses and their relationships within a 3D environment. As these environments\nare primarily designed by and for humans, incorporating commonsense knowledge\nregarding objects and their relationships can significantly constrain and\nenhance the prediction of the scene graph. In this paper, we investigate the\napplication of commonsense knowledge graphs for 3D scene graph prediction on\npoint clouds of indoor scenes. Through experiments conducted on a real-world\nindoor dataset, we demonstrate that integrating external commonsense knowledge\nvia the message-passing method leads to a 15.0 % improvement in scene graph\nprediction accuracy with external knowledge and $7.96\\%$ with internal\nknowledge when compared to state-of-the-art algorithms. We also tested in the\nreal world with 10 frames per second for scene graph generation to show the\nusage of the model in a more realistic robotics setting.\n","authors":["Yiding Qiu","Henrik I. Christensen"],"pdf_url":"https://arxiv.org/pdf/2308.06719v1.pdf","comment":"accepted at CASE 2023"},{"id":"http://arxiv.org/abs/2308.06715v1","updated":"2023-08-13T08:11:40Z","published":"2023-08-13T08:11:40Z","title":"StairNetV3: Depth-aware Stair Modeling using Deep Learning","summary":" Vision-based stair perception can help autonomous mobile robots deal with the\nchallenge of climbing stairs, especially in unfamiliar environments. To address\nthe problem that current monocular vision methods are difficult to model stairs\naccurately without depth information, this paper proposes a depth-aware stair\nmodeling method for monocular vision. Specifically, we take the extraction of\nstair geometric features and the prediction of depth images as joint tasks in a\nconvolutional neural network (CNN), with the designed information propagation\narchitecture, we can achieve effective supervision for stair geometric feature\nlearning by depth information. In addition, to complete the stair modeling, we\ntake the convex lines, concave lines, tread surfaces and riser surfaces as\nstair geometric features and apply Gaussian kernels to enable the network to\npredict contextual information within the stair lines. Combined with the depth\ninformation obtained by depth sensors, we propose a stair point cloud\nreconstruction method that can quickly get point clouds belonging to the stair\nstep surfaces. 
Experiments on our dataset show that our method has a\nsignificant improvement over the previous best monocular vision method, with an\nintersection over union (IOU) increase of 3.4 %, and the lightweight version\nhas a fast detection speed and can meet the requirements of most real-time\napplications. Our dataset is available at\nhttps://data.mendeley.com/datasets/6kffmjt7g2/1.\n","authors":["Chen Wang","Zhongcai Pei","Shuang Qiu","Yachun Wang","Zhiyong Tang"],"pdf_url":"https://arxiv.org/pdf/2308.06715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06713v1","updated":"2023-08-13T08:06:18Z","published":"2023-08-13T08:06:18Z","title":"LAW-Diffusion: Complex Scene Generation by Diffusion with Layouts","summary":" Thanks to the rapid development of diffusion models, unprecedented progress\nhas been witnessed in image synthesis. Prior works mostly rely on pre-trained\nlinguistic models, but a text is often too abstract to properly specify all the\nspatial properties of an image, e.g., the layout configuration of a scene,\nleading to the sub-optimal results of complex scene generation. In this paper,\nwe achieve accurate complex scene generation by proposing a semantically\ncontrollable Layout-AWare diffusion model, termed LAW-Diffusion. Distinct from\nthe previous Layout-to-Image generation (L2I) methods that only explore\ncategory-aware relationships, LAW-Diffusion introduces a spatial dependency\nparser to encode the location-aware semantic coherence across objects as a\nlayout embedding and produces a scene with perceptually harmonious object\nstyles and contextual relations. To be specific, we delicately instantiate each\nobject's regional semantics as an object region map and leverage a\nlocation-aware cross-object attention module to capture the spatial\ndependencies among those disentangled representations. We further propose an\nadaptive guidance schedule for our layout guidance to mitigate the trade-off\nbetween the regional semantic alignment and the texture fidelity of generated\nobjects. Moreover, LAW-Diffusion allows for instance reconfiguration while\nmaintaining the other regions in a synthesized image by introducing a\nlayout-aware latent grafting mechanism to recompose its local regional\nsemantics. To better verify the plausibility of generated scenes, we propose a\nnew evaluation metric for the L2I task, dubbed Scene Relation Score (SRS) to\nmeasure how the images preserve the rational and harmonious relations among\ncontextual objects. Comprehensive experiments demonstrate that our\nLAW-Diffusion yields the state-of-the-art generative performance, especially\nwith coherent object relations.\n","authors":["Binbin Yang","Yi Luo","Ziliang Chen","Guangrun Wang","Xiaodan Liang","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2308.06713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06712v1","updated":"2023-08-13T08:02:14Z","published":"2023-08-13T08:02:14Z","title":"Compositional Feature Augmentation for Unbiased Scene Graph Generation","summary":" Scene Graph Generation (SGG) aims to detect all the visual relation triplets\n in a given image. With the emergence of various advanced\ntechniques for better utilizing both the intrinsic and extrinsic information in\neach relation triplet, SGG has achieved great progress over the recent years.\nHowever, due to the ubiquitous long-tailed predicate distributions, today's SGG\nmodels are still easily biased to the head predicates. 
Currently, the most\nprevalent debiasing solutions for SGG are re-balancing methods, e.g., changing\nthe distributions of original training samples. In this paper, we argue that\nall existing re-balancing strategies fail to increase the diversity of the\nrelation triplet features of each predicate, which is critical for robust SGG.\nTo this end, we propose a novel Compositional Feature Augmentation (CFA)\nstrategy, which is the first unbiased SGG work to mitigate the bias issue from\nthe perspective of increasing the diversity of triplet features. Specifically,\nwe first decompose each relation triplet feature into two components: intrinsic\nfeature and extrinsic feature, which correspond to the intrinsic\ncharacteristics and extrinsic contexts of a relation triplet, respectively.\nThen, we design two different feature augmentation modules to enrich the\nfeature diversity of original relation triplets by replacing or mixing up\neither their intrinsic or extrinsic features from other samples. Due to its\nmodel-agnostic nature, CFA can be seamlessly incorporated into various SGG\nframeworks. Extensive ablations have shown that CFA achieves a new\nstate-of-the-art performance on the trade-off between different metrics.\n","authors":["Lin Li","Guikun Chen","Jun Xiao","Yi Yang","Chunping Wang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2308.06712v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.06707v1","updated":"2023-08-13T07:51:59Z","published":"2023-08-13T07:51:59Z","title":"Condition-Adaptive Graph Convolution Learning for Skeleton-Based Gait\n Recognition","summary":" Graph convolutional networks have been widely applied in skeleton-based gait\nrecognition. A key challenge in this task is to distinguish the individual\nwalking styles of different subjects across various views. Existing\nstate-of-the-art methods employ uniform convolutions to extract features from\ndiverse sequences and ignore the effects of viewpoint changes. To overcome\nthese limitations, we propose a condition-adaptive graph (CAG) convolution\nnetwork that can dynamically adapt to the specific attributes of each skeleton\nsequence and the corresponding view angle. In contrast to using fixed weights\nfor all joints and sequences, we introduce a joint-specific filter learning\n(JSFL) module in the CAG method, which produces sequence-adaptive filters at\nthe joint level. The adaptive filters capture fine-grained patterns that are\nunique to each joint, enabling the extraction of diverse spatial-temporal\ninformation about body parts. Additionally, we design a view-adaptive topology\nlearning (VATL) module that generates adaptive graph topologies. These graph\ntopologies are used to correlate the joints adaptively according to the\nspecific view conditions. Thus, CAG can simultaneously adjust to various\nwalking styles and viewpoints. Experiments on the two most widely used datasets\n(i.e., CASIA-B and OU-MVLP) show that CAG surpasses all previous skeleton-based\nmethods. 
Moreover, the recognition performance can be enhanced by simply\ncombining CAG with appearance-based methods, demonstrating the ability of CAG\nto provide useful complementary information.The source code will be available\nat https://github.com/OliverHxh/CAG.\n","authors":["Xiaohu Huang","Xinggang Wang","Zhidianqiu Jin","Bo Yang","Botao He","Bin Feng","Wenyu Liu"],"pdf_url":"https://arxiv.org/pdf/2308.06707v1.pdf","comment":"Accepted by TIP journal"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.04661v2","updated":"2023-08-13T03:30:28Z","published":"2023-08-09T01:58:28Z","title":"Unified Matrix Factorization with Dynamic Multi-view Clustering","summary":" Matrix factorization (MF) is a classical collaborative filtering algorithm\nfor recommender systems. It decomposes the user-item interaction matrix into a\nproduct of low-dimensional user representation matrix and item representation\nmatrix. In typical recommendation scenarios, the user-item interaction paradigm\nis usually a two-stage process and requires static clustering analysis of the\nobtained user and item representations. The above process, however, is time and\ncomputationally intensive, making it difficult to apply in real-time to\ne-commerce or Internet of Things environments with billions of users and\ntrillions of items. To address this, we propose a unified matrix factorization\nmethod based on dynamic multi-view clustering (MFDMC) that employs an\nend-to-end training paradigm. Specifically, in each view, a user/item\nrepresentation is regarded as a weighted projection of all clusters. The\nrepresentation of each cluster is learnable, enabling the dynamic discarding of\nbad clusters. Furthermore, we employ multi-view clustering to represent\nmultiple roles of users/items, effectively utilizing the representation space\nand improving the interpretability of the user/item representations for\ndownstream tasks. Extensive experiments show that our proposed MFDMC achieves\nstate-of-the-art performance on real-world recommendation datasets.\nAdditionally, comprehensive visualization and ablation studies interpretably\nconfirm that our method provides meaningful representations for downstream\ntasks of users/items.\n","authors":["Shangde Gao","Ke Liu","Yichao Fu"],"pdf_url":"https://arxiv.org/pdf/2308.04661v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08068v2","updated":"2023-08-13T03:04:08Z","published":"2023-02-16T04:06:25Z","title":"LabelPrompt: Effective Prompt-based Learning for Relation Classification","summary":" Recently, prompt-based learning has gained popularity across many natural\nlanguage processing (NLP) tasks by reformulating them into a cloze-style format\nto better align pre-trained language models (PLMs) with downstream tasks.\nHowever, applying this approach to relation classification poses unique\nchallenges. Specifically, associating natural language words that fill the\nmasked token with semantic relation labels (\\textit{e.g.}\n\\textit{``org:founded\\_by}'') is difficult. To address this challenge, this\npaper presents a novel prompt-based learning method, namely LabelPrompt, for\nthe relation classification task. Motivated by the intuition to ``GIVE MODEL\nCHOICES!'', we first define additional tokens to represent relation labels,\nwhich regard these tokens as the verbaliser with semantic initialisation and\nexplicitly construct them with a prompt template method. 
Then, to mitigate\ninconsistency between predicted relations and given entities, we implement an\nentity-aware module with contrastive learning. Last, we conduct an attention\nquery strategy within the self-attention layer to differentiates prompt tokens\nand sequence tokens. Together, these strategies enhance the adaptability of\nprompt-based learning, especially when only small labelled datasets is\navailable. Comprehensive experiments on benchmark datasets demonstrate the\nsuperiority of our method, particularly in the few-shot scenario.\n","authors":["Wenjie Zhang","Xiaoning Song","Zhenhua Feng","Tianyang Xu","Xiaojun Wu"],"pdf_url":"https://arxiv.org/pdf/2302.08068v2.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.08500v1","updated":"2023-08-13T18:28:56Z","published":"2023-08-13T18:28:56Z","title":"InTune: Reinforcement Learning-based Data Pipeline Optimization for Deep\n Recommendation Models","summary":" Deep learning-based recommender models (DLRMs) have become an essential\ncomponent of many modern recommender systems. Several companies are now\nbuilding large compute clusters reserved only for DLRM training, driving new\ninterest in cost- and time- saving optimizations. The systems challenges faced\nin this setting are unique; while typical deep learning training jobs are\ndominated by model execution, the most important factor in DLRM training\nperformance is often online data ingestion.\n In this paper, we explore the unique characteristics of this data ingestion\nproblem and provide insights into DLRM training pipeline bottlenecks and\nchallenges. We study real-world DLRM data processing pipelines taken from our\ncompute cluster at Netflix to observe the performance impacts of online\ningestion and to identify shortfalls in existing pipeline optimizers. We find\nthat current tooling either yields sub-optimal performance, frequent crashes,\nor else requires impractical cluster re-organization to adopt. Our studies lead\nus to design and build a new solution for data pipeline optimization, InTune.\n InTune employs a reinforcement learning (RL) agent to learn how to distribute\nthe CPU resources of a trainer machine across a DLRM data pipeline to more\neffectively parallelize data loading and improve throughput. Our experiments\nshow that InTune can build an optimized data pipeline configuration within only\na few minutes, and can easily be integrated into existing training workflows.\nBy exploiting the responsiveness and adaptability of RL, InTune achieves higher\nonline data ingestion rates than existing optimizers, thus reducing idle times\nin model execution and increasing efficiency. We apply InTune to our real-world\ncluster, and find that it increases data ingestion throughput by as much as\n2.29X versus state-of-the-art data pipeline optimizers while also improving\nboth CPU & GPU utilization.\n","authors":["Kabir Nagrecha","Lingyi Liu","Pablo Delgado","Prasanna Padmanabhan"],"pdf_url":"https://arxiv.org/pdf/2308.08500v1.pdf","comment":"Accepted at RecSys 2023. 11 pages, 2 pages of references. 8 figures\n with 2 tables"},{"id":"http://arxiv.org/abs/2308.07935v1","updated":"2023-08-13T09:20:47Z","published":"2023-08-13T09:20:47Z","title":"Transforming Sentiment Analysis in the Financial Domain with ChatGPT","summary":" Financial sentiment analysis plays a crucial role in decoding market trends\nand guiding strategic trading decisions. 
Despite the deployment of advanced\ndeep learning techniques and language models to refine sentiment analysis in\nfinance, this study breaks new ground by investigating the potential of large\nlanguage models, particularly ChatGPT 3.5, in financial sentiment analysis,\nwith a strong emphasis on the foreign exchange market (forex). Employing a\nzero-shot prompting approach, we examine multiple ChatGPT prompts on a\nmeticulously curated dataset of forex-related news headlines, measuring\nperformance using metrics such as precision, recall, f1-score, and Mean\nAbsolute Error (MAE) of the sentiment class. Additionally, we probe the\ncorrelation between predicted sentiment and market returns as an additional\nevaluation approach. ChatGPT, compared to FinBERT, a well-established sentiment\nanalysis model for financial texts, exhibited approximately 35\\% enhanced\nperformance in sentiment classification and a 36\\% higher correlation with\nmarket returns. By underlining the significance of prompt engineering,\nparticularly in zero-shot contexts, this study spotlights ChatGPT's potential\nto substantially boost sentiment analysis in financial applications. By sharing\nthe utilized dataset, our intention is to stimulate further research and\nadvancements in the field of financial services.\n","authors":["Georgios Fatouros","John Soldatos","Kalliopi Kouroumali","Georgios Makridis","Dimosthenis Kyriazis"],"pdf_url":"https://arxiv.org/pdf/2308.07935v1.pdf","comment":"10 pages, 8 figures, Preprint submitted to Machine Learning with\n Applications"},{"id":"http://arxiv.org/abs/2308.08461v1","updated":"2023-08-13T08:10:56Z","published":"2023-08-13T08:10:56Z","title":"CDR: Conservative Doubly Robust Learning for Debiased Recommendation","summary":" In recommendation systems (RS), user behavior data is observational rather\nthan experimental, resulting in widespread bias in the data. Consequently,\ntackling bias has emerged as a major challenge in the field of recommendation\nsystems. Recently, Doubly Robust Learning (DR) has gained significant attention\ndue to its remarkable performance and robust properties. However, our\nexperimental findings indicate that existing DR methods are severely impacted\nby the presence of so-called Poisonous Imputation, where the imputation\nsignificantly deviates from the truth and becomes counterproductive.\n To address this issue, this work proposes Conservative Doubly Robust strategy\n(CDR) which filters imputations by scrutinizing their mean and variance.\nTheoretical analyses show that CDR offers reduced variance and improved tail\nbounds.In addition, our experimental investigations illustrate that CDR\nsignificantly enhances performance and can indeed reduce the frequency of\npoisonous imputation.\n","authors":["ZiJie Song","JiaWei Chen","Sheng Zhou","QiHao Shi","Yan Feng","Chun Chen","Can Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08461v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.06862v1","updated":"2023-08-13T23:34:36Z","published":"2023-08-13T23:34:36Z","title":"Effect of Choosing Loss Function when Using T-batching for\n Representation Learning on Dynamic Networks","summary":" Representation learning methods have revolutionized machine learning on\nnetworks by converting discrete network structures into continuous domains.\nHowever, dynamic networks that evolve over time pose new challenges. 
To address\nthis, dynamic representation learning methods have gained attention, offering\nbenefits like reduced learning time and improved accuracy by utilizing temporal\ninformation.\n T-batching is a valuable technique for training dynamic network models that\nreduces training time while preserving vital conditions for accurate modeling.\nHowever, we have identified a limitation in the training loss function used\nwith t-batching. Through mathematical analysis, we propose two alternative loss\nfunctions that overcome these issues, resulting in enhanced training\nperformance.\n We extensively evaluate the proposed loss functions on synthetic and\nreal-world dynamic networks. The results consistently demonstrate superior\nperformance compared to the original loss function. Notably, in a real-world\nnetwork characterized by diverse user interaction histories, the proposed loss\nfunctions achieved more than 26.9% enhancement in Mean Reciprocal Rank (MRR)\nand more than 11.8% improvement in Recall@10. These findings underscore the\nefficacy of the proposed loss functions in dynamic network modeling.\n","authors":["Erfan Loghmani","MohammadAmin Fazli"],"pdf_url":"https://arxiv.org/pdf/2308.06862v1.pdf","comment":"29 pages, 10 figures, 4 tables, Submitted to Information Sciences"},{"id":"http://arxiv.org/abs/2205.10952v2","updated":"2023-08-13T23:17:26Z","published":"2022-05-22T23:14:27Z","title":"Analysis of functional neural codes of deep learning models","summary":" Deep neural networks (DNNs), the agents of deep learning (DL), require a\nmassive number of parallel/sequential operations. This makes it difficult to\ncomprehend DNNs' operations and impedes proper diagnosis. Without better\nknowledge of their internal process, deploying DNNs in high-stakes domains can\nlead to catastrophic failures. Therefore, to build more reliable DNNs/DL to be\ndeployed in high-stakes real-world problems, it is imperative that we gain\ninsights into DNNs' internal operations underlying their decision-making. Here,\nwe use the self-organizing map (SOM) to analyze DL models' internal codes\nassociated with DNNs' decision-making. Our analyses suggest that shallow layers\nclose to the input layer compress features into condensed space and that deep\nlayers close to the output layer expand feature space. We also found evidence\nindicating that compressed features may underlie DNNs' vulnerabilities to\nadversarial perturbations.\n","authors":["Jung Hoon Lee","Sujith Vijayan"],"pdf_url":"https://arxiv.org/pdf/2205.10952v2.pdf","comment":"13 pages, 8 main figures, 3 supplemental figures, 3 supplemental\n tables"},{"id":"http://arxiv.org/abs/2306.12086v2","updated":"2023-08-13T22:59:19Z","published":"2023-06-21T08:05:05Z","title":"What Constitutes Good Contrastive Learning in Time-Series Forecasting?","summary":" In recent years, the introduction of self-supervised contrastive learning\n(SSCL) has demonstrated remarkable improvements in representation learning\nacross various domains, including natural language processing and computer\nvision. By leveraging the inherent benefits of self-supervision, SSCL enables\nthe pre-training of representation models using vast amounts of unlabeled data.\nDespite these advances, there remains a significant gap in understanding the\nimpact of different SSCL strategies on time series forecasting performance, as\nwell as the specific benefits that SSCL can bring. 
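A minimal sketch of the Mean Reciprocal Rank (MRR) and Recall@10 metrics used to report the t-batching results above, assuming one ranked candidate list per interaction with a single ground-truth item; this is an illustrative helper, not the authors' evaluation code.

```python
# Illustrative MRR and Recall@k helpers; data layout is an assumption for this sketch.
from typing import Iterable, List, Sequence


def reciprocal_rank(ranked_items: List[int], target: int) -> float:
    """Return 1 / rank of the target item, or 0.0 if it is absent."""
    try:
        return 1.0 / (ranked_items.index(target) + 1)
    except ValueError:
        return 0.0


def recall_at_k(ranked_items: List[int], target: int, k: int = 10) -> float:
    """Return 1.0 if the target appears in the top-k predictions, else 0.0."""
    return 1.0 if target in ranked_items[:k] else 0.0


def evaluate(predictions: Iterable[Sequence[int]], targets: Iterable[int], k: int = 10):
    rrs, recalls = [], []
    for ranked, target in zip(predictions, targets):
        ranked = list(ranked)
        rrs.append(reciprocal_rank(ranked, target))
        recalls.append(recall_at_k(ranked, target, k))
    return sum(rrs) / len(rrs), sum(recalls) / len(recalls)


if __name__ == "__main__":
    preds = [[3, 1, 2], [5, 4, 9], [7, 8, 6]]
    gold = [1, 9, 0]
    mrr, rec10 = evaluate(preds, gold, k=10)
    print(f"MRR={mrr:.3f}  Recall@10={rec10:.3f}")
```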
This paper aims to address\nthese gaps by conducting a comprehensive analysis of the effectiveness of\nvarious training variables, including different SSCL algorithms, learning\nstrategies, model architectures, and their interplay. Additionally, to gain\ndeeper insights into the improvements brought about by SSCL in the context of\ntime-series forecasting, a qualitative analysis of the empirical receptive\nfield is performed. Through our experiments, we demonstrate that the end-to-end\ntraining of a Transformer model using the Mean Squared Error (MSE) loss and\nSSCL emerges as the most effective approach in time series forecasting.\nNotably, the incorporation of the contrastive objective enables the model to\nprioritize more pertinent information for forecasting, such as scale and\nperiodic relationships. These findings contribute to a better understanding of\nthe benefits of SSCL in time series forecasting and provide valuable insights\nfor future research in this area. Our codes are available at\nhttps://github.com/chiyuzhang94/contrastive_learning_time-series_e2e.\n","authors":["Chiyu Zhang","Qi Yan","Lili Meng","Tristan Sylvain"],"pdf_url":"https://arxiv.org/pdf/2306.12086v2.pdf","comment":"Accepted at IJCAI'22 Workshop-AI4TS: AI for Time Series Analysis"},{"id":"http://arxiv.org/abs/2308.06851v1","updated":"2023-08-13T22:03:35Z","published":"2023-08-13T22:03:35Z","title":"Optimizing Offensive Gameplan in the National Basketball Association\n with Machine Learning","summary":" Throughout the analytical revolution that has occurred in the NBA, the\ndevelopment of specific metrics and formulas has given teams, coaches, and\nplayers a new way to see the game. However - the question arises - how can we\nverify any metrics? One method would simply be eyeball approximation (trying\nout many different gameplans) and/or trial and error - an estimation-based and\ncostly approach. Another approach is to try to model already existing metrics\nwith a unique set of features using machine learning techniques. The key to\nthis approach is that with these features that are selected, we can try to\ngauge the effectiveness of these features combined, rather than using\nindividual analysis in simple metric evaluation. If we have an accurate model,\nit can particularly help us determine the specifics of gameplan execution. In\nthis paper, the statistic ORTG (Offensive Rating, developed by Dean Oliver) was\nfound to have a correlation with different NBA playtypes using both a linear\nregression model and a neural network regression model, although ultimately, a\nneural network worked slightly better than linear regression. Using the\naccuracy of the models as a justification, the next step was to optimize the\noutput of the model with test examples, which would demonstrate the combination\nof features to best achieve a highly functioning offense.\n","authors":["Eamon Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2308.06851v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.06849v1","updated":"2023-08-13T21:42:31Z","published":"2023-08-13T21:42:31Z","title":"When Monte-Carlo Dropout Meets Multi-Exit: Optimizing Bayesian Neural\n Networks on FPGA","summary":" Bayesian Neural Networks (BayesNNs) have demonstrated their capability of\nproviding calibrated prediction for safety-critical applications such as\nmedical imaging and autonomous driving. However, the high algorithmic\ncomplexity and the poor hardware performance of BayesNNs hinder their\ndeployment in real-life applications. 
To bridge this gap, this paper proposes a\nnovel multi-exit Monte-Carlo Dropout (MCD)-based BayesNN that achieves\nwell-calibrated predictions with low algorithmic complexity. To further reduce\nthe barrier to adopting BayesNNs, we propose a transformation framework that\ncan generate FPGA-based accelerators for multi-exit MCD-based BayesNNs. Several\nnovel optimization techniques are introduced to improve hardware performance.\nOur experiments demonstrate that our auto-generated accelerator achieves higher\nenergy efficiency than CPU, GPU, and other state-of-the-art hardware\nimplementations.\n","authors":["Hongxiang Fan","Hao Chen","Liam Castelli","Zhiqiang Que","He Li","Kenneth Long","Wayne Luk"],"pdf_url":"https://arxiv.org/pdf/2308.06849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.05424v7","updated":"2023-08-13T21:33:26Z","published":"2022-08-08T16:54:01Z","title":"Hard-Constrained Deep Learning for Climate Downscaling","summary":" The availability of reliable, high-resolution climate and weather data is\nimportant to inform long-term decisions on climate adaptation and mitigation\nand to guide rapid responses to extreme events. Forecasting models are limited\nby computational costs and, therefore, often generate coarse-resolution\npredictions. Statistical downscaling, including super-resolution methods from\ndeep learning, can provide an efficient method of upsampling low-resolution\ndata. However, despite achieving visually compelling results in some cases,\nsuch models frequently violate conservation laws when predicting physical\nvariables. In order to conserve physical quantities, here we introduce methods\nthat guarantee statistical constraints are satisfied by a deep learning\ndownscaling model while also improving their performance according to\ntraditional metrics. We compare different constraining approaches and\ndemonstrate their applicability across different neural architectures as well\nas a variety of climate and weather datasets. Besides enabling faster and more\naccurate climate predictions through downscaling, we also show that our novel\nmethodologies can improve super-resolution for satellite data and standard\ndatasets.\n","authors":["Paula Harder","Alex Hernandez-Garcia","Venkatesh Ramesh","Qidong Yang","Prasanna Sattigeri","Daniela Szwarcman","Campbell Watson","David Rolnick"],"pdf_url":"https://arxiv.org/pdf/2208.05424v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08602v2","updated":"2023-08-13T20:36:46Z","published":"2023-07-13T21:51:29Z","title":"CaRT: Certified Safety and Robust Tracking in Learning-based Motion\n Planning for Multi-Agent Systems","summary":" The key innovation of our analytical method, CaRT, lies in establishing a new\nhierarchical, distributed architecture to guarantee the safety and robustness\nof a given learning-based motion planning policy. First, in a nominal setting,\nthe analytical form of our CaRT safety filter formally ensures safe maneuvers\nof nonlinear multi-agent systems, optimally with minimal deviation from the\nlearning-based policy. Second, in off-nominal settings, the analytical form of\nour CaRT robust filter optimally tracks the certified safe trajectory,\ngenerated by the previous layer in the hierarchy, the CaRT safety filter. We\nshow using contraction theory that CaRT guarantees safety and the exponential\nboundedness of the trajectory tracking error, even under the presence of\ndeterministic and stochastic disturbance. 
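A minimal Monte-Carlo Dropout sketch in PyTorch for the uncertainty-estimation idea above, assuming a plain feed-forward classifier: dropout is kept active at inference and several stochastic forward passes are averaged into a predictive distribution. The multi-exit structure and FPGA acceleration from the paper are not reproduced here.

```python
import torch
import torch.nn as nn


class MCDropoutNet(nn.Module):
    """Toy classifier with dropout layers used for Monte-Carlo sampling."""

    def __init__(self, in_dim: int = 16, hidden: int = 64, classes: int = 3, p: float = 0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden), nn.ReLU(), nn.Dropout(p),
            nn.Linear(hidden, hidden), nn.ReLU(), nn.Dropout(p),
            nn.Linear(hidden, classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)


@torch.no_grad()
def mc_dropout_predict(model: nn.Module, x: torch.Tensor, samples: int = 32):
    model.train()  # keep dropout stochastic at inference time
    probs = torch.stack([torch.softmax(model(x), dim=-1) for _ in range(samples)])
    mean = probs.mean(dim=0)  # predictive mean over MC samples
    entropy = -(mean * mean.clamp_min(1e-12).log()).sum(dim=-1)  # predictive uncertainty
    return mean, entropy


if __name__ == "__main__":
    model = MCDropoutNet()
    x = torch.randn(4, 16)
    mean, entropy = mc_dropout_predict(model, x)
    print(mean.shape, entropy)
```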
Also, the hierarchical nature of CaRT\nenables enhancing its robustness for safety just by its superior tracking to\nthe certified safe trajectory, thereby making it suitable for off-nominal\nscenarios with large disturbances. This is a major distinction from\nconventional safety function-driven approaches, where the robustness originates\nfrom the stability of a safe set, which could pull the system\nover-conservatively to the interior of the safe set. Our log-barrier\nformulation in CaRT allows for its distributed implementation in multi-agent\nsettings. We demonstrate the effectiveness of CaRT in several examples of\nnonlinear motion planning and control problems, including optimal,\nmulti-spacecraft reconfiguration.\n","authors":["Hiroyasu Tsukamoto","Benjamin Rivière","Changrak Choi","Amir Rahmani","Soon-Jo Chung"],"pdf_url":"https://arxiv.org/pdf/2307.08602v2.pdf","comment":"IEEE Conference on Decision and Control (CDC), Preprint Version,\n Accepted July, 2023"},{"id":"http://arxiv.org/abs/2211.03226v2","updated":"2023-08-13T19:58:52Z","published":"2022-11-06T22:05:27Z","title":"Rotation-equivariant Graph Neural Networks for Learning Glassy Liquids\n Representations","summary":" Within the glassy liquids community, the use of Machine Learning (ML) to\nmodel particles' static structure is currently a hot topic. The state of the\nart consists in Graph Neural Networks (GNNs), which have a great expressive\npower but are heavy models with numerous parameters and lack interpretability.\nInspired by recent advances in the field of Machine Learning group-equivariant\nrepresentations, we build a GNN that learns a robust representation of the\nglass' static structure by constraining it to preserve the roto-translation\n(SE(3)) equivariance. We show that this constraint not only significantly\nimproves the predictive power but also improves the ability to generalize to\nunseen temperatures while allowing to reduce the number of parameters.\nFurthermore, interpretability is improved, as we can relate the action of our\nbasic convolution layer to well-known rotation-invariant expert features.\nThrough transfer-learning experiments we demonstrate that our network learns a\nrobust representation, which allows us to push forward the idea of a learned\nglass structural order parameter.\n","authors":["Francesco Saverio Pezzicoli","Guillaume Charpiat","François P. Landes"],"pdf_url":"https://arxiv.org/pdf/2211.03226v2.pdf","comment":"15 pages, 9 figures plus references and appendix"},{"id":"http://arxiv.org/abs/2308.06838v1","updated":"2023-08-13T19:45:20Z","published":"2023-08-13T19:45:20Z","title":"Generalizing Topological Graph Neural Networks with Paths","summary":" While Graph Neural Networks (GNNs) have made significant strides in diverse\nareas, they are hindered by a theoretical constraint known as the\n1-Weisfeiler-Lehmann test. Even though latest advancements in higher-order GNNs\ncan overcome this boundary, they typically center around certain graph\ncomponents like cliques or cycles. However, our investigation goes a different\nroute. We put emphasis on paths, which are inherent in every graph. We are able\nto construct a more general topological perspective and form a bridge to\ncertain established theories about other topological domains. 
Interestingly,\nwithout any assumptions on graph sub-structures, our approach surpasses earlier\ntechniques in this field, achieving state-of-the-art performance on several\nbenchmarks.\n","authors":["Quang Truong","Peter Chin"],"pdf_url":"https://arxiv.org/pdf/2308.06838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.03821v2","updated":"2023-08-13T18:27:52Z","published":"2022-10-07T21:18:22Z","title":"Large Language Models can Implement Policy Iteration","summary":" This work presents In-Context Policy Iteration, an algorithm for performing\nReinforcement Learning (RL), in-context, using foundation models. While the\napplication of foundation models to RL has received considerable attention,\nmost approaches rely on either (1) the curation of expert demonstrations\n(either through manual design or task-specific pretraining) or (2) adaptation\nto the task of interest using gradient methods (either fine-tuning or training\nof adapter layers). Both of these techniques have drawbacks. Collecting\ndemonstrations is labor-intensive, and algorithms that rely on them do not\noutperform the experts from which the demonstrations were derived. All gradient\ntechniques are inherently slow, sacrificing the \"few-shot\" quality that made\nin-context learning attractive to begin with. In this work, we present an\nalgorithm, ICPI, that learns to perform RL tasks without expert demonstrations\nor gradients. Instead we present a policy-iteration method in which the prompt\ncontent is the entire locus of learning. ICPI iteratively updates the contents\nof the prompt from which it derives its policy through trial-and-error\ninteraction with an RL environment. In order to eliminate the role of\nin-weights learning (on which approaches like Decision Transformer rely\nheavily), we demonstrate our algorithm using Codex, a language model with no\nprior knowledge of the domains on which we evaluate it.\n","authors":["Ethan Brooks","Logan Walls","Richard L. Lewis","Satinder Singh"],"pdf_url":"https://arxiv.org/pdf/2210.03821v2.pdf","comment":"10 pages, 4 figures, submitted to ICLR 2023"},{"id":"http://arxiv.org/abs/2308.06828v1","updated":"2023-08-13T18:14:10Z","published":"2023-08-13T18:14:10Z","title":"An Ensemble Approach to Question Classification: Integrating Electra\n Transformer, GloVe, and LSTM","summary":" This paper introduces a novel ensemble approach for question classification\nusing state-of-the-art models -- Electra, GloVe, and LSTM. The proposed model\nis trained and evaluated on the TREC dataset, a well-established benchmark for\nquestion classification tasks. The ensemble model combines the strengths of\nElectra, a transformer-based model for language understanding, GloVe, a global\nvectors for word representation, and LSTM, a recurrent neural network variant,\nproviding a robust and efficient solution for question classification.\nExtensive experiments were carried out to compare the performance of the\nproposed ensemble approach with other cutting-edge models, such as BERT,\nRoBERTa, and DistilBERT. Our results demonstrate that the ensemble model\noutperforms these models across all evaluation metrics, achieving an accuracy\nof 0.8 on the test set. 
These findings underscore the effectiveness of the\nensemble approach in enhancing the performance of question classification\ntasks, and invite further exploration of ensemble methods in natural language\nprocessing.\n","authors":["Sanad Aburass","Osama Dorgham"],"pdf_url":"https://arxiv.org/pdf/2308.06828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06827v1","updated":"2023-08-13T18:12:28Z","published":"2023-08-13T18:12:28Z","title":"Reinforcement Graph Clustering with Unknown Cluster Number","summary":" Deep graph clustering, which aims to group nodes into disjoint clusters by\nneural networks in an unsupervised manner, has attracted great attention in\nrecent years. Although the performance has been largely improved, the excellent\nperformance of the existing methods heavily relies on an accurately predefined\ncluster number, which is not always available in the real-world scenario. To\nenable the deep graph clustering algorithms to work without the guidance of the\npredefined cluster number, we propose a new deep graph clustering method termed\nReinforcement Graph Clustering (RGC). In our proposed method, cluster number\ndetermination and unsupervised representation learning are unified into a\nuniform framework by the reinforcement learning mechanism. Concretely, the\ndiscriminative node representations are first learned with the contrastive\npretext task. Then, to capture the clustering state accurately with both local\nand global information in the graph, both node and cluster states are\nconsidered. Subsequently, at each state, the qualities of different cluster\nnumbers are evaluated by the quality network, and the greedy action is executed\nto determine the cluster number. In order to conduct feedback actions, the\nclustering-oriented reward function is proposed to enhance the cohesion of the\nsame clusters and separate the different clusters. Extensive experiments\ndemonstrate the effectiveness and efficiency of our proposed method. The source\ncode of RGC is shared at https://github.com/yueliu1999/RGC and a collection\n(papers, codes and, datasets) of deep graph clustering is shared at\nhttps://github.com/yueliu1999/Awesome-Deep-Graph-Clustering on Github.\n","authors":["Yue Liu","Ke Liang","Jun Xia","Xihong Yang","Sihang Zhou","Meng Liu","Xinwang Liu","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2308.06827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.08106v2","updated":"2023-08-13T18:07:47Z","published":"2022-10-14T21:02:04Z","title":"A Primal-Dual Algorithm for Hybrid Federated Learning","summary":" Very few methods for hybrid federated learning, where clients only hold\nsubsets of both features and samples, exist. Yet, this scenario is very\nimportant in practical settings. We provide a fast, robust algorithm for hybrid\nfederated learning that hinges on Fenchel Duality. We prove the convergence of\nthe algorithm to the same solution as if the model was trained centrally in a\nvariety of practical regimes. Furthermore, we provide experimental results that\ndemonstrate the performance improvements of the algorithm over a commonly used\nmethod in federated learning, FedAvg. 
We also provide privacy considerations\nand necessary steps to protect client data.\n","authors":["Tom Overman","Garrett Blum","Diego Klabjan"],"pdf_url":"https://arxiv.org/pdf/2210.08106v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.10945v2","updated":"2023-08-13T18:00:10Z","published":"2022-01-26T14:01:32Z","title":"On the Power of Gradual Network Alignment Using Dual-Perception\n Similarities","summary":" Network alignment (NA) is the task of finding the correspondence of nodes\nbetween two networks based on the network structure and node attributes. Our\nstudy is motivated by the fact that, since most of existing NA methods have\nattempted to discover all node pairs at once, they do not harness information\nenriched through interim discovery of node correspondences to more accurately\nfind the next correspondences during the node matching. To tackle this\nchallenge, we propose Grad-Align, a new NA method that gradually discovers node\npairs by making full use of node pairs exhibiting strong consistency, which are\neasy to be discovered in the early stage of gradual matching. Specifically,\nGrad-Align first generates node embeddings of the two networks based on graph\nneural networks along with our layer-wise reconstruction loss, a loss built\nupon capturing the first-order and higher-order neighborhood structures. Then,\nnodes are gradually aligned by computing dual-perception similarity measures\nincluding the multi-layer embedding similarity as well as the Tversky\nsimilarity, an asymmetric set similarity using the Tversky index applicable to\nnetworks with different scales. Additionally, we incorporate an edge\naugmentation module into Grad-Align to reinforce the structural consistency.\nThrough comprehensive experiments using real-world and synthetic datasets, we\nempirically demonstrate that Grad-Align consistently outperforms\nstate-of-the-art NA methods.\n","authors":["Jin-Duk Park","Cong Tran","Won-Yong Shin","Xin Cao"],"pdf_url":"https://arxiv.org/pdf/2201.10945v2.pdf","comment":"16 pages, 11 figures, 4 tables; 13 pages, to appear in the IEEE\n Transactions on Pattern Analysis and Machine Intelligence (Please cite our\n journal version that will appear in an upcoming issue.)"},{"id":"http://arxiv.org/abs/2308.06822v1","updated":"2023-08-13T17:40:56Z","published":"2023-08-13T17:40:56Z","title":"Approximate and Weighted Data Reconstruction Attack in Federated\n Learning","summary":" Federated Learning (FL) is a distributed learning paradigm that enables\nmultiple clients to collaborate on building a machine learning model without\nsharing their private data. Although FL is considered privacy-preserved by\ndesign, recent data reconstruction attacks demonstrate that an attacker can\nrecover clients' training data based on the parameters shared in FL. However,\nmost existing methods fail to attack the most widely used horizontal Federated\nAveraging (FedAvg) scenario, where clients share model parameters after\nmultiple local training steps. To tackle this issue, we propose an\ninterpolation-based approximation method, which makes attacking FedAvg\nscenarios feasible by generating the intermediate model updates of the clients'\nlocal training processes. Then, we design a layer-wise weighted loss function\nto improve the data quality of reconstruction. We assign different weights to\nmodel updates in different layers concerning the neural network structure, with\nthe weights tuned by Bayesian optimization. 
Finally, experimental results\nvalidate the superiority of our proposed approximate and weighted attack (AWA)\nmethod over the other state-of-the-art methods, as demonstrated by the\nsubstantial improvement in different evaluation metrics for image data\nreconstructions.\n","authors":["Ziqi Wang","Yongcun Song","Enrique Zuazua"],"pdf_url":"https://arxiv.org/pdf/2308.06822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06819v1","updated":"2023-08-13T17:23:36Z","published":"2023-08-13T17:23:36Z","title":"SoK: Realistic Adversarial Attacks and Defenses for Intelligent Network\n Intrusion Detection","summary":" Machine Learning (ML) can be incredibly valuable to automate anomaly\ndetection and cyber-attack classification, improving the way that Network\nIntrusion Detection (NID) is performed. However, despite the benefits of ML\nmodels, they are highly susceptible to adversarial cyber-attack examples\nspecifically crafted to exploit them. A wide range of adversarial attacks have\nbeen created and researchers have worked on various defense strategies to\nsafeguard ML models, but most were not intended for the specific constraints of\na communication network and its communication protocols, so they may lead to\nunrealistic examples in the NID domain. This Systematization of Knowledge (SoK)\nconsolidates and summarizes the state-of-the-art adversarial learning\napproaches that can generate realistic examples and could be used in real ML\ndevelopment and deployment scenarios with real network traffic flows. This SoK\nalso describes the open challenges regarding the use of adversarial ML in the\nNID domain, defines the fundamental properties that are required for an\nadversarial example to be realistic, and provides guidelines for researchers to\nensure that their future experiments are adequate for a real communication\nnetwork.\n","authors":["João Vitorino","Isabel Praça","Eva Maia"],"pdf_url":"https://arxiv.org/pdf/2308.06819v1.pdf","comment":"31 pages, 3 tables, 6 figures, Computers and Security journal"},{"id":"http://arxiv.org/abs/2306.02913v4","updated":"2023-08-13T16:09:33Z","published":"2023-06-05T14:19:52Z","title":"Decentralized SGD and Average-direction SAM are Asymptotically\n Equivalent","summary":" Decentralized stochastic gradient descent (D-SGD) allows collaborative\nlearning on massive devices simultaneously without the control of a central\nserver. However, existing theories claim that decentralization invariably\nundermines generalization. In this paper, we challenge the conventional belief\nand present a completely new perspective for understanding decentralized\nlearning. We prove that D-SGD implicitly minimizes the loss function of an\naverage-direction Sharpness-aware minimization (SAM) algorithm under general\nnon-convex non-$\\beta$-smooth settings. 
This surprising asymptotic equivalence\nreveals an intrinsic regularization-optimization trade-off and three advantages\nof decentralization: (1) there exists a free uncertainty evaluation mechanism\nin D-SGD to improve posterior estimation; (2) D-SGD exhibits a gradient\nsmoothing effect; and (3) the sharpness regularization effect of D-SGD does not\ndecrease as total batch size increases, which justifies the potential\ngeneralization benefit of D-SGD over centralized SGD (C-SGD) in large-batch\nscenarios.\n","authors":["Tongtian Zhu","Fengxiang He","Kaixuan Chen","Mingli Song","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2306.02913v4.pdf","comment":"Accepted for publication in the 40th International Conference on\n Machine Learning (ICML 2023)"},{"id":"http://arxiv.org/abs/2308.06801v1","updated":"2023-08-13T16:04:03Z","published":"2023-08-13T16:04:03Z","title":"SAILOR: Structural Augmentation Based Tail Node Representation Learning","summary":" Graph Neural Networks (GNNs) have achieved state-of-the-art performance in\nrepresentation learning for graphs recently. However, the effectiveness of\nGNNs, which capitalize on the key operation of message propagation, highly\ndepends on the quality of the topology structure. Most of the graphs in\nreal-world scenarios follow a long-tailed distribution on their node degrees,\nthat is, a vast majority of the nodes in the graph are tail nodes with only a\nfew connected edges. GNNs produce inferior node representations for tail nodes\nsince they lack structural information. In the pursuit of promoting the\nexpressiveness of GNNs for tail nodes, we explore how the deficiency of\nstructural information deteriorates the performance of tail nodes and propose a\ngeneral Structural Augmentation based taIL nOde Representation learning\nframework, dubbed as SAILOR, which can jointly learn to augment the graph\nstructure and extract more informative representations for tail nodes.\nExtensive experiments on public benchmark datasets demonstrate that SAILOR can\nsignificantly improve the tail node representations and outperform the\nstate-of-the-art baselines.\n","authors":["Jie Liao","Jintang Li","Liang Chen","Bingzhe Wu","Yatao Bian","Zibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.06801v1.pdf","comment":"Accepted by CIKM 2023; Code is available at\n https://github.com/Jie-Re/SAILO"},{"id":"http://arxiv.org/abs/2308.06795v1","updated":"2023-08-13T15:44:39Z","published":"2023-08-13T15:44:39Z","title":"Faithful to Whom? Questioning Interpretability Measures in NLP","summary":" A common approach to quantifying model interpretability is to calculate\nfaithfulness metrics based on iteratively masking input tokens and measuring\nhow much the predicted label changes as a result. However, we show that such\nmetrics are generally not suitable for comparing the interpretability of\ndifferent neural text classifiers as the response to masked inputs is highly\nmodel-specific. We demonstrate that iterative masking can produce large\nvariation in faithfulness scores between comparable models, and show that\nmasked samples are frequently outside the distribution seen during training. We\nfurther investigate the impact of adversarial attacks and adversarial training\non faithfulness scores, and demonstrate the relevance of faithfulness measures\nfor analyzing feature salience in text adversarial attacks. 
Our findings\nprovide new insights into the limitations of current faithfulness metrics and\nkey considerations to utilize them appropriately.\n","authors":["Evan Crothers","Herna Viktor","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2308.06795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11077v2","updated":"2023-08-13T15:23:43Z","published":"2023-07-20T17:55:14Z","title":"AlignDet: Aligning Pre-training and Fine-tuning in Object Detection","summary":" The paradigm of large-scale pre-training followed by downstream fine-tuning\nhas been widely employed in various object detection algorithms. In this paper,\nwe reveal discrepancies in data, model, and task between the pre-training and\nfine-tuning procedure in existing practices, which implicitly limit the\ndetector's performance, generalization ability, and convergence speed. To this\nend, we propose AlignDet, a unified pre-training framework that can be adapted\nto various existing detectors to alleviate the discrepancies. AlignDet\ndecouples the pre-training process into two stages, i.e., image-domain and\nbox-domain pre-training. The image-domain pre-training optimizes the detection\nbackbone to capture holistic visual abstraction, and box-domain pre-training\nlearns instance-level semantics and task-aware concepts to initialize the parts\nout of the backbone. By incorporating the self-supervised pre-trained\nbackbones, we can pre-train all modules for various detectors in an\nunsupervised paradigm. As depicted in Figure 1, extensive experiments\ndemonstrate that AlignDet can achieve significant improvements across diverse\nprotocols, such as detection algorithm, model backbone, data setting, and\ntraining schedule. For example, AlignDet improves FCOS by 5.3 mAP, RetinaNet by\n2.1 mAP, Faster R-CNN by 3.3 mAP, and DETR by 2.3 mAP under fewer epochs.\n","authors":["Ming Li","Jie Wu","Xionghui Wang","Chen Chen","Jie Qin","Xuefeng Xiao","Rui Wang","Min Zheng","Xin Pan"],"pdf_url":"https://arxiv.org/pdf/2307.11077v2.pdf","comment":"Camera Ready Version on ICCV 2023. Code and Models are publicly\n available. Project Page: https://liming-ai.github.io/AlignDet"},{"id":"http://arxiv.org/abs/2201.04604v4","updated":"2023-08-13T14:29:00Z","published":"2022-01-12T18:00:29Z","title":"Fine-grained Graph Learning for Multi-view Subspace Clustering","summary":" Multi-view subspace clustering (MSC) is a popular unsupervised method by\nintegrating heterogeneous information to reveal the intrinsic clustering\nstructure hidden across views. Usually, MSC methods use graphs (or affinity\nmatrices) fusion to learn a common structure, and further apply graph-based\napproaches to clustering. Despite progress, most of the methods do not\nestablish the connection between graph learning and clustering. Meanwhile,\nconventional graph fusion strategies assign coarse-grained weights to combine\nmulti-graph, ignoring the importance of local structure. In this paper, we\npropose a fine-grained graph learning framework for multi-view subspace\nclustering (FGL-MSC) to address these issues. To utilize the multi-view\ninformation sufficiently, we design a specific graph learning method by\nintroducing graph regularization and a local structure fusion pattern. The main\nchallenge is how to optimize the fine-grained fusion weights while generating\nthe learned graph that fits the clustering task, thus making the clustering\nrepresentation meaningful and competitive. 
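A rough sketch of the iterative token-masking procedure behind the faithfulness metrics questioned in "Faithful to Whom?" above. The `predict_proba` callable and `[MASK]` token are hypothetical stand-ins for any text classifier, not the paper's evaluation code.

```python
from typing import Callable, List


def masking_faithfulness(tokens: List[str],
                         predict_proba: Callable[[List[str]], float],
                         mask_token: str = "[MASK]") -> List[float]:
    """Drop in the original prediction's probability when each token is masked in turn."""
    base = predict_proba(tokens)
    drops = []
    for i in range(len(tokens)):
        masked = tokens[:i] + [mask_token] + tokens[i + 1:]
        drops.append(base - predict_proba(masked))
    return drops


if __name__ == "__main__":
    # Toy classifier: probability grows with the number of "good" tokens present.
    def toy_proba(tokens: List[str]) -> float:
        return min(1.0, 0.2 + 0.4 * tokens.count("good"))

    sentence = "this movie is good good".split()
    print(masking_faithfulness(sentence, toy_proba))
```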
Accordingly, an iterative algorithm\nis proposed to solve the above joint optimization problem, which obtains the\nlearned graph, the clustering representation, and the fusion weights\nsimultaneously. Extensive experiments on eight real-world datasets show that\nthe proposed framework has comparable performance to the state-of-the-art\nmethods. The source code of the proposed method is available at\nhttps://github.com/siriuslay/FGL-MSC.\n","authors":["Yidi Wang","Xiaobing Pei","Haoxi Zhan"],"pdf_url":"https://arxiv.org/pdf/2201.04604v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06780v1","updated":"2023-08-13T14:25:54Z","published":"2023-08-13T14:25:54Z","title":"Neural Networks at a Fraction with Pruned Quaternions","summary":" Contemporary state-of-the-art neural networks have increasingly large numbers\nof parameters, which prevents their deployment on devices with limited\ncomputational power. Pruning is one technique to remove unnecessary weights and\nreduce resource requirements for training and inference. In addition, for ML\ntasks where the input data is multi-dimensional, using higher-dimensional data\nembeddings such as complex numbers or quaternions has been shown to reduce the\nparameter count while maintaining accuracy. In this work, we conduct pruning on\nreal and quaternion-valued implementations of different architectures on\nclassification tasks. We find that for some architectures, at very high\nsparsity levels, quaternion models provide higher accuracies than their real\ncounterparts. For example, at the task of image classification on CIFAR-10\nusing Conv-4, at $3\\%$ of the number of parameters as the original model, the\npruned quaternion version outperforms the pruned real by more than $10\\%$.\nExperiments on various network architectures and datasets show that for\ndeployment in extremely resource-constrained environments, a sparse quaternion\nnetwork might be a better candidate than a real sparse model of similar\narchitecture.\n","authors":["Sahel Mohammad Iqbal","Subhankar Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.06780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06767v1","updated":"2023-08-13T13:34:04Z","published":"2023-08-13T13:34:04Z","title":"A Survey on Deep Neural Network Pruning-Taxonomy, Comparison, Analysis,\n and Recommendations","summary":" Modern deep neural networks, particularly recent large language models, come\nwith massive model sizes that require significant computational and storage\nresources. To enable the deployment of modern models on resource-constrained\nenvironments and accelerate inference time, researchers have increasingly\nexplored pruning techniques as a popular research direction in neural network\ncompression. However, there is a dearth of up-to-date comprehensive review\npapers on pruning. To address this issue, in this survey, we provide a\ncomprehensive review of existing research works on deep neural network pruning\nin a taxonomy of 1) universal/specific speedup, 2) when to prune, 3) how to\nprune, and 4) fusion of pruning and other compression techniques. We then\nprovide a thorough comparative analysis of seven pairs of contrast settings for\npruning (e.g., unstructured/structured) and explore emerging topics, including\npost-training pruning, different levels of supervision for pruning, and broader\napplications (e.g., adversarial robustness) to shed light on the commonalities\nand differences of existing methods and lay the foundation for further method\ndevelopment. 
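A small unstructured magnitude-pruning sketch of the kind surveyed above, using PyTorch's built-in pruning utilities on a toy linear layer; the 90% sparsity level and the layer itself are illustrative choices, not taken from any of the papers.

```python
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune

layer = nn.Linear(128, 64)
prune.l1_unstructured(layer, name="weight", amount=0.9)  # zero out the 90% smallest-magnitude weights

sparsity = float((layer.weight == 0).float().mean())
print(f"weight sparsity after pruning: {sparsity:.2%}")

prune.remove(layer, "weight")  # make the pruning permanent by baking the mask into the tensor
```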
To facilitate future research, we build a curated collection of\ndatasets, networks, and evaluations on different applications. Finally, we\nprovide some valuable recommendations on selecting pruning methods and prospect\npromising research directions. We build a repository at\nhttps://github.com/hrcheng1066/awesome-pruning.\n","authors":["Hongrong Cheng","Miao Zhang","Javen Qinfeng Shi"],"pdf_url":"https://arxiv.org/pdf/2308.06767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06764v1","updated":"2023-08-13T13:01:21Z","published":"2023-08-13T13:01:21Z","title":"Few-shot Class-incremental Learning: A Survey","summary":" Few-shot Class-Incremental Learning (FSCIL) presents a unique challenge in\nmachine learning, as it necessitates the continuous learning of new classes\nfrom sparse labeled training samples without forgetting previous knowledge.\nWhile this field has seen recent progress, it remains an active area of\nexploration. This paper aims to provide a comprehensive and systematic review\nof FSCIL. In our in-depth examination, we delve into various facets of FSCIL,\nencompassing the problem definition, the discussion of primary challenges of\nunreliable empirical risk minimization and the stability-plasticity dilemma,\ngeneral schemes, and relevant problems of incremental learning and few-shot\nlearning. Besides, we offer an overview of benchmark datasets and evaluation\nmetrics. Furthermore, we introduce the classification methods in FSCIL from\ndata-based, structure-based, and optimization-based approaches and the object\ndetection methods in FSCIL from anchor-free and anchor-based approaches. Beyond\nthese, we illuminate several promising research directions within FSCIL that\nmerit further investigation.\n","authors":["Jinghua Zhang","Li Liu","Olli Silven","Matti Pietikäinen","Dewen Hu"],"pdf_url":"https://arxiv.org/pdf/2308.06764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06763v1","updated":"2023-08-13T13:00:50Z","published":"2023-08-13T13:00:50Z","title":"Discovering the Symptom Patterns of COVID-19 from Recovered and Deceased\n Patients Using Apriori Association Rule Mining","summary":" The COVID-19 pandemic has a devastating impact globally, claiming millions of\nlives and causing significant social and economic disruptions. In order to\noptimize decision-making and allocate limited resources, it is essential to\nidentify COVID-19 symptoms and determine the severity of each case. Machine\nlearning algorithms offer a potent tool in the medical field, particularly in\nmining clinical datasets for useful information and guiding scientific\ndecisions. Association rule mining is a machine learning technique for\nextracting hidden patterns from data. This paper presents an application of\nassociation rule mining based Apriori algorithm to discover symptom patterns\nfrom COVID-19 patients. The study, using 2875 records of patient, identified\nthe most common symptoms as apnea (72%), cough (64%), fever (59%), weakness\n(18%), myalgia (14.5%), and sore throat (12%). 
The proposed method provides\nclinicians with valuable insight into disease that can assist them in managing\nand treating it effectively.\n","authors":["Mohammad Dehghani","Zahra Yazdanparast","Mobin Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2308.06763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02918v2","updated":"2023-08-13T13:00:00Z","published":"2023-08-05T16:31:32Z","title":"Spectral Ranking Inferences based on General Multiway Comparisons","summary":" This paper studies the performance of the spectral method in the estimation\nand uncertainty quantification of the unobserved preference scores of compared\nentities in a very general and more realistic setup in which the comparison\ngraph consists of hyper-edges of possible heterogeneous sizes and the number of\ncomparisons can be as low as one for a given hyper-edge. Such a setting is\npervasive in real applications, circumventing the need to specify the graph\nrandomness and the restrictive homogeneous sampling assumption imposed in the\ncommonly-used Bradley-Terry-Luce (BTL) or Plackett-Luce (PL) models.\nFurthermore, in the scenarios when the BTL or PL models are appropriate, we\nunravel the relationship between the spectral estimator and the Maximum\nLikelihood Estimator (MLE). We discover that a two-step spectral method, where\nwe apply the optimal weighting estimated from the equal weighting vanilla\nspectral method, can achieve the same asymptotic efficiency as the MLE. Given\nthe asymptotic distributions of the estimated preference scores, we also\nintroduce a comprehensive framework to carry out both one-sample and two-sample\nranking inferences, applicable to both fixed and random graph settings. It is\nnoteworthy that it is the first time effective two-sample rank testing methods\nare proposed. Finally, we substantiate our findings via comprehensive numerical\nsimulations and subsequently apply our developed methodologies to perform\nstatistical inferences on statistics journals and movie rankings.\n","authors":["Jianqing Fan","Zhipeng Lou","Weichen Wang","Mengxin Yu"],"pdf_url":"https://arxiv.org/pdf/2308.02918v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.06781v2","updated":"2023-08-13T12:59:43Z","published":"2021-11-12T15:47:10Z","title":"Q-Learning for MDPs with General Spaces: Convergence and Near Optimality\n via Quantization under Weak Continuity","summary":" Reinforcement learning algorithms often require finiteness of state and\naction spaces in Markov decision processes (MDPs) (also called controlled\nMarkov chains) and various efforts have been made in the literature towards the\napplicability of such algorithms for continuous state and action spaces. In\nthis paper, we show that under very mild regularity conditions (in particular,\ninvolving only weak continuity of the transition kernel of an MDP), Q-learning\nfor standard Borel MDPs via quantization of states and actions (called\nQuantized Q-Learning) converges to a limit, and furthermore this limit\nsatisfies an optimality equation which leads to near optimality with either\nexplicit performance bounds or which are guaranteed to be asymptotically\noptimal. 
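In the spirit of the symptom-pattern mining above, a brute-force frequent-itemset sketch over toy symptom records (a full Apriori implementation would additionally prune candidate itemsets using the anti-monotonicity of support). The records and the support threshold are made-up values, not the study's data.

```python
from collections import Counter
from itertools import combinations

records = [
    {"apnea", "cough", "fever"},
    {"apnea", "cough"},
    {"fever", "weakness"},
    {"apnea", "fever", "cough"},
    {"cough", "sore throat"},
]
min_support = 0.4


def frequent_itemsets(transactions, min_support, max_size=3):
    """Return itemsets (up to max_size) whose support meets the threshold."""
    n = len(transactions)
    frequent = {}
    for size in range(1, max_size + 1):
        counts = Counter()
        for t in transactions:
            for combo in combinations(sorted(t), size):
                counts[combo] += 1
        level = {c: k / n for c, k in counts.items() if k / n >= min_support}
        if not level:
            break
        frequent.update(level)
    return frequent


for itemset, support in sorted(frequent_itemsets(records, min_support).items(),
                               key=lambda kv: -kv[1]):
    print(itemset, f"support={support:.2f}")
```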
Our approach builds on (i) viewing quantization as a measurement\nkernel and thus a quantized MDP as a partially observed Markov decision process\n(POMDP), (ii) utilizing near optimality and convergence results of Q-learning\nfor POMDPs, and (iii) finally, near-optimality of finite state model\napproximations for MDPs with weakly continuous kernels which we show to\ncorrespond to the fixed point of the constructed POMDP. Thus, our paper\npresents a very general convergence and approximation result for the\napplicability of Q-learning for continuous MDPs.\n","authors":["Ali Devran Kara","Naci Saldi","Serdar Yüksel"],"pdf_url":"https://arxiv.org/pdf/2111.06781v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03308v2","updated":"2023-08-13T12:30:29Z","published":"2023-05-05T06:17:57Z","title":"Tiny-PPG: A Lightweight Deep Neural Network for Real-Time Detection of\n Motion Artifacts in Photoplethysmogram Signals on Edge Devices","summary":" Photoplethysmogram (PPG) signals are easily contaminated by motion artifacts\nin real-world settings, despite their widespread use in Internet-of-Things\n(IoT) based wearable and smart health devices for cardiovascular health\nmonitoring. This study proposed a lightweight deep neural network, called\nTiny-PPG, for accurate and real-time PPG artifact segmentation on IoT edge\ndevices. The model was trained and tested on a public dataset, PPG DaLiA, which\nfeatured complex artifacts with diverse lengths and morphologies during various\ndaily activities of 15 subjects using a watch-type device (Empatica E4). The\nmodel structure, training method and loss function were specifically designed\nto balance detection accuracy and speed for real-time PPG artifact detection in\nresource-constrained embedded devices. To optimize the model size and\ncapability in multi-scale feature representation, the model employed depth-wise\nseparable convolution and atrous spatial pyramid pooling modules, respectively.\nAdditionally, the contrastive loss was also utilized to further optimize the\nfeature embeddings. With additional model pruning, Tiny-PPG achieved\nstate-of-the-art detection accuracy of 87.4% while only having 19,726 model\nparameters (0.15 megabytes), and was successfully deployed on an STM32 embedded\nsystem for real-time PPG artifact detection. Therefore, this study provides an\neffective solution for resource-constraint IoT smart health devices in PPG\nartifact detection.\n","authors":["Yali Zheng","Chen Wu","Peizheng Cai","Zhiqiang Zhong","Hongda Huang","Yuqi Jiang"],"pdf_url":"https://arxiv.org/pdf/2305.03308v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14104v3","updated":"2023-08-13T12:17:51Z","published":"2023-04-27T11:32:48Z","title":"Learning Human-Human Interactions in Images from Weak Textual\n Supervision","summary":" Interactions between humans are diverse and context-dependent, but previous\nworks have treated them as categorical, disregarding the heavy tail of possible\ninteractions. We propose a new paradigm of learning human-human interactions as\nfree text from a single still image, allowing for flexibility in modeling the\nunlimited space of situations and relationships between people. To overcome the\nabsence of data labelled specifically for this task, we use knowledge\ndistillation applied to synthetic caption data produced by a large language\nmodel without explicit supervision. 
We show that the pseudo-labels produced by\nthis procedure can be used to train a captioning model to effectively\nunderstand human-human interactions in images, as measured by a variety of\nmetrics that measure textual and semantic faithfulness and factual groundedness\nof our predictions. We further show that our approach outperforms SOTA image\ncaptioning and situation recognition models on this task. We will release our\ncode and pseudo-labels along with Waldo and Wenda, a manually-curated test set\nfor still image human-human interaction understanding.\n","authors":["Morris Alper","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2304.14104v3.pdf","comment":"To be presented at ICCV 2023. Project webpage:\n https://learning-interactions.github.io"},{"id":"http://arxiv.org/abs/2304.08842v2","updated":"2023-08-13T11:31:34Z","published":"2023-04-18T09:13:52Z","title":"UDTIRI: An Open-Source Intelligent Road Inspection Benchmark Suite","summary":" It is seen that there is enormous potential to leverage powerful deep\nlearning methods in the emerging field of urban digital twins. It is\nparticularly in the area of intelligent road inspection where there is\ncurrently limited research and data available. To facilitate progress in this\nfield, we have developed a well-labeled road pothole dataset named Urban\nDigital Twins Intelligent Road Inspection (UDTIRI) dataset. We hope this\ndataset will enable the use of powerful deep learning methods in urban road\ninspection, providing algorithms with a more comprehensive understanding of the\nscene and maximizing their potential. Our dataset comprises 1000 images of\npotholes, captured in various scenarios with different lighting and humidity\nconditions. Our intention is to employ this dataset for object detection,\nsemantic segmentation, and instance segmentation tasks. Our team has devoted\nsignificant effort to conducting a detailed statistical analysis, and\nbenchmarking a selection of representative algorithms from recent years. We\nalso provide a multi-task platform for researchers to fully exploit the\nperformance of various algorithms with the support of UDTIRI dataset.\n","authors":["Sicen Guo","Jiahang Li","Shuai Su","Yi Feng","Dacheng Zhou","Chen Chen","Denghuang Zhang","Xingyi Zhu","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2304.08842v2.pdf","comment":"Database webpage: https://www.udtiri.com/, Kaggle webpage:\n https://www.kaggle.com/datasets/jiahangli617/udtiri"},{"id":"http://arxiv.org/abs/2308.06741v1","updated":"2023-08-13T10:18:10Z","published":"2023-08-13T10:18:10Z","title":"Heterogeneous Multi-Agent Reinforcement Learning via Mirror Descent\n Policy Optimization","summary":" This paper presents an extension of the Mirror Descent method to overcome\nchallenges in cooperative Multi-Agent Reinforcement Learning (MARL) settings,\nwhere agents have varying abilities and individual policies. The proposed\nHeterogeneous-Agent Mirror Descent Policy Optimization (HAMDPO) algorithm\nutilizes the multi-agent advantage decomposition lemma to enable efficient\npolicy updates for each agent while ensuring overall performance improvements.\nBy iteratively updating agent policies through an approximate solution of the\ntrust-region problem, HAMDPO guarantees stability and improves performance.\nMoreover, the HAMDPO algorithm is capable of handling both continuous and\ndiscrete action spaces for heterogeneous agents in various MARL problems. 
We\nevaluate HAMDPO on Multi-Agent MuJoCo and StarCraftII tasks, demonstrating its\nsuperiority over state-of-the-art algorithms such as HATRPO and HAPPO. These\nresults suggest that HAMDPO is a promising approach for solving cooperative\nMARL problems and could potentially be extended to address other challenging\nproblems in the field of MARL.\n","authors":["Mohammad Mehdi Nasiri","Mansoor Rezghi"],"pdf_url":"https://arxiv.org/pdf/2308.06741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02617v2","updated":"2023-08-13T10:11:09Z","published":"2023-06-05T06:31:14Z","title":"Permutation Decision Trees","summary":" Decision Tree is a well understood Machine Learning model that is based on\nminimizing impurities in the internal nodes. The most common impurity measures\nare Shannon entropy and Gini impurity. These impurity measures are insensitive\nto the order of training data and hence the final tree obtained is invariant to\nany permutation of the data. This leads to a serious limitation in modeling\ndata instances that have order dependencies. In this work, we propose the use\nof Effort-To-Compress (ETC) - a complexity measure, for the first time, as an\nimpurity measure. Unlike Shannon entropy and Gini impurity, structural impurity\nbased on ETC is able to capture order dependencies in the data, thus obtaining\npotentially different decision trees for different permutations of the same\ndata instances (Permutation Decision Trees). We then introduce the notion of\nPermutation Bagging achieved using permutation decision trees without the need\nfor random feature selection and sub-sampling. We compare the performance of\nthe proposed permutation bagged decision trees with Random Forests. Our model\ndoes not assume that the data instances are independent and identically\ndistributed. Potential applications include scenarios where a temporal order\npresent in the data instances is to be respected.\n","authors":["Harikrishnan N B","Nithin Nagaraj"],"pdf_url":"https://arxiv.org/pdf/2306.02617v2.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.06740v1","updated":"2023-08-13T10:09:25Z","published":"2023-08-13T10:09:25Z","title":"Weighted Sparse Partial Least Squares for Joint Sample and Feature\n Selection","summary":" Sparse Partial Least Squares (sPLS) is a common dimensionality reduction\ntechnique for data fusion, which projects data samples from two views by\nseeking linear combinations with a small number of variables with the maximum\nvariance. However, sPLS extracts the combinations between two data sets with\nall data samples so that it cannot detect latent subsets of samples. To extend\nthe application of sPLS by identifying a specific subset of samples and removing\noutliers, we propose an $\\ell_\\infty/\\ell_0$-norm constrained weighted sparse\nPLS ($\\ell_\\infty/\\ell_0$-wsPLS) method for joint sample and feature selection,\nwhere the $\\ell_\\infty/\\ell_0$-norm constraints are used to select a subset of\nsamples. We prove that the $\\ell_\\infty/\\ell_0$-norm constraints have the\nKurdyka-\\L{ojasiewicz}~property so that a globally convergent algorithm is\ndeveloped to solve it. Moreover, multi-view data with the same set of samples can\nbe available in various real problems. To this end, we extend the\n$\\ell_\\infty/\\ell_0$-wsPLS model and propose two multi-view wsPLS models for\nmulti-view data fusion. We develop an efficient iterative algorithm for each\nmulti-view wsPLS model and show its convergence property. 
Numerical\nand biomedical data experiments demonstrate the efficiency of the proposed\nmethods.\n","authors":["Wenwen Min","Taosheng Xu","Chris Ding"],"pdf_url":"https://arxiv.org/pdf/2308.06740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06738v1","updated":"2023-08-13T10:04:13Z","published":"2023-08-13T10:04:13Z","title":"Probabilistic Imputation for Time-series Classification with Missing\n Data","summary":" Multivariate time series data for real-world applications typically contain a\nsignificant amount of missing values. The dominant approach for classification\nwith such missing values is to impute them heuristically with specific values\n(zero, mean, values of adjacent time-steps) or learnable parameters. However,\nthese simple strategies do not take the data generative process into account,\nand more importantly, do not effectively capture the uncertainty in prediction\ndue to the multiple possibilities for the missing values. In this paper, we\npropose a novel probabilistic framework for classification with multivariate\ntime series data with missing values. Our model consists of two parts; a deep\ngenerative model for missing value imputation and a classifier. Extending the\nexisting deep generative models to better capture structures of time-series\ndata, our deep generative model part is trained to impute the missing values in\nmultiple plausible ways, effectively modeling the uncertainty of the\nimputation. The classifier part takes the time series data along with the\nimputed missing values and classifies signals, and is trained to capture the\npredictive uncertainty due to the multiple possibilities of imputations.\nImportantly, we show that na\\\"ively combining the generative model and the\nclassifier could result in trivial solutions where the generative model does\nnot produce meaningful imputations. To resolve this, we present a novel\nregularization technique that can promote the model to produce useful\nimputation values that help classification. Through extensive experiments on\nreal-world time series data with missing values, we demonstrate the\neffectiveness of our method.\n","authors":["SeungHyun Kim","Hyunsu Kim","EungGu Yun","Hwangrae Lee","Jaehun Lee","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2308.06738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06733v1","updated":"2023-08-13T09:51:16Z","published":"2023-08-13T09:51:16Z","title":"Precipitation nowcasting with generative diffusion models","summary":" In recent years, traditional numerical methods for accurate weather prediction\nhave been increasingly challenged by deep learning methods. Numerous historical\ndatasets used for short and medium-range weather forecasts are typically\norganized into a regular spatial grid structure. This arrangement closely\nresembles images: each weather variable can be visualized as a map or, when\nconsidering the temporal axis, as a video. Several classes of generative\nmodels, comprising Generative Adversarial Networks, Variational Autoencoders,\nor the recent Denoising Diffusion Models, have largely proved their\napplicability to the next-frame prediction problem, and it is thus natural to test\ntheir performance on the weather prediction benchmarks. 
Diffusion models are\nparticularly appealing in this context, due to the intrinsically probabilistic\nnature of weather forecasting: what we are really interested in modeling is the\nprobability distribution of weather indicators, whose expected value is the\nmost likely prediction.\n In our study, we focus on a specific subset of the ERA-5 dataset, which\nincludes hourly data pertaining to Central Europe from the years 2016 to 2021.\nWithin this context, we examine the efficacy of diffusion models in handling\nthe task of precipitation nowcasting. Our work is conducted in comparison to\nthe performance of well-established U-Net models, as documented in the existing\nliterature. Our proposed approach of Generative Ensemble Diffusion (GED)\nutilizes a diffusion model to generate a set of possible weather scenarios\nwhich are then amalgamated into a probable prediction via the use of a\npost-processing network. This approach, in comparison to recent deep learning\nmodels, substantially outperformed them in terms of overall performance.\n","authors":["Andrea Asperti","Fabio Merizzi","Alberto Paparella","Giorgio Pedrazzi","Matteo Angelinelli","Stefano Colamonaco"],"pdf_url":"https://arxiv.org/pdf/2308.06733v1.pdf","comment":"21 pages, 6 figures"},{"id":"http://arxiv.org/abs/1903.10047v4","updated":"2023-08-13T09:04:34Z","published":"2019-03-24T19:42:39Z","title":"Approximation and Non-parametric Estimation of ResNet-type Convolutional\n Neural Networks","summary":" Convolutional neural networks (CNNs) have been shown to achieve optimal\napproximation and estimation error rates (in minimax sense) in several function\nclasses. However, previously analyzed optimal CNNs are unrealistically wide and\ndifficult to obtain via optimization due to sparse constraints in important\nfunction classes, including the H\\\"older class. We show a ResNet-type CNN can\nattain the minimax optimal error rates in these classes in more plausible\nsituations -- it can be dense, and its width, channel size, and filter size are\nconstant with respect to sample size. The key idea is that we can replicate the\nlearning ability of Fully-connected neural networks (FNNs) by tailored CNNs, as\nlong as the FNNs have \\textit{block-sparse} structures. Our theory is general\nin the sense that we can automatically translate any approximation rate achieved\nby block-sparse FNNs into that by CNNs. As an application, we derive\napproximation and estimation error rates of the aforementioned type of CNNs for\nthe Barron and H\\\"older classes with the same strategy.\n","authors":["Kenta Oono","Taiji Suzuki"],"pdf_url":"https://arxiv.org/pdf/1903.10047v4.pdf","comment":"Version 4: Fixed the constant B^{(fc)} in Theorems 1, 5 and the norm\n upper bound of w^{(l)}_m in Lemma 1. 8 pages + References 2 pages +\n Supplemental material 18 pages"},{"id":"http://arxiv.org/abs/2212.07462v2","updated":"2023-08-13T08:58:00Z","published":"2022-12-14T19:13:59Z","title":"Harmonic (Quantum) Neural Networks","summary":" Harmonic functions are abundant in nature, appearing in limiting cases of\nMaxwell's, Navier-Stokes equations, the heat and the wave equation.\nConsequently, there are many applications of harmonic functions from industrial\nprocess optimisation to robotic path planning and the calculation of first exit\ntimes of random walks. Despite their ubiquity and relevance, there have been\nfew attempts to incorporate inductive biases towards harmonic functions in\nmachine learning contexts. 
In this work, we demonstrate effective means of\nrepresenting harmonic functions in neural networks and extend such results also\nto quantum neural networks to demonstrate the generality of our approach. We\nbenchmark our approaches against (quantum) physics-informed neural networks,\nwhere we show favourable performance.\n","authors":["Atiyo Ghosh","Antonio A. Gentile","Mario Dagrada","Chul Lee","Seong-Hyok Kim","Hyukgeun Cha","Yunjun Choi","Brad Kim","Jeong-Il Kye","Vincent E. Elfving"],"pdf_url":"https://arxiv.org/pdf/2212.07462v2.pdf","comment":"12 pages (main), 7 pages (supplementary), 7 figures"},{"id":"http://arxiv.org/abs/2307.01482v4","updated":"2023-08-13T08:42:08Z","published":"2023-07-04T05:19:19Z","title":"Nexus sine qua non: Essentially Connected Networks for Traffic\n Forecasting","summary":" Spatiotemporal graph neural networks (STGNNs) have emerged as a leading\napproach for learning representations and forecasting on traffic datasets with\nunderlying topological and correlational structures. However, current STGNNs\nuse intricate techniques with high complexities to capture these structures,\nmaking them difficult to understand and scale. The existence of simple yet\nefficient architectures remains an open question. Upon closer examination, we\nfind what lies at the core of STGNN's representations are certain forms of\nspatiotemporal contextualization. In light of this, we design Nexus sine qua\nnon (NexuSQN), an essentially connected network built on an efficient\nmessage-passing backbone. NexuSQN simply uses learnable \"where\" and \"when\"\nlocators for the aforementioned contextualization and omits any intricate\ncomponents such as RNNs, Transformers, and diffusion convolutions. Results show\nthat NexuSQN outperforms intricately designed benchmarks in terms of size,\ncomputational efficiency, and accuracy. This suggests a promising future for\ndeveloping simple yet efficient neural predictors.\n","authors":["Tong Nie","Guoyang Qin","Lijun Sun","Yunpeng Wang","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2307.01482v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.11217v3","updated":"2023-08-13T08:14:24Z","published":"2021-12-18T10:45:31Z","title":"Model-Based Safe Reinforcement Learning with Time-Varying State and\n Control Constraints: An Application to Intelligent Vehicles","summary":" Recently, safe reinforcement learning (RL) with the actor-critic structure\nfor continuous control tasks has received increasing attention. It is still\nchallenging to learn a near-optimal control policy with safety and convergence\nguarantees. Also, few works have addressed the safe RL algorithm design under\ntime-varying safety constraints. This paper proposes a safe RL algorithm for\noptimal control of nonlinear systems with time-varying state and control\nconstraints. In the proposed approach, we construct a novel barrier force-based\ncontrol policy structure to guarantee control safety. A multi-step policy\nevaluation mechanism is proposed to predict the policy's safety risk under\ntime-varying safety constraints and guide the policy to update safely.\nTheoretical results on stability and robustness are proven. Also, the\nconvergence of the actor-critic implementation is analyzed. The performance of\nthe proposed algorithm outperforms several state-of-the-art RL algorithms in\nthe simulated Safety Gym environment. Furthermore, the approach is applied to\nthe integrated path following and collision avoidance problem for two\nreal-world intelligent vehicles. 
A differential-drive vehicle and an\nAckermann-drive one are used to verify offline deployment and online learning\nperformance, respectively. Our approach shows an impressive sim-to-real\ntransfer capability and a satisfactory online control performance in the\nexperiment.\n","authors":["Xinglong Zhang","Yaoqian Peng","Biao Luo","Wei Pan","Xin Xu","Haibin Xie"],"pdf_url":"https://arxiv.org/pdf/2112.11217v3.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.06718v1","updated":"2023-08-13T08:13:34Z","published":"2023-08-13T08:13:34Z","title":"Generalized Independent Noise Condition for Estimating Causal Structure\n with Latent Variables","summary":" We investigate the challenging task of learning causal structure in the\npresence of latent variables, including locating latent variables and\ndetermining their quantity, and identifying causal relationships among both\nlatent and observed variables. To address this, we propose a Generalized\nIndependent Noise (GIN) condition for linear non-Gaussian acyclic causal models\nthat incorporate latent variables, which establishes the independence between a\nlinear combination of certain measured variables and some other measured\nvariables. Specifically, for two observed random vectors $\\bf{Y}$ and $\\bf{Z}$,\nGIN holds if and only if $\\omega^{\\intercal}\\mathbf{Y}$ and $\\mathbf{Z}$ are\nindependent, where $\\omega$ is a non-zero parameter vector determined by the\ncross-covariance between $\\mathbf{Y}$ and $\\mathbf{Z}$. We then give necessary\nand sufficient graphical criteria of the GIN condition in linear non-Gaussian\nacyclic causal models. Roughly speaking, GIN implies the existence of an\nexogenous set $\\mathcal{S}$ relative to the parent set of $\\mathbf{Y}$ (w.r.t.\nthe causal ordering), such that $\\mathcal{S}$ d-separates $\\mathbf{Y}$ from\n$\\mathbf{Z}$. Interestingly, we find that the independent noise condition\n(i.e., if there is no confounder, causes are independent of the residual\nderived from regressing the effect on the causes) can be seen as a special case\nof GIN. With such a connection between GIN and latent causal structures, we\nfurther leverage the proposed GIN condition, together with a well-designed\nsearch procedure, to efficiently estimate Linear, Non-Gaussian Latent\nHierarchical Models (LiNGLaHs), where latent confounders may also be causally\nrelated and may even follow a hierarchical structure. We show that the\nunderlying causal structure of a LiNGLaH is identifiable in light of GIN\nconditions under mild assumptions. Experimental results show the effectiveness\nof the proposed approach.\n","authors":["Feng Xie","Biwei Huang","Zhengming Chen","Ruichu Cai","Clark Glymour","Zhi Geng","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.06718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06717v1","updated":"2023-08-13T08:12:01Z","published":"2023-08-13T08:12:01Z","title":"Estimating and Incentivizing Imperfect-Knowledge Agents with Hidden\n Rewards","summary":" In practice, incentive providers (i.e., principals) often cannot observe the\nreward realizations of incentivized agents, which is in contrast to many\nprincipal-agent models that have been previously studied. This information\nasymmetry challenges the principal to consistently estimate the agent's unknown\nrewards by solely watching the agent's decisions, which becomes even more\nchallenging when the agent has to learn its own rewards. 
This complex setting\nis observed in various real-life scenarios ranging from renewable energy\nstorage contracts to personalized healthcare incentives. Hence, it offers not\nonly interesting theoretical questions but also wide practical relevance. This\npaper explores a repeated adverse selection game between a self-interested\nlearning agent and a learning principal. The agent tackles a multi-armed bandit\n(MAB) problem to maximize their expected reward plus incentive. On top of the\nagent's learning, the principal trains a parallel algorithm and faces a\ntrade-off between consistently estimating the agent's unknown rewards and\nmaximizing their own utility by offering adaptive incentives to lead the agent.\nFor a non-parametric model, we introduce an estimator whose only input is the\nhistory of principal's incentives and agent's choices. We unite this estimator\nwith a proposed data-driven incentive policy within a MAB framework. Without\nrestricting the type of the agent's algorithm, we prove finite-sample\nconsistency of the estimator and a rigorous regret bound for the principal by\nconsidering the sequential externality imposed by the agent. Lastly, our\ntheoretical results are reinforced by simulations justifying applicability of\nour framework to green energy aggregator contracts.\n","authors":["Ilgin Dogan","Zuo-Jun Max Shen","Anil Aswani"],"pdf_url":"https://arxiv.org/pdf/2308.06717v1.pdf","comment":"72 pages, 6 figures. arXiv admin note: text overlap with\n arXiv:2304.07407"},{"id":"http://arxiv.org/abs/2308.06714v1","updated":"2023-08-13T08:10:23Z","published":"2023-08-13T08:10:23Z","title":"Learning on Graphs with Out-of-Distribution Nodes","summary":" Graph Neural Networks (GNNs) are state-of-the-art models for performing\nprediction tasks on graphs. While existing GNNs have shown great performance on\nvarious tasks related to graphs, little attention has been paid to the scenario\nwhere out-of-distribution (OOD) nodes exist in the graph during training and\ninference. Borrowing the concept from CV and NLP, we define OOD nodes as nodes\nwith labels unseen from the training set. Since a lot of networks are\nautomatically constructed by programs, real-world graphs are often noisy and\nmay contain nodes from unknown distributions. In this work, we define the\nproblem of graph learning with out-of-distribution nodes. Specifically, we aim\nto accomplish two tasks: 1) detect nodes which do not belong to the known\ndistribution and 2) classify the remaining nodes to be one of the known\nclasses. We demonstrate that the connection patterns in graphs are informative\nfor outlier detection, and propose Out-of-Distribution Graph Attention Network\n(OODGAT), a novel GNN model which explicitly models the interaction between\ndifferent kinds of nodes and separate inliers from outliers during feature\npropagation. 
Extensive experiments show that OODGAT outperforms existing\noutlier detection methods by a large margin, while being better or comparable\nin terms of in-distribution classification.\n","authors":["Yu Song","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.06714v1.pdf","comment":"Accepted by KDD'22"},{"id":"http://arxiv.org/abs/2308.06709v1","updated":"2023-08-13T07:56:01Z","published":"2023-08-13T07:56:01Z","title":"The Hard-Constraint PINNs for Interface Optimal Control Problems","summary":" We show that the physics-informed neural networks (PINNs), in combination\nwith some recently developed discontinuity capturing neural networks, can be\napplied to solve optimal control problems subject to partial differential\nequations (PDEs) with interfaces and some control constraints. The resulting\nalgorithm is mesh-free and scalable to different PDEs, and it ensures the\ncontrol constraints rigorously. Since the boundary and interface conditions, as\nwell as the PDEs, are all treated as soft constraints by lumping them into a\nweighted loss function, it is necessary to learn them simultaneously and there\nis no guarantee that the boundary and interface conditions can be satisfied\nexactly. This immediately causes difficulties in tuning the weights in the\ncorresponding loss function and training the neural networks. To tackle these\ndifficulties and guarantee the numerical accuracy, we propose to impose the\nboundary and interface conditions as hard constraints in PINNs by developing a\nnovel neural network architecture. The resulting hard-constraint PINNs approach\nguarantees that both the boundary and interface conditions can be satisfied\nexactly and they are decoupled from the learning of the PDEs. Its efficiency is\npromisingly validated by some elliptic and parabolic interface optimal control\nproblems.\n","authors":["Ming-Chih Lai","Yongcun Song","Xiaoming Yuan","Hangrui Yue","Tianyou Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.06709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06708v1","updated":"2023-08-13T07:55:46Z","published":"2023-08-13T07:55:46Z","title":"Generating observation guided ensembles for data assimilation with\n denoising diffusion probabilistic model","summary":" This paper presents an ensemble data assimilation method using the pseudo\nensembles generated by denoising diffusion probabilistic model. Since the model\nis trained against noisy and sparse observation data, this model can produce\ndivergent ensembles close to observations. Thanks to the variance in generated\nensembles, our proposed method displays better performance than the\nwell-established ensemble data assimilation method when the simulation model is\nimperfect.\n","authors":["Yuuichi Asahi","Yuta Hasegawa","Naoyuki Onodera","Takashi Shimokawabe","Hayato Shiba","Yasuhiro Idomura"],"pdf_url":"https://arxiv.org/pdf/2308.06708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.05124v2","updated":"2023-08-13T07:18:29Z","published":"2022-12-09T21:48:36Z","title":"Multi-view Graph Convolutional Networks with Differentiable Node\n Selection","summary":" Multi-view data containing complementary and consensus information can\nfacilitate representation learning by exploiting the intact integration of\nmulti-view features. Because most objects in real world often have underlying\nconnections, organizing multi-view data as heterogeneous graphs is beneficial\nto extracting latent information among different objects. 
Due to the powerful\ncapability to gather information of neighborhood nodes, in this paper, we apply\nGraph Convolutional Network (GCN) to cope with heterogeneous-graph data\noriginating from multi-view data, which is still under-explored in the field of\nGCN. In order to improve the quality of network topology and alleviate the\ninterference of noises yielded by graph fusion, some methods undertake sorting\noperations before the graph convolution procedure. These GCN-based methods\ngenerally sort and select the most confident neighborhood nodes for each\nvertex, such as picking the top-k nodes according to pre-defined confidence\nvalues. Nonetheless, this is problematic due to the non-differentiable sorting\noperators and inflexible graph embedding learning, which may result in blocked\ngradient computations and undesired performance. To cope with these issues, we\npropose a joint framework dubbed Multi-view Graph Convolutional Network with\nDifferentiable Node Selection (MGCN-DNS), which is constituted of an adaptive\ngraph fusion layer, a graph learning module and a differentiable node selection\nschema. MGCN-DNS accepts multi-channel graph-structural data as inputs and aims\nto learn more robust graph fusion through a differentiable neural network. The\neffectiveness of the proposed method is verified by rigorous comparisons with\nconsiderable state-of-the-art approaches in terms of multi-view semi-supervised\nclassification tasks.\n","authors":["Zhaoliang Chen","Lele Fu","Shunxin Xiao","Shiping Wang","Claudia Plant","Wenzhong Guo"],"pdf_url":"https://arxiv.org/pdf/2212.05124v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06703v1","updated":"2023-08-13T07:03:22Z","published":"2023-08-13T07:03:22Z","title":"Understanding the robustness difference between stochastic gradient\n descent and adaptive gradient methods","summary":" Stochastic gradient descent (SGD) and adaptive gradient methods, such as Adam\nand RMSProp, have been widely used in training deep neural networks. We\nempirically show that while the difference between the standard generalization\nperformance of models trained using these methods is small, those trained using\nSGD exhibit far greater robustness under input perturbations. Notably, our\ninvestigation demonstrates the presence of irrelevant frequencies in natural\ndatasets, where alterations do not affect models' generalization performance.\nHowever, models trained with adaptive methods show sensitivity to these\nchanges, suggesting that their use of irrelevant frequencies can lead to\nsolutions sensitive to perturbations. To better understand this difference, we\nstudy the learning dynamics of gradient descent (GD) and sign gradient descent\n(signGD) on a synthetic dataset that mirrors natural signals. With a\nthree-dimensional input space, the models optimized with GD and signGD have\nstandard risks close to zero but vary in their adversarial risks. Our result\nshows that linear models' robustness to $\\ell_2$-norm bounded changes is\ninversely proportional to the model parameters' weight norm: a smaller weight\nnorm implies better robustness. 
In the context of deep learning, our\nexperiments show that SGD-trained neural networks show smaller Lipschitz\nconstants, explaining the better robustness to input perturbations than those\ntrained with adaptive gradient methods.\n","authors":["Avery Ma","Yangchen Pan","Amir-massoud Farahmand"],"pdf_url":"https://arxiv.org/pdf/2308.06703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.06434v3","updated":"2023-08-13T06:55:39Z","published":"2022-06-13T19:28:42Z","title":"SmartGD: A GAN-Based Graph Drawing Framework for Diverse Aesthetic Goals","summary":" While a multitude of studies have been conducted on graph drawing, many\nexisting methods only focus on optimizing a single aesthetic aspect of graph\nlayouts, which can lead to sub-optimal results. There are a few existing\nmethods that have attempted to develop a flexible solution for optimizing\ndifferent aesthetic aspects measured by different aesthetic criteria.\nFurthermore, thanks to the significant advance in deep learning techniques,\nseveral deep learning-based layout methods were proposed recently. These\nmethods have demonstrated the advantages of deep learning approaches for graph\ndrawing. However, none of these existing methods can be directly applied to\noptimizing non-differentiable criteria without special accommodation. In this\nwork, we propose a novel Generative Adversarial Network (GAN) based deep\nlearning framework for graph drawing, called SmartGD, which can optimize\ndifferent quantitative aesthetic goals, regardless of their differentiability.\nTo demonstrate the effectiveness and efficiency of SmartGD, we conducted\nexperiments on minimizing stress, minimizing edge crossing, maximizing crossing\nangle, maximizing shape-based metrics, and a combination of multiple\naesthetics. Compared with several popular graph drawing algorithms, the\nexperimental results show that SmartGD achieves good performance both\nquantitatively and qualitatively.\n","authors":["Xiaoqi Wang","Kevin Yen","Yifan Hu","Han-Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2206.06434v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06701v1","updated":"2023-08-13T06:55:05Z","published":"2023-08-13T06:55:05Z","title":"Camouflaged Image Synthesis Is All You Need to Boost Camouflaged\n Detection","summary":" Camouflaged objects that blend into natural scenes pose significant\nchallenges for deep-learning models to detect and synthesize. While camouflaged\nobject detection is a crucial task in computer vision with diverse real-world\napplications, this research topic has been constrained by limited data\navailability. We propose a framework for synthesizing camouflage data to\nenhance the detection of camouflaged objects in natural scenes. Our approach\nemploys a generative model to produce realistic camouflage images, which can be\nused to train existing object detection models. Specifically, we use a\ncamouflage environment generator supervised by a camouflage distribution\nclassifier to synthesize the camouflage images, which are then fed into our\ngenerator to expand the dataset. Our framework outperforms the current\nstate-of-the-art method on three datasets (COD10k, CAMO, and CHAMELEON),\ndemonstrating its effectiveness in improving camouflaged object detection. 
This\napproach can serve as a plug-and-play data generation and augmentation module\nfor existing camouflaged object detection tasks and provides a novel way to\nintroduce more diversity and distributions into current camouflage datasets.\n","authors":["Haichao Zhang","Can Qin","Yu Yin","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2308.06701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05034v2","updated":"2023-08-13T06:36:34Z","published":"2023-08-09T16:04:55Z","title":"Kairos: Practical Intrusion Detection and Investigation using\n Whole-system Provenance","summary":" Provenance graphs are structured audit logs that describe the history of a\nsystem's execution. Recent studies have explored a variety of techniques to\nanalyze provenance graphs for automated host intrusion detection, focusing\nparticularly on advanced persistent threats. Sifting through their design\ndocuments, we identify four common dimensions that drive the development of\nprovenance-based intrusion detection systems (PIDSes): scope (can PIDSes detect\nmodern attacks that infiltrate across application boundaries?), attack\nagnosticity (can PIDSes detect novel attacks without a priori knowledge of\nattack characteristics?), timeliness (can PIDSes efficiently monitor host\nsystems as they run?), and attack reconstruction (can PIDSes distill attack\nactivity from large provenance graphs so that sysadmins can easily understand\nand quickly respond to system intrusion?). We present KAIROS, the first PIDS\nthat simultaneously satisfies the desiderata in all four dimensions, whereas\nexisting approaches sacrifice at least one and struggle to achieve comparable\ndetection performance.\n Kairos leverages a novel graph neural network-based encoder-decoder\narchitecture that learns the temporal evolution of a provenance graph's\nstructural changes to quantify the degree of anomalousness for each system\nevent. Then, based on this fine-grained information, Kairos reconstructs attack\nfootprints, generating compact summary graphs that accurately describe\nmalicious activity over a stream of system audit logs. Using state-of-the-art\nbenchmark datasets, we demonstrate that Kairos outperforms previous approaches.\n","authors":["Zijun Cheng","Qiujian Lv","Jinyuan Liang","Yan Wang","Degang Sun","Thomas Pasquier","Xueyuan Han"],"pdf_url":"https://arxiv.org/pdf/2308.05034v2.pdf","comment":"23 pages, 16 figures, to appear in the 45th IEEE Symposium on\n Security and Privacy (S&P'24)"},{"id":"http://arxiv.org/abs/2211.06077v3","updated":"2023-08-13T06:23:46Z","published":"2022-11-11T09:16:25Z","title":"Overparameterized random feature regression with nearly orthogonal data","summary":" We investigate the properties of random feature ridge regression (RFRR) given\nby a two-layer neural network with random Gaussian initialization. We study the\nnon-asymptotic behaviors of the RFRR with nearly orthogonal deterministic\nunit-length input data vectors in the overparameterized regime, where the width\nof the first layer is much larger than the sample size. Our analysis shows\nhigh-probability non-asymptotic concentration results for the training errors,\ncross-validations, and generalization errors of RFRR centered around their\nrespective values for a kernel ridge regression (KRR). This KRR is derived from\nan expected kernel generated by a nonlinear random feature map. 
We then\napproximate the performance of the KRR by a polynomial kernel matrix obtained\nfrom the Hermite polynomial expansion of the activation function, whose degree\nonly depends on the orthogonality among different data points. This polynomial\nkernel determines the asymptotic behavior of the RFRR and the KRR. Our results\nhold for a wide variety of activation functions and input data sets that\nexhibit nearly orthogonal properties. Based on these approximations, we obtain\na lower bound for the generalization error of the RFRR for a nonlinear\nstudent-teacher model.\n","authors":["Zhichao Wang","Yizhe Zhu"],"pdf_url":"https://arxiv.org/pdf/2211.06077v3.pdf","comment":"39 pages. A condition on the activation function is added in\n Assumption 2.2"},{"id":"http://arxiv.org/abs/2212.02234v2","updated":"2023-08-13T06:02:06Z","published":"2022-11-13T03:23:54Z","title":"Review of medical data analysis based on spiking neural networks","summary":" Medical data mainly includes various types of biomedical signals and medical\nimages, which can be used by professional doctors to make judgments on\npatients' health conditions. However, the interpretation of medical data\nrequires a lot of human cost and there may be misjudgments, so many scholars\nuse neural networks and deep learning to classify and study medical data, which\ncan improve the efficiency and accuracy of doctors and detect diseases early\nfor early diagnosis, etc. Therefore, it has a wide range of application\nprospects. However, traditional neural networks have disadvantages such as high\nenergy consumption and high latency (slow computation speed). This paper\npresents recent research on signal classification and disease diagnosis based\non a third-generation neural network, the spiking neuron network, using medical\ndata including EEG signals, ECG signals, EMG signals and MRI images. The\nadvantages and disadvantages of pulsed neural networks compared with\ntraditional networks are summarized and its development orientation in the\nfuture is prospected.\n","authors":["X. Li","X. Zhang","X. Yi","D. Liu","H. Wang","B. Zhang","B. Zhang","D. Zhao","L. Wang"],"pdf_url":"https://arxiv.org/pdf/2212.02234v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06692v1","updated":"2023-08-13T05:56:36Z","published":"2023-08-13T05:56:36Z","title":"SimMatchV2: Semi-Supervised Learning with Graph Consistency","summary":" Semi-Supervised image classification is one of the most fundamental problem\nin computer vision, which significantly reduces the need for human labor. In\nthis paper, we introduce a new semi-supervised learning algorithm - SimMatchV2,\nwhich formulates various consistency regularizations between labeled and\nunlabeled data from the graph perspective. In SimMatchV2, we regard the\naugmented view of a sample as a node, which consists of a label and its\ncorresponding representation. Different nodes are connected with the edges,\nwhich are measured by the similarity of the node representations. Inspired by\nthe message passing and node classification in graph theory, we propose four\ntypes of consistencies, namely 1) node-node consistency, 2) node-edge\nconsistency, 3) edge-edge consistency, and 4) edge-node consistency. We also\nuncover that a simple feature normalization can reduce the gaps of the feature\nnorm between different augmented views, significantly improving the performance\nof SimMatchV2. Our SimMatchV2 has been validated on multiple semi-supervised\nlearning benchmarks. 
Notably, with ResNet-50 as our backbone and 300 epochs of\ntraining, SimMatchV2 achieves 71.9\\% and 76.2\\% Top-1 Accuracy with 1\\% and\n10\\% labeled examples on ImageNet, which significantly outperforms the previous\nmethods and achieves state-of-the-art performance. Code and pre-trained models\nare available at\n\\href{https://github.com/mingkai-zheng/SimMatchV2}{https://github.com/mingkai-zheng/SimMatchV2}.\n","authors":["Mingkai Zheng","Shan You","Lang Huang","Chen Luo","Fei Wang","Chen Qian","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.06692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08957v2","updated":"2023-08-13T05:53:08Z","published":"2023-01-21T14:33:02Z","title":"Slice Transformer and Self-supervised Learning for 6DoF Localization in\n 3D Point Cloud Maps","summary":" Precise localization is critical for autonomous vehicles. We present a\nself-supervised learning method that employs Transformers for the first time\nfor the task of outdoor localization using LiDAR data. We propose a pre-text\ntask that reorganizes the slices of a $360^\\circ$ LiDAR scan to leverage its\naxial properties. Our model, called Slice Transformer, employs multi-head\nattention while systematically processing the slices. To the best of our\nknowledge, this is the first instance of leveraging multi-head attention for\noutdoor point clouds. We additionally introduce the Perth-WA dataset, which\nprovides a large-scale LiDAR map of Perth city in Western Australia, covering\n$\\sim$4km$^2$ area. Localization annotations are provided for Perth-WA. The\nproposed localization method is thoroughly evaluated on Perth-WA and\nAppollo-SouthBay datasets. We also establish the efficacy of our\nself-supervised learning approach for the common downstream task of object\nclassification using ModelNet40 and ScanNN datasets. The code and Perth-WA data\nwill be publicly released.\n","authors":["Muhammad Ibrahim","Naveed Akhtar","Saeed Anwar","Michael Wise","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2301.08957v2.pdf","comment":"Accepted in IEEE International Conference on Robotics and Automation\n (ICRA), 2023"},{"id":"http://arxiv.org/abs/2202.00916v4","updated":"2023-08-13T05:44:37Z","published":"2022-02-02T08:36:10Z","title":"Scalable Decision-Focused Learning in Restless Multi-Armed Bandits with\n Application to Maternal and Child Health","summary":" This paper studies restless multi-armed bandit (RMAB) problems with unknown\narm transition dynamics but with known correlated arm features. The goal is to\nlearn a model to predict transition dynamics given features, where the Whittle\nindex policy solves the RMAB problems using predicted transitions. However,\nprior works often learn the model by maximizing the predictive accuracy instead\nof final RMAB solution quality, causing a mismatch between training and\nevaluation objectives. To address this shortcoming, we propose a novel approach\nfor decision-focused learning in RMAB that directly trains the predictive model\nto maximize the Whittle index solution quality. 
We present three key\ncontributions: (i) we establish differentiability of the Whittle index policy\nto support decision-focused learning; (ii) we significantly improve the\nscalability of decision-focused learning approaches in sequential problems,\nspecifically RMAB problems; (iii) we apply our algorithm to a previously\ncollected dataset of maternal and child health to demonstrate its performance.\nIndeed, our algorithm is the first for decision-focused learning in RMAB that\nscales to real-world problem sizes.\n","authors":["Kai Wang","Shresth Verma","Aditya Mate","Sanket Shah","Aparna Taneja","Neha Madhiwalla","Aparna Hegde","Milind Tambe"],"pdf_url":"https://arxiv.org/pdf/2202.00916v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06686v1","updated":"2023-08-13T05:22:49Z","published":"2023-08-13T05:22:49Z","title":"MDB: Interactively Querying Datasets and Models","summary":" As models are trained and deployed, developers need to be able to\nsystematically debug errors that emerge in the machine learning pipeline. We\npresent MDB, a debugging framework for interactively querying datasets and\nmodels. MDB integrates functional programming with relational algebra to build\nexpressive queries over a database of datasets and model predictions. Queries\nare reusable and easily modified, enabling debuggers to rapidly iterate and\nrefine queries to discover and characterize errors and model behaviors. We\nevaluate MDB on object detection, bias discovery, image classification, and\ndata imputation tasks across self-driving videos, large language models, and\nmedical records. Our experiments show that MDB enables up to 10x faster and\n40\\% shorter queries than other baselines. In a user study, we find developers\ncan successfully construct complex queries that describe errors of machine\nlearning models.\n","authors":["Aaditya Naik","Adam Stein","Yinjun Wu","Eric Wong","Mayur Naik"],"pdf_url":"https://arxiv.org/pdf/2308.06686v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.06853v1","updated":"2023-08-13T22:14:01Z","published":"2023-08-13T22:14:01Z","title":"UGC Quality Assessment: Exploring the Impact of Saliency in Deep\n Feature-Based Quality Assessment","summary":" The volume of User Generated Content (UGC) has increased in recent years. The\nchallenge with this type of content is assessing its quality. So far, the\nstate-of-the-art metrics are not exhibiting a very high correlation with\nperceptual quality. In this paper, we explore state-of-the-art metrics that\nextract/combine natural scene statistics and deep neural network features. We\nexperiment with these by introducing saliency maps to improve perceptibility.\nWe train and test our models using public datasets, namely, YouTube-UGC and\nKoNViD-1k. Preliminary results indicate that high correlations are achieved by\nusing only deep features while adding saliency is not always boosting the\nperformance. 
Our results and code will be made publicly available to serve as a\nbenchmark for the research community and can be found on our project page:\nhttps://github.com/xinyiW915/SPIE-2023-Supplementary.\n","authors":["Xinyi Wang","Angeliki Katsenou","David Bull"],"pdf_url":"https://arxiv.org/pdf/2308.06853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06725v1","updated":"2023-08-13T09:05:56Z","published":"2023-08-13T09:05:56Z","title":"CLE Diffusion: Controllable Light Enhancement Diffusion Model","summary":" Low light enhancement has gained increasing importance with the rapid\ndevelopment of visual creation and editing. However, most existing enhancement\nalgorithms are designed to homogeneously increase the brightness of images to a\npre-defined extent, limiting the user experience. To address this issue, we\npropose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a\nnovel diffusion framework to provide users with rich controllability. Built\nwith a conditional diffusion model, we introduce an illumination embedding to\nlet users control their desired brightness level. Additionally, we incorporate\nthe Segment-Anything Model (SAM) to enable user-friendly region\ncontrollability, where users can click on objects to specify the regions they\nwish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves\ncompetitive performance regarding quantitative metrics, qualitative results,\nand versatile controllability. Project page:\n\\url{https://yuyangyin.github.io/CLEDiffusion/}\n","authors":["Yuyang Yin","Dejia Xu","Chuangchuang Tan","Ping Liu","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2308.06725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06696v1","updated":"2023-08-13T06:29:38Z","published":"2023-08-13T06:29:38Z","title":"MACO: A Modality Adversarial and Contrastive Framework for\n Modality-missing Multi-modal Knowledge Graph Completion","summary":" Recent years have seen significant advancements in multi-modal knowledge\ngraph completion (MMKGC). MMKGC enhances knowledge graph completion (KGC) by\nintegrating multi-modal entity information, thereby facilitating the discovery\nof unobserved triples in the large-scale knowledge graphs (KGs). Nevertheless,\nexisting methods emphasize the design of elegant KGC models to facilitate\nmodality interaction, neglecting the real-life problem of missing modalities in\nKGs. The missing modality information impedes modal interaction, consequently\nundermining the model's performance. In this paper, we propose a modality\nadversarial and contrastive framework (MACO) to solve the modality-missing\nproblem in MMKGC. MACO trains a generator and discriminator adversarially to\ngenerate missing modality features that can be incorporated into the MMKGC\nmodel. Meanwhile, we design a cross-modal contrastive loss to improve the\nperformance of the generator. Experiments on public benchmarks with further\nexplorations demonstrate that MACO could achieve state-of-the-art results and\nserve as a versatile framework to bolster various MMKGC models. Our code and\nbenchmark data are available at https://github.com/zjukg/MACO.\n","authors":["Yichi Zhang","Zhuo Chen","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.06696v1.pdf","comment":"This is the ArXiv version of our paper accepted by NLPCC 2023. 
The\n code will be released soon"}]},"2023-08-12T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2303.13475v2","updated":"2023-08-12T23:51:53Z","published":"2023-03-20T16:53:36Z","title":"Learning Semantic Text Similarity to rank Hypernyms of Financial Terms","summary":" Over the years, there has been a paradigm shift in how users access financial\nservices. With the advancement of digitalization more users have been\npreferring the online mode of performing financial activities. This has led to\nthe generation of a huge volume of financial content. Most investors prefer to\ngo through these contents before making decisions. Every industry has terms\nthat are specific to the domain it operates in. Banking and Financial Services\nare not an exception to this. In order to fully comprehend these contents, one\nneeds to have a thorough understanding of the financial terms. Getting a basic\nidea about a term becomes easy when it is explained with the help of the broad\ncategory to which it belongs. This broad category is referred to as hypernym.\nFor example, \"bond\" is a hypernym of the financial term \"alternative\ndebenture\". In this paper, we propose a system capable of extracting and\nranking hypernyms for a given financial term. The system has been trained with\nfinancial text corpora obtained from various sources like DBpedia [4],\nInvestopedia, Financial Industry Business Ontology (FIBO), prospectus and so\non. Embeddings of these terms have been extracted using FinBERT [3], FinISH [1]\nand fine-tuned using SentenceBERT [54]. A novel approach has been used to\naugment the training set with negative samples. It uses the hierarchy present\nin FIBO. Finally, we benchmark the system performance with that of the existing\nones. We establish that it performs better than the existing ones and is also\nscalable.\n","authors":["Sohom Ghosh","Ankush Chopra","Sudip Kumar Naskar"],"pdf_url":"https://arxiv.org/pdf/2303.13475v2.pdf","comment":"Our code base:\n https://github.com/sohomghosh/FinSim_Financial_Hypernym_detection"},{"id":"http://arxiv.org/abs/2303.07274v4","updated":"2023-08-12T22:37:31Z","published":"2023-03-13T16:49:43Z","title":"Breaking Common Sense: WHOOPS! A Vision-and-Language Benchmark of\n Synthetic and Compositional Images","summary":" Weird, unusual, and uncanny images pique the curiosity of observers because\nthey challenge commonsense. For example, an image released during the 2022\nworld cup depicts the famous soccer stars Lionel Messi and Cristiano Ronaldo\nplaying chess, which playfully violates our expectation that their competition\nshould occur on the football field. Humans can easily recognize and interpret\nthese unconventional images, but can AI models do the same? We introduce\nWHOOPS!, a new dataset and benchmark for visual commonsense. The dataset is\ncomprised of purposefully commonsense-defying images created by designers using\npublicly-available image generation tools like Midjourney. We consider several\ntasks posed over the dataset. In addition to image captioning, cross-modal\nmatching, and visual question answering, we introduce a difficult explanation\ngeneration task, where models must identify and explain why a given image is\nunusual. Our results show that state-of-the-art models such as GPT3 and BLIP2\nstill lag behind human performance on WHOOPS!. We hope our dataset will inspire\nthe development of AI models with stronger visual commonsense reasoning\nabilities. 
Data, models and code are available at the project website:\nwhoops-benchmark.github.io\n","authors":["Nitzan Bitton-Guetta","Yonatan Bitton","Jack Hessel","Ludwig Schmidt","Yuval Elovici","Gabriel Stanovsky","Roy Schwartz"],"pdf_url":"https://arxiv.org/pdf/2303.07274v4.pdf","comment":"Accepted to ICCV 2023. Website: whoops-benchmark.github.io"},{"id":"http://arxiv.org/abs/2307.09702v3","updated":"2023-08-12T21:09:44Z","published":"2023-07-19T01:14:49Z","title":"Efficient Guided Generation for Large Language Models","summary":" In this article we show how the problem of neural text generation can be\nconstructively reformulated in terms of transitions between the states of a\nfinite-state machine. This framework leads to an efficient approach to guiding\ntext generation with regular expressions and context-free grammars by allowing\nthe construction of an index over a language model's vocabulary. The approach\nis model agnostic, allows one to enforce domain-specific knowledge and\nconstraints, and enables the construction of reliable interfaces by\nguaranteeing the structure of the generated text. It adds little overhead to\nthe token sequence generation process and significantly outperforms existing\nsolutions. An implementation is provided in the open source Python library\nOutlines\n","authors":["Brandon T. Willard","Rémi Louf"],"pdf_url":"https://arxiv.org/pdf/2307.09702v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06610v1","updated":"2023-08-12T16:56:55Z","published":"2023-08-12T16:56:55Z","title":"Bio-SIEVE: Exploring Instruction Tuning Large Language Models for\n Systematic Review Automation","summary":" Medical systematic reviews can be very costly and resource intensive. We\nexplore how Large Language Models (LLMs) can support and be trained to perform\nliterature screening when provided with a detailed set of selection criteria.\nSpecifically, we instruction tune LLaMA and Guanaco models to perform abstract\nscreening for medical systematic reviews. Our best model, Bio-SIEVE,\noutperforms both ChatGPT and trained traditional approaches, and generalises\nbetter across medical domains. However, there remains the challenge of adapting\nthe model to safety-first scenarios. We also explore the impact of multi-task\ntraining with Bio-SIEVE-Multi, including tasks such as PICO extraction and\nexclusion reasoning, but find that it is unable to match single-task\nBio-SIEVE's performance. We see Bio-SIEVE as an important step towards\nspecialising LLMs for the biomedical systematic review process and explore its\nfuture developmental opportunities. We release our models, code and a list of\nDOIs to reconstruct our dataset for reproducibility.\n","authors":["Ambrose Robinson","William Thorne","Ben P. Wu","Abdullah Pandor","Munira Essat","Mark Stevenson","Xingyi Song"],"pdf_url":"https://arxiv.org/pdf/2308.06610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.07340v5","updated":"2023-08-12T15:45:07Z","published":"2021-06-02T02:43:18Z","title":"MathBERT: A Pre-trained Language Model for General NLP Tasks in\n Mathematics Education","summary":" Since the introduction of the original BERT (i.e., BASE BERT), researchers\nhave developed various customized BERT models with improved performance for\nspecific domains and tasks by exploiting the benefits of transfer learning. 
Due\nto the nature of mathematical texts, which often use domain specific vocabulary\nalong with equations and math symbols, we posit that the development of a new\nBERT model for mathematics would be useful for many mathematical downstream\ntasks. In this resource paper, we introduce our multi-institutional effort\n(i.e., two learning platforms and three academic institutions in the US) toward\nthis need: MathBERT, a model created by pre-training the BASE BERT model on a\nlarge mathematical corpus ranging from pre-kindergarten (pre-k), to\nhigh-school, to college graduate level mathematical content. In addition, we\nselect three general NLP tasks that are often used in mathematics education:\nprediction of knowledge component, auto-grading open-ended Q&A, and knowledge\ntracing, to demonstrate the superiority of MathBERT over BASE BERT. Our\nexperiments show that MathBERT outperforms prior best methods by 1.2-22% and\nBASE BERT by 2-8% on these tasks. In addition, we build a mathematics specific\nvocabulary 'mathVocab' to train with MathBERT. We discover that MathBERT\npre-trained with 'mathVocab' outperforms MathBERT trained with the BASE BERT\nvocabulary (i.e., 'origVocab'). MathBERT is currently being adopted at the\nparticipated leaning platforms: Stride, Inc, a commercial educational resource\nprovider, and ASSISTments.org, a free online educational platform. We release\nMathBERT for public usage at: https://github.com/tbs17/MathBERT.\n","authors":["Jia Tracy Shen","Michiharu Yamashita","Ethan Prihar","Neil Heffernan","Xintao Wu","Ben Graff","Dongwon Lee"],"pdf_url":"https://arxiv.org/pdf/2106.07340v5.pdf","comment":"Accepted by NeurIPS 2021 MATHAI4ED Workshop (Best Paper)"},{"id":"http://arxiv.org/abs/2308.06595v1","updated":"2023-08-12T15:27:51Z","published":"2023-08-12T15:27:51Z","title":"VisIT-Bench: A Benchmark for Vision-Language Instruction Following\n Inspired by Real-World Use","summary":" We introduce VisIT-Bench (Visual InsTruction Benchmark), a benchmark for\nevaluation of instruction-following vision-language models for real-world use.\nOur starting point is curating 70 'instruction families' that we envision\ninstruction tuned vision-language models should be able to address. Extending\nbeyond evaluations like VQAv2 and COCO, tasks range from basic recognition to\ngame playing and creative generation. Following curation, our dataset comprises\n592 test queries, each with a human-authored instruction-conditioned caption.\nThese descriptions surface instruction-specific factors, e.g., for an\ninstruction asking about the accessibility of a storefront for wheelchair\nusers, the instruction-conditioned caption describes ramps/potential obstacles.\nThese descriptions enable 1) collecting human-verified reference outputs for\neach instance; and 2) automatic evaluation of candidate multimodal generations\nusing a text-only LLM, aligning with human judgment. We quantify quality gaps\nbetween models and references using both human and automatic evaluations; e.g.,\nthe top-performing instruction-following model wins against the GPT-4 reference\nin just 27% of the comparison. 
VisIT-Bench is dynamic to participate,\npractitioners simply submit their model's response on the project website;\nData, code and leaderboard is available at visit-bench.github.io.\n","authors":["Yonatan Bitton","Hritik Bansal","Jack Hessel","Rulin Shao","Wanrong Zhu","Anas Awadalla","Josh Gardner","Rohan Taori","Ludwig Schimdt"],"pdf_url":"https://arxiv.org/pdf/2308.06595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03987v2","updated":"2023-08-12T14:57:37Z","published":"2023-07-08T14:25:57Z","title":"A Stitch in Time Saves Nine: Detecting and Mitigating Hallucinations of\n LLMs by Validating Low-Confidence Generation","summary":" Recently developed large language models have achieved remarkable success in\ngenerating fluent and coherent text. However, these models often tend to\n'hallucinate' which critically hampers their reliability. In this work, we\naddress this crucial problem and propose an approach that actively detects and\nmitigates hallucinations during the generation process. Specifically, we first\nidentify the candidates of potential hallucination leveraging the model's logit\noutput values, check their correctness through a validation procedure, mitigate\nthe detected hallucinations, and then continue with the generation process.\nThrough extensive experiments with GPT-3.5 (text-davinci-003) on the 'article\ngeneration task', we first demonstrate the individual efficacy of our detection\nand mitigation techniques. Specifically, the detection technique achieves a\nrecall of ~88% and the mitigation technique successfully mitigates 57.6% of the\ncorrectly detected hallucinations. Importantly, our mitigation technique does\nnot introduce new hallucinations even in the case of incorrectly detected\nhallucinations, i.e., false positives. Then, we show that the proposed active\ndetection and mitigation approach successfully reduces the hallucinations of\nthe GPT-3.5 model from 47.5% to 14.5% on average. We further demonstrate the\neffectiveness and wide applicability of our approach through additional studies\nincluding performance on different types of questions (multi-hop and false\npremise questions) and with another LLM from a different model family (Vicuna).\nIn summary, our work contributes to improving the reliability and\ntrustworthiness of large language models, a crucial step en route to enabling\ntheir widespread adoption in real-world applications.\n","authors":["Neeraj Varshney","Wenlin Yao","Hongming Zhang","Jianshu Chen","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2307.03987v2.pdf","comment":"update to include additional experiments"},{"id":"http://arxiv.org/abs/2308.06552v1","updated":"2023-08-12T12:38:10Z","published":"2023-08-12T12:38:10Z","title":"MT4CrossOIE: Multi-stage Tuning for Cross-lingual Open Information\n Extraction","summary":" Cross-lingual open information extraction aims to extract structured\ninformation from raw text across multiple languages. Previous work uses a\nshared cross-lingual pre-trained model to handle the different languages but\nunderuses the potential of the language-specific representation. In this paper,\nwe propose an effective multi-stage tuning framework called MT4CrossIE,\ndesigned for enhancing cross-lingual open information extraction by injecting\nlanguage-specific knowledge into the shared model. Specifically, the\ncross-lingual pre-trained model is first tuned in a shared semantic space\n(e.g., embedding matrix) in the fixed encoder and then other components are\noptimized in the second stage. 
After enough training, we freeze the pre-trained\nmodel and tune the multiple extra low-rank language-specific modules using\nmixture-of-LoRAs for model-based cross-lingual transfer. In addition, we\nleverage two-stage prompting to encourage the large language model (LLM) to\nannotate the multi-lingual raw data for data-based cross-lingual transfer. The\nmodel is trained with multi-lingual objectives on our proposed dataset\nOpenIE4++ by combing the model-based and data-based transfer techniques.\nExperimental results on various benchmarks emphasize the importance of\naggregating multiple plug-in-and-play language-specific modules and demonstrate\nthe effectiveness of MT4CrossIE in cross-lingual\nOIE\\footnote{\\url{https://github.com/CSJianYang/Multilingual-Multimodal-NLP}}.\n","authors":["Zixiang Wang","Linzheng Chai","Jian Yang","Jiaqi Bai","Yuwei Yin","Jiaheng Liu","Hongcheng Guo","Tongliang Li","Liqun Yang","Hebboul Zine el-abidine","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2308.06552v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.06547v1","updated":"2023-08-12T12:13:52Z","published":"2023-08-12T12:13:52Z","title":"Alternative Pseudo-Labeling for Semi-Supervised Automatic Speech\n Recognition","summary":" When labeled data is insufficient, semi-supervised learning with the\npseudo-labeling technique can significantly improve the performance of\nautomatic speech recognition. However, pseudo-labels are often noisy,\ncontaining numerous incorrect tokens. Taking noisy labels as ground-truth in\nthe loss function results in suboptimal performance. Previous works attempted\nto mitigate this issue by either filtering out the nosiest pseudo-labels or\nimproving the overall quality of pseudo-labels. While these methods are\neffective to some extent, it is unrealistic to entirely eliminate incorrect\ntokens in pseudo-labels. In this work, we propose a novel framework named\nalternative pseudo-labeling to tackle the issue of noisy pseudo-labels from the\nperspective of the training objective. The framework comprises several\ncomponents. Firstly, a generalized CTC loss function is introduced to handle\nnoisy pseudo-labels by accepting alternative tokens in the positions of\nincorrect tokens. Applying this loss function in pseudo-labeling requires\ndetecting incorrect tokens in the predicted pseudo-labels. In this work, we\nadopt a confidence-based error detection method that identifies the incorrect\ntokens by comparing their confidence scores with a given threshold, thus\nnecessitating the confidence score to be discriminative. Hence, the second\nproposed technique is the contrastive CTC loss function that widens the\nconfidence gap between the correctly and incorrectly predicted tokens, thereby\nimproving the error detection ability. Additionally, obtaining satisfactory\nperformance with confidence-based error detection typically requires extensive\nthreshold tuning. 
Instead, we propose an automatic thresholding method that\nuses labeled data as a proxy for determining the threshold, thus saving the\npain of manual tuning.\n","authors":["Han Zhu","Dongji Gao","Gaofeng Cheng","Daniel Povey","Pengyuan Zhang","Yonghong Yan"],"pdf_url":"https://arxiv.org/pdf/2308.06547v1.pdf","comment":"Accepted by IEEE/ACM Transactions on Audio, Speech and Language\n Processing (TASLP), 2023"},{"id":"http://arxiv.org/abs/2308.06546v1","updated":"2023-08-12T12:03:41Z","published":"2023-08-12T12:03:41Z","title":"MC-DRE: Multi-Aspect Cross Integration for Drug Event/Entity Extraction","summary":" Extracting meaningful drug-related information chunks, such as adverse drug\nevents (ADE), is crucial for preventing morbidity and saving many lives. Most\nADE are reported via an unstructured conversation with the medical context.\nHence, applying a general entity recognition approach is not sufficient enough.\nThe key is how to integrate and align multiple crucial aspects to detect drug\nevent information, including drug event semantics, syntactic structures, and\nmedical domain terminology. In this paper, we propose a new multi-aspect\ncross-integration framework for drug entity/event detection by capturing and\naligning different context/language/knowledge properties from drug-related\ndocuments. We first construct multi-aspect encoders to describe semantic,\nsyntactic, and medical document contextual information by conducting those slot\ntagging tasks, main drug entity/event detection, part-of-speech tagging, and\ngeneral medical named entity recognition. Then, each encoder conducts cross\nintegration and alignment with other contextual information in three ways,\nincluding the key-value cross, attention cross, and feedforward cross, so the\nmulti-encoders are integrated in depth. Then, we perform extensive experiments\non two widely used drug-related entity recognition downstream tasks, flat\nentity detection and discontinuous event extraction. Our model significantly\noutperforms all recent twelve state-of-the-art models. The implementation code\nwill be released at~\\url{https://github.com/adlnlp/mc-dre}.\n","authors":["Jie Yang","Soyeon Caren Han","Siqu Long","Josiah Poon","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.06546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06527v1","updated":"2023-08-12T11:00:59Z","published":"2023-08-12T11:00:59Z","title":"With a Little Help from the Authors: Reproducing Human Evaluation of an\n MT Error Detector","summary":" This work presents our efforts to reproduce the results of the human\nevaluation experiment presented in the paper of Vamvas and Sennrich (2022),\nwhich evaluated an automatic system detecting over- and undertranslations\n(translations containing more or less information than the original) in machine\ntranslation (MT) outputs. Despite the high quality of the documentation and\ncode provided by the authors, we discuss some problems we found in reproducing\nthe exact experimental setup and offer recommendations for improving\nreproducibility. 
Our replicated results generally confirm the conclusions of\nthe original study, but in some cases, statistically significant differences\nwere observed, suggesting a high variability of human annotation.\n","authors":["Ondřej Plátek","Mateusz Lango","Ondřej Dušek"],"pdf_url":"https://arxiv.org/pdf/2308.06527v1.pdf","comment":"Submitted to\n https://www.aclweb.org/portal/content/repronlp-shared-task-reproducibility-evaluations-nlp-2023"},{"id":"http://arxiv.org/abs/2208.00463v2","updated":"2023-08-12T10:59:56Z","published":"2022-07-31T16:23:23Z","title":"Mismatching-Aware Unsupervised Translation Quality Estimation For\n Low-Resource Languages","summary":" Translation Quality Estimation (QE) is the task of predicting the quality of\nmachine translation (MT) output without any reference. This task has gained\nincreasing attention as an important component in the practical applications of\nMT. In this paper, we first propose XLMRScore, which is a cross-lingual\ncounterpart of BERTScore computed via the XLM-RoBERTa (XLMR) model. This metric\ncan be used as a simple unsupervised QE method, while employing it results in\ntwo issues: firstly, the untranslated tokens leading to unexpectedly high\ntranslation scores, and secondly, the issue of mismatching errors between\nsource and hypothesis tokens when applying the greedy matching in XLMRScore. To\nmitigate these issues, we suggest replacing untranslated words with the unknown\ntoken and the cross-lingual alignment of the pre-trained model to represent\naligned words closer to each other, respectively. We evaluate the proposed\nmethod on four low-resource language pairs of WMT21 QE shared task, as well as\na new English-Farsi test dataset introduced in this paper. Experiments show\nthat our method could get comparable results with the supervised baseline for\ntwo zero-shot scenarios, i.e., with less than 0.01 difference in Pearson\ncorrelation, while outperforming unsupervised rivals in all the low-resource\nlanguage pairs for above 8%, on average.\n","authors":["Fatemeh Azadi","Heshaam Faili","Mohammad Javad Dousti"],"pdf_url":"https://arxiv.org/pdf/2208.00463v2.pdf","comment":"Submitted to Language Resources and Evaluation"},{"id":"http://arxiv.org/abs/2305.11095v2","updated":"2023-08-12T09:53:41Z","published":"2023-05-18T16:32:58Z","title":"Prompting the Hidden Talent of Web-Scale Speech Models for Zero-Shot\n Task Generalization","summary":" We investigate the emergent abilities of the recently proposed web-scale\nspeech model Whisper, by adapting it to unseen tasks with prompt engineering.\nWe selected three tasks: audio-visual speech recognition (AVSR), code-switched\nspeech recognition (CS-ASR), and speech translation (ST) on unseen language\npairs. We design task-specific prompts, by either leveraging another\nlarge-scale model, or simply manipulating the special tokens in the default\nprompts. Experiments show that compared to the default prompts, our proposed\nprompts improve performance by 10% to 45% on the three zero-shot tasks, and\neven outperform SotA supervised models on some datasets. In addition, our\nexperiments reveal many interesting properties of Whisper, including its\nrobustness to prompts, bias on accents, and the multilingual understanding in\nits latent space. 
Code is available at\nhttps://github.com/jasonppy/PromptingWhisper\n","authors":["Puyuan Peng","Brian Yan","Shinji Watanabe","David Harwath"],"pdf_url":"https://arxiv.org/pdf/2305.11095v2.pdf","comment":"Interspeech 2023"},{"id":"http://arxiv.org/abs/2308.06512v1","updated":"2023-08-12T09:31:43Z","published":"2023-08-12T09:31:43Z","title":"HyperFormer: Enhancing Entity and Relation Interaction for\n Hyper-Relational Knowledge Graph Completion","summary":" Hyper-relational knowledge graphs (HKGs) extend standard knowledge graphs by\nassociating attribute-value qualifiers to triples, which effectively represent\nadditional fine-grained information about their associated triples.\nHyper-relational knowledge graph completion (HKGC) aims at inferring unknown\ntriples while considering their qualifiers. Most existing approaches to HKGC\nexploit a global-level graph structure to encode hyper-relational knowledge\ninto the graph convolution message passing process. However, the addition of\nmulti-hop information might bring noise into the triple prediction process. To\naddress this problem, we propose HyperFormer, a model that considers\nlocal-level sequential information, which encodes the content of the entities,\nrelations and qualifiers of a triple. More precisely, HyperFormer is composed\nof three different modules: an entity neighbor aggregator module that\nintegrates the information of the neighbors of an entity to capture different\nperspectives of it; a relation qualifier aggregator module to integrate\nhyper-relational knowledge into the corresponding relation to refine the\nrepresentation of relational content; and a convolution-based bidirectional\ninteraction module, capturing pairwise\nbidirectional interactions of entity-relation, entity-qualifier, and\nrelation-qualifier to realize a deep perception of the content related to the\ncurrent statement. Furthermore, we introduce a Mixture-of-Experts strategy into\nthe feed-forward layers of HyperFormer to strengthen its representation\ncapabilities while reducing the number of model parameters and the amount of computation.\nExtensive experiments on three well-known datasets with four different\nconditions demonstrate HyperFormer's effectiveness. Datasets and code are\navailable at https://github.com/zhiweihu1103/HKGC-HyperFormer.\n","authors":["Zhiwei Hu","Víctor Gutiérrez-Basulto","Zhiliang Xiang","Ru Li","Jeff Z. Pan"],"pdf_url":"https://arxiv.org/pdf/2308.06512v1.pdf","comment":"Accepted at CIKM'23"},{"id":"http://arxiv.org/abs/2308.06507v1","updated":"2023-08-12T08:52:40Z","published":"2023-08-12T08:52:40Z","title":"AutoConv: Automatically Generating Information-seeking Conversations\n with Large Language Models","summary":" Information-seeking conversation, which aims to help users gather information\nthrough conversation, has achieved great progress in recent years. However, the\nresearch is still stymied by the scarcity of training data. To alleviate this\nproblem, we propose AutoConv for synthetic conversation generation, which takes\nadvantage of the few-shot learning ability and generation capacity of large\nlanguage models (LLM). 
Specifically, we formulate the conversation generation\nproblem as a language modeling task, then finetune an LLM with a few human\nconversations to capture the characteristics of the information-seeking process\nand use it for generating synthetic conversations with high quality.\nExperimental results on two frequently-used datasets verify that AutoConv has\nsubstantial improvements over strong baselines and alleviates the dependence on\nhuman annotation. In addition, we also provide several analysis studies to\npromote future research.\n","authors":["Siheng Li","Cheng Yang","Yichun Yin","Xinyu Zhu","Zesen Cheng","Lifeng Shang","Xin Jiang","Qun Liu","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2308.06507v1.pdf","comment":"Accepted to ACL 2023 Main Conference (Short)"},{"id":"http://arxiv.org/abs/2308.06502v1","updated":"2023-08-12T08:34:15Z","published":"2023-08-12T08:34:15Z","title":"Three Ways of Using Large Language Models to Evaluate Chat","summary":" This paper describes the systems submitted by team6 for ChatEval, the DSTC 11\nTrack 4 competition. We present three different approaches to predicting\nturn-level qualities of chatbot responses based on large language models\n(LLMs). We report improvement over the baseline using dynamic few-shot examples\nfrom a vector store for the prompts for ChatGPT. We also analyze the\nperformance of the other two approaches and report needed improvements for\nfuture work. We developed the three systems over just two weeks, showing the\npotential of LLMs for this task. An ablation study conducted after the\nchallenge deadline shows that the new Llama 2 models are closing the\nperformance gap between ChatGPT and open-source LLMs. However, we find that the\nLlama 2 models do not benefit from few-shot examples in the same way as\nChatGPT.\n","authors":["Ondřej Plátek","Vojtěch Hudeček","Patricia Schmidtová","Mateusz Lango","Ondřej Dušek"],"pdf_url":"https://arxiv.org/pdf/2308.06502v1.pdf","comment":"Accepted to DSTC11 workshop https://dstc11.dstc.community/"},{"id":"http://arxiv.org/abs/2308.06501v1","updated":"2023-08-12T08:33:42Z","published":"2023-08-12T08:33:42Z","title":"NewsDialogues: Towards Proactive News Grounded Conversation","summary":" Hot news is one of the most popular topics in daily conversations. However,\nnews grounded conversation has long been stymied by the lack of well-designed\ntask definition and scarce data. In this paper, we propose a novel task,\nProactive News Grounded Conversation, in which a dialogue system can\nproactively lead the conversation based on some key topics of the news. In\naddition, both information-seeking and chit-chat scenarios are included\nrealistically, where the user may ask a series of questions about the news\ndetails or express their opinions and be eager to chat. To further develop this\nnovel task, we collect a human-to-human Chinese dialogue dataset\n\\ts{NewsDialogues}, which includes 1K conversations with a total of 14.6K\nutterances and detailed annotations for target topics and knowledge spans.\nFurthermore, we propose a method named Predict-Generate-Rank, consisting of a\ngenerator for grounded knowledge prediction and response generation, and a\nranker for the ranking of multiple responses to alleviate the exposure bias. 
We\nconduct comprehensive experiments to demonstrate the effectiveness of the\nproposed method and further present several key findings and challenges to\nprompt future research.\n","authors":["Siheng Li","Yichun Yin","Cheng Yang","Wangjie Jiang","Yiwei Li","Zesen Cheng","Lifeng Shang","Xin Jiang","Qun Liu","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2308.06501v1.pdf","comment":"Accepted to ACL 2023 Conference (Long Paper; Findings)"},{"id":"http://arxiv.org/abs/2308.06488v1","updated":"2023-08-12T07:12:45Z","published":"2023-08-12T07:12:45Z","title":"Generating Faithful Text From a Knowledge Graph with Noisy Reference\n Text","summary":" Knowledge Graph (KG)-to-Text generation aims at generating fluent\nnatural-language text that accurately represents the information of a given\nknowledge graph. While significant progress has been made in this task by\nexploiting the power of pre-trained language models (PLMs) with appropriate\ngraph structure-aware modules, existing models still fall short of generating\nfaithful text, especially when the ground-truth natural-language text contains\nadditional information that is not present in the graph. In this paper, we\ndevelop a KG-to-text generation model that can generate faithful\nnatural-language text from a given graph, in the presence of noisy reference\ntext. Our framework incorporates two core ideas: Firstly, we utilize\ncontrastive learning to enhance the model's ability to differentiate between\nfaithful and hallucinated information in the text, thereby encouraging the\ndecoder to generate text that aligns with the input graph. Secondly, we empower\nthe decoder to control the level of hallucination in the generated text by\nemploying a controllable text generation technique. We evaluate our model's\nperformance through the standard quantitative metrics as well as a\nChatGPT-based quantitative and qualitative analysis. Our evaluation\ndemonstrates the superior performance of our model over state-of-the-art\nKG-to-text models on faithfulness.\n","authors":["Tahsina Hashem","Weiqing Wang","Derry Tanti Wijaya","Mohammed Eunus Ali","Yuan-Fang Li"],"pdf_url":"https://arxiv.org/pdf/2308.06488v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04424v2","updated":"2023-08-12T06:17:42Z","published":"2023-08-08T17:53:24Z","title":"A Bi-directional Multi-hop Inference Model for Joint Dialog Sentiment\n Classification and Act Recognition","summary":" The joint task of Dialog Sentiment Classification (DSC) and Act Recognition\n(DAR) aims to predict the sentiment label and act label for each utterance in a\ndialog simultaneously. However, current methods encode the dialog context in\nonly one direction, which limits their ability to thoroughly comprehend the\ncontext. Moreover, these methods overlook the explicit correlations between\nsentiment and act labels, which leads to an insufficient ability to capture\nrich sentiment and act clues and hinders effective and accurate reasoning. To\naddress these issues, we propose a Bi-directional Multi-hop Inference Model\n(BMIM) that leverages a feature selection network and a bi-directional\nmulti-hop inference network to iteratively extract and integrate rich sentiment\nand act clues in a bi-directional manner. We also employ contrastive learning\nand dual learning to explicitly model the correlations of sentiment and act\nlabels. Our experiments on two widely-used datasets show that BMIM outperforms\nstate-of-the-art baselines by at least 2.6% on F1 score in DAR and 1.4% on F1\nscore in DSC. 
Additionally, our proposed model not only improves the\nperformance but also enhances the interpretability of the joint sentiment and\nact prediction task.\n","authors":["Li Zheng","Fei Li","Yuyang Chai","Chong Teng","Donghong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.04424v2.pdf","comment":"Accepted by NLPCC 2023"},{"id":"http://arxiv.org/abs/2308.04498v2","updated":"2023-08-12T06:12:36Z","published":"2023-08-08T18:03:29Z","title":"DialogRE^C+: An Extension of DialogRE to Investigate How Much\n Coreference Helps Relation Extraction in Dialogs","summary":" Dialogue relation extraction (DRE), which identifies the relations between\nargument pairs in dialogue text, suffers greatly from the frequent occurrence of\npersonal pronouns, or entity and speaker coreference. This work introduces a\nnew benchmark dataset, DialogRE^C+, which brings coreference resolution into the\nDRE scenario. With the aid of high-quality coreference knowledge, the reasoning\nof argument relations is expected to be enhanced. In the DialogRE^C+ dataset, we\nmanually annotate a total of 5,068 coreference chains over 36,369 argument mentions\nbased on the existing DialogRE data, where four different coreference chain\ntypes, namely speaker chain, person chain, location chain and organization chain,\nare explicitly marked. We further develop 4 coreference-enhanced graph-based\nDRE models, which learn effective coreference representations for improving the\nDRE task. We also train a coreference resolution model based on our annotations\nand evaluate the effect of automatically extracted coreference chains,\ndemonstrating the practicality of our dataset and its potential for other\ndomains and tasks.\n","authors":["Yiyun Xiong","Mengwei Dai","Fei Li","Hao Fei","Bobo Li","Shengqiong Wu","Donghong Ji","Chong Teng"],"pdf_url":"https://arxiv.org/pdf/2308.04498v2.pdf","comment":"Accepted by NLPCC 2023"},{"id":"http://arxiv.org/abs/2308.04502v2","updated":"2023-08-12T06:05:26Z","published":"2023-08-08T18:11:27Z","title":"Revisiting Disentanglement and Fusion on Modality and Context in\n Conversational Multimodal Emotion Recognition","summary":" It has been a hot research topic to enable machines to understand human\nemotions in multimodal contexts under dialogue scenarios, which is tasked with\nmultimodal emotion analysis in conversation (MM-ERC). MM-ERC has received\nconsistent attention in recent years, where a diverse range of methods has been\nproposed for securing better task performance. Most existing works treat MM-ERC\nas a standard multimodal classification problem and perform multimodal feature\ndisentanglement and fusion for maximizing feature utility. Yet after revisiting\nthe characteristics of MM-ERC, we argue that both the feature multimodality and\nconversational contextualization should be properly modeled simultaneously\nduring the feature disentanglement and fusion steps. In this work, we aim to\nfurther push the task performance by taking full consideration of the above\ninsights. On the one hand, during feature disentanglement, based on the\ncontrastive learning technique, we devise a Dual-level Disentanglement\nMechanism (DDM) to decouple the features into both the modality space and\nutterance space. On the other hand, during the feature fusion stage, we propose\na Contribution-aware Fusion Mechanism (CFM) and a Context Refusion Mechanism\n(CRM) for multimodal and context integration, respectively. 
Together, they\nschedule the proper integration of multimodal and context features.\nSpecifically, CFM explicitly manages the multimodal feature contributions\ndynamically, while CRM flexibly coordinates the introduction of dialogue\ncontexts. On two public MM-ERC datasets, our system achieves new\nstate-of-the-art performance consistently. Further analyses demonstrate that\nall our proposed mechanisms greatly facilitate the MM-ERC task by making full\nuse of the multimodal and context features adaptively. Note that our proposed\nmethods have great potential to facilitate a broader range of other\nconversational multimodal tasks.\n","authors":["Bobo Li","Hao Fei","Lizi Liao","Yu Zhao","Chong Teng","Tat-Seng Chua","Donghong Ji","Fei Li"],"pdf_url":"https://arxiv.org/pdf/2308.04502v2.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.05081v2","updated":"2023-08-12T06:02:02Z","published":"2023-08-09T17:20:14Z","title":"Constructing Holistic Spatio-Temporal Scene Graph for Video Semantic\n Role Labeling","summary":" Video Semantic Role Labeling (VidSRL) aims to detect the salient events from\ngiven videos, by recognizing the predicate-argument event structures and the\ninterrelationships between events. While recent endeavors have put forth\nmethods for VidSRL, they are mostly subject to two key drawbacks:\nthe lack of fine-grained spatial scene perception and the insufficient\nmodeling of video temporality. Towards this end, this work explores a novel\nholistic spatio-temporal scene graph (namely HostSG) representation based on\nthe existing dynamic scene graph structures, which well models both the\nfine-grained spatial semantics and temporal dynamics of videos for VidSRL.\nBuilt upon the HostSG, we present a niche-targeting VidSRL framework. A\nscene-event mapping mechanism is first designed to bridge the gap between the\nunderlying scene structure and the high-level event semantic structure,\nresulting in an overall hierarchical scene-event (termed ICE) graph structure.\nWe further perform iterative structure refinement to optimize the ICE graph,\nsuch that the overall structure representation can best coincide with the end-task\ndemand. Finally, three subtask predictions of VidSRL are jointly decoded, where\nthe end-to-end paradigm effectively avoids error propagation. On the benchmark\ndataset, our framework improves significantly over the current best-performing\nmodel. Further analyses are shown for a better understanding of the advances of\nour methods.\n","authors":["Yu Zhao","Hao Fei","Yixin Cao","Bobo Li","Meishan Zhang","Jianguo Wei","Min Zhang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2308.05081v2.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.06463v1","updated":"2023-08-12T04:05:57Z","published":"2023-08-12T04:05:57Z","title":"GPT-4 Is Too Smart To Be Safe: Stealthy Chat with LLMs via Cipher","summary":" Safety lies at the core of the development of Large Language Models (LLMs).\nThere is ample work on aligning LLMs with human ethics and preferences,\nincluding data filtering in pretraining, supervised fine-tuning, reinforcement\nlearning from human feedback, and red teaming. In this study, we discover\nthat chat in cipher can bypass the safety alignment techniques of LLMs, which\nare mainly conducted in natural languages. We propose a novel framework,\nCipherChat, to systematically examine the generalizability of safety alignment\nto non-natural languages -- ciphers. 
CipherChat enables humans to chat with\nLLMs through cipher prompts topped with system role descriptions and few-shot\nenciphered demonstrations. We use CipherChat to assess state-of-the-art LLMs,\nincluding ChatGPT and GPT-4, for different representative human ciphers across\n11 safety domains in both English and Chinese. Experimental results show that\ncertain ciphers succeed almost 100% of the time in bypassing the safety alignment\nof GPT-4 in several safety domains, demonstrating the necessity of developing\nsafety alignment for non-natural languages. Notably, we identify that LLMs seem\nto have a ''secret cipher'', and propose a novel SelfCipher that uses only role\nplay and several demonstrations in natural language to evoke this capability.\nSelfCipher surprisingly outperforms existing human ciphers in almost all cases.\nOur code and data will be released at https://github.com/RobustNLP/CipherChat.\n","authors":["Youliang Yuan","Wenxiang Jiao","Wenxuan Wang","Jen-tse Huang","Pinjia He","Shuming Shi","Zhaopeng Tu"],"pdf_url":"https://arxiv.org/pdf/2308.06463v1.pdf","comment":"13 pages, 4 figures, 9 tables"},{"id":"http://arxiv.org/abs/2308.06457v1","updated":"2023-08-12T03:30:49Z","published":"2023-08-12T03:30:49Z","title":"Text-to-Video: a Two-stage Framework for Zero-shot Identity-agnostic\n Talking-head Generation","summary":" The advent of ChatGPT has introduced innovative methods for information\ngathering and analysis. However, the information provided by ChatGPT is limited\nto text, and the visualization of this information remains constrained.\nPrevious research has explored zero-shot text-to-video (TTV) approaches to\ntransform text into videos. However, these methods lacked control over the\nidentity of the generated audio, i.e., they were not identity-agnostic, hindering their\neffectiveness. To address this limitation, we propose a novel two-stage\nframework for person-agnostic video cloning, specifically focusing on TTV\ngeneration. In the first stage, we leverage pretrained zero-shot models to\nachieve text-to-speech (TTS) conversion. In the second stage, an audio-driven\ntalking head generation method is employed to produce compelling videos\nprovided with the audio generated in the first stage. This paper presents a\ncomparative analysis of different TTS and audio-driven talking head generation\nmethods, identifying the most promising approach for future research and\ndevelopment. Some audio and video samples can be found at the following link:\nhttps://github.com/ZhichaoWang970201/Text-to-Video/tree/main.\n","authors":["Zhichao Wang","Mengyu Dai","Keld Lundgaard"],"pdf_url":"https://arxiv.org/pdf/2308.06457v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2308.06454v1","updated":"2023-08-12T03:23:09Z","published":"2023-08-12T03:23:09Z","title":"Demonstration-based learning for few-shot biomedical named entity\n recognition under machine reading comprehension","summary":" Although deep learning techniques have shown significant achievements, they\nfrequently depend on extensive amounts of hand-labeled data and tend to perform\ninadequately in few-shot scenarios. The objective of this study is to devise a\nstrategy that can improve the model's capability to recognize biomedical\nentities in scenarios of few-shot learning. By redefining biomedical named\nentity recognition (BioNER) as a machine reading comprehension (MRC) problem,\nwe propose a demonstration-based learning method to address few-shot BioNER,\nwhich involves constructing appropriate task demonstrations. 
In assessing our\nproposed method, we compared the proposed method with existing advanced methods\nusing six benchmark datasets, including BC4CHEMD, BC5CDR-Chemical,\nBC5CDR-Disease, NCBI-Disease, BC2GM, and JNLPBA. We examined the models'\nefficacy by reporting F1 scores from both the 25-shot and 50-shot learning\nexperiments. In 25-shot learning, we observed 1.1% improvements in the average\nF1 scores compared to the baseline method, reaching 61.7%, 84.1%, 69.1%, 70.1%,\n50.6%, and 59.9% on six datasets, respectively. In 50-shot learning, we further\nimproved the average F1 scores by 1.0% compared to the baseline method,\nreaching 73.1%, 86.8%, 76.1%, 75.6%, 61.7%, and 65.4%, respectively. We\nreported that in the realm of few-shot learning BioNER, MRC-based language\nmodels are much more proficient in recognizing biomedical entities compared to\nthe sequence labeling approach. Furthermore, our MRC-language models can\ncompete successfully with fully-supervised learning methodologies that rely\nheavily on the availability of abundant annotated data. These results highlight\npossible pathways for future advancements in few-shot BioNER methodologies.\n","authors":["Leilei Su","Jian Chen","Yifan Peng","Cong Sun"],"pdf_url":"https://arxiv.org/pdf/2308.06454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06450v1","updated":"2023-08-12T03:05:44Z","published":"2023-08-12T03:05:44Z","title":"Simple Model Also Works: A Novel Emotion Recognition Network in Textual\n Conversation Based on Curriculum Learning Strategy","summary":" Emotion Recognition in Conversation (ERC) has emerged as a research hotspot\nin domains such as conversational robots and question-answer systems. How to\nefficiently and adequately retrieve contextual emotional cues has been one of\nthe key challenges in the ERC task. Existing efforts do not fully model the\ncontext and employ complex network structures, resulting in excessive\ncomputational resource overhead without substantial performance improvement. In\nthis paper, we propose a novel Emotion Recognition Network based on Curriculum\nLearning strategy (ERNetCL). The proposed ERNetCL primarily consists of\nTemporal Encoder (TE), Spatial Encoder (SE), and Curriculum Learning (CL) loss.\nWe utilize TE and SE to combine the strengths of previous methods in a\nsimplistic manner to efficiently capture temporal and spatial contextual\ninformation in the conversation. To simulate the way humans learn curriculum\nfrom easy to hard, we apply the idea of CL to the ERC task to progressively\noptimize the network parameters of ERNetCL. At the beginning of training, we\nassign lower learning weights to difficult samples. As the epoch increases, the\nlearning weights for these samples are gradually raised. Extensive experiments\non four datasets exhibit that our proposed method is effective and dramatically\nbeats other baseline models.\n","authors":["Jiang Li","Xiaoping Wang","Yingjian Liu","Qing Zhou","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.06450v1.pdf","comment":"12 pages,9 figures"},{"id":"http://arxiv.org/abs/2302.12324v3","updated":"2023-08-12T03:00:55Z","published":"2023-02-23T20:39:06Z","title":"Summaries as Captions: Generating Figure Captions for Scientific\n Documents with Automated Text Summarization","summary":" Good figure captions help paper readers understand complex scientific\nfigures. Unfortunately, even published papers often have poorly written\ncaptions. 
Automatic caption generation could aid paper writers by providing\ngood starting captions that can be refined for better quality. Prior work often\ntreated figure caption generation as a vision-to-language task. In this paper,\nwe show that it can be more effectively tackled as a text summarization task in\nscientific documents. We fine-tuned PEGASUS, a pre-trained abstractive\nsummarization model, to specifically summarize figure-referencing paragraphs\n(e.g., \"Figure 3 shows...\") into figure captions. Experiments on large-scale\narXiv figures show that our method outperforms prior vision methods in both\nautomatic and human evaluations. We further conducted an in-depth investigation\nfocused on two key challenges: (i) the common presence of low-quality\nauthor-written captions and (ii) the lack of clear standards for good captions.\nOur code and data are available at:\nhttps://github.com/Crowd-AI-Lab/Generating-Figure-Captions-as-a-Text-Summarization-Task.\n","authors":["Chieh-Yang Huang","Ting-Yao Hsu","Ryan Rossi","Ani Nenkova","Sungchul Kim","Gromit Yeuk-Yin Chan","Eunyee Koh","Clyde Lee Giles","Ting-Hao 'Kenneth' Huang"],"pdf_url":"https://arxiv.org/pdf/2302.12324v3.pdf","comment":"Accepted by INLG-2023"},{"id":"http://arxiv.org/abs/2308.06431v1","updated":"2023-08-12T01:34:41Z","published":"2023-08-12T01:34:41Z","title":"Performance Prediction for Multi-hop Questions","summary":" We study the problem of Query Performance Prediction (QPP) for open-domain\nmulti-hop Question Answering (QA), where the task is to estimate the difficulty\nof evaluating a multi-hop question over a corpus. Despite the extensive\nresearch on predicting the performance of ad-hoc and QA retrieval models, there\nhas been a lack of study on the estimation of the difficulty of multi-hop\nquestions. The problem is challenging due to the multi-step nature of the\nretrieval process, potential dependency of the steps and the reasoning\ninvolved. To tackle this challenge, we propose multHP, a novel pre-retrieval\nmethod for predicting the performance of open-domain multi-hop questions. Our\nextensive evaluation on the largest multi-hop QA dataset using several modern\nQA systems shows that the proposed model is a strong predictor of the\nperformance, outperforming traditional single-hop QPP models. Additionally, we\ndemonstrate that our approach can be effectively used to optimize the\nparameters of QA systems, such as the number of documents to be retrieved,\nresulting in improved overall retrieval performance.\n","authors":["Mohammadreza Samadi","Davood Rafiei"],"pdf_url":"https://arxiv.org/pdf/2308.06431v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.04711v2","updated":"2023-08-12T00:02:15Z","published":"2023-08-09T05:06:39Z","title":"Answering Unseen Questions With Smaller Language Models Using Rationale\n Generation and Dense Retrieval","summary":" When provided with sufficient explanatory context, smaller Language Models\nhave been shown to exhibit strong reasoning ability on challenging short-answer\nquestion-answering tasks where the questions are unseen in training. We\nevaluate two methods for further improvement in this setting. Both methods\nfocus on combining rationales generated by a larger Language Model with longer\ncontexts created from a multi-hop dense retrieval system. The first method\n($\\textit{RR}$) involves training a Rationale Ranking model to score both\ngenerated rationales and retrieved contexts with respect to relevance and\ntruthfulness. 
We then use the scores to derive combined contexts from both\nknowledge sources using a number of combinatory strategies. For the second\nmethod ($\\textit{RATD}$) we train a smaller Reasoning model using\nretrieval-augmented training datasets such that it becomes proficient at\nutilising relevant information from longer text sequences that may be only\npartially evidential and frequently contain many irrelevant sentences.\nGenerally we find that both methods are effective but that the $\\textit{RATD}$\nmethod is more straightforward to apply and produces the strongest results in\nthe unseen setting on which we focus. Our single best Reasoning model using\nonly 440 million parameters materially improves upon strong comparable prior\nbaselines for unseen evaluation datasets (StrategyQA 58.9 $\\rightarrow$ 61.7\nacc., CommonsenseQA 63.6 $\\rightarrow$ 72.7 acc., ARC-DA 31.6 $\\rightarrow$\n52.1 F1, IIRC 25.5 $\\rightarrow$ 27.3 F1) and a version utilising our prior\nknowledge of each type of question in selecting a context combination strategy\ndoes even better. Our proposed models also generally outperform direct prompts\nagainst much larger models (BLOOM 175B and StableVicuna 13B) in both few-shot\nchain-of-thought and few-shot answer-only settings.\n","authors":["Tim Hartill","Diana Benavides-Prado","Michael Witbrock","Patricia J. Riddle"],"pdf_url":"https://arxiv.org/pdf/2308.04711v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07342v1","updated":"2023-08-12T16:45:39Z","published":"2023-08-12T16:45:39Z","title":"Emergent communication for AR","summary":" Mobile augmented reality (MAR) is widely acknowledged as one of the\nubiquitous interfaces to the digital twin and Metaverse, demanding unparalleled\nlevels of latency, computational power, and energy efficiency. The existing\nsolutions for realizing MAR combine multiple technologies like edge, cloud\ncomputing, and fifth-generation (5G) networks. However, the inherent\ncommunication latency of visual data imposes apparent limitations on the\nquality of experience (QoE). To address the challenge, we propose an emergent\nsemantic communication framework to learn the communication protocols in MAR.\nSpecifically, we train two agents through a modified Lewis signaling game to\nemerge a discrete communication protocol spontaneously. Based on this protocol,\ntwo agents can communicate about the abstract idea of visual data through\nmessages with extremely small data sizes in a noisy channel, which leads to\nmessage errors. To better simulate real-world scenarios, we incorporate channel\nuncertainty into our training process. Experiments have shown that the proposed\nscheme has better generalization on unseen objects than traditional object\nrecognition used in MAR and can effectively enhance communication efficiency\nthrough the utilization of small-size messages.\n","authors":["Ruxiao Chen","Shuaishuai Guo"],"pdf_url":"https://arxiv.org/pdf/2308.07342v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.06556v1","updated":"2023-08-12T12:58:39Z","published":"2023-08-12T12:58:39Z","title":"Contrastive Learning for Cross-modal Artist Retrieval","summary":" Music retrieval and recommendation applications often rely on content\nfeatures encoded as embeddings, which provide vector representations of items\nin a music dataset. Numerous complementary embeddings can be derived from\nprocessing items originally represented in several modalities, e.g., audio\nsignals, user interaction data, or editorial data. 
However, data of any given\nmodality might not be available for all items in any music dataset. In this\nwork, we propose a method based on contrastive learning to combine embeddings\nfrom multiple modalities and explore the impact of the presence or absence of\nembeddings from diverse modalities in an artist similarity task. Experiments on\ntwo datasets suggest that our contrastive method outperforms single-modality\nembeddings and baseline algorithms for combining modalities, both in terms of\nartist retrieval accuracy and coverage. Improvements with respect to other\nmethods are particularly significant for less popular query artists. We\ndemonstrate that our method successfully combines complementary information from\ndiverse modalities, and is more robust to missing modality data (i.e., it\nbetter handles the retrieval of artists with different modality embeddings than\nthe query artist's).\n","authors":["Andres Ferraro","Jaehun Kim","Sergio Oramas","Andreas Ehmann","Fabien Gouyon"],"pdf_url":"https://arxiv.org/pdf/2308.06556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06480v1","updated":"2023-08-12T06:23:41Z","published":"2023-08-12T06:23:41Z","title":"Context-aware Event Forecasting via Graph Disentanglement","summary":" Event forecasting has been a demanding and challenging task throughout\nhuman history. It plays a pivotal role in crisis alerting and disaster\nprevention across many aspects of society. The task of event\nforecasting aims to model the relational and temporal patterns based on\nhistorical events and to forecast what will happen in the future. Most\nexisting studies on event forecasting formulate it as a problem of link\nprediction on temporal event graphs. However, such a purely structured formulation\nsuffers from two main limitations: 1) most events fall into general and\nhigh-level types in the event ontology, and therefore they tend to be\ncoarse-grained and offer little utility, which inevitably harms the forecasting\naccuracy; and 2) the events defined by a fixed ontology are unable to retain\nthe out-of-ontology contextual information. To address these limitations, we\npropose a novel task of context-aware event forecasting, which incorporates\nauxiliary contextual information. First, the categorical context provides\nsupplementary fine-grained information to the coarse-grained events. Second and\nmore importantly, the context provides additional information about the specific\nsituation and condition, which is crucial or even decisive for what will\nhappen next. However, it is challenging to properly integrate context into the\nevent forecasting framework, considering the complex patterns in the\nmulti-context scenario. Towards this end, we design a novel framework named\nSeparation and Collaboration Graph Disentanglement (short as SeCoGD) for\ncontext-aware event forecasting. 
Since there is no available dataset for this\nnovel task, we construct three large-scale datasets based on GDELT.\nExperimental results demonstrate that our model outperforms a list of SOTA\nmethods.\n","authors":["Yunshan Ma","Chenchen Ye","Zijian Wu","Xiang Wang","Yixin Cao","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2308.06480v1.pdf","comment":"KDD 2023, 9 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2302.09971v3","updated":"2023-08-12T02:28:40Z","published":"2023-02-20T13:30:12Z","title":"Social4Rec: Distilling User Preference from Social Graph for Video\n Recommendation in Tencent","summary":" Although recommender systems play a key role in network content platforms,\nmining users' interests is still a significant challenge. Existing works\npredict the user interest by utilizing user behaviors, i.e., clicks, views,\netc., but current solutions are ineffective when users perform unsettled\nactivities. The latter involve new users, who have few activities of any\nkind, and sparse users, who have low-frequency behaviors. We uniformly describe\nboth of these user types as \"cold users\", which are very common but often\nneglected in network content platforms. To address this issue, we enhance the\nrepresentation of the user interest by combining their social interest, e.g.,\nfriendship, following bloggers, interest groups, etc., with the activity\nbehaviors. Thus, in this work, we present a novel algorithm entitled SocialNet,\nwhich adopts a two-stage method to progressively extract the coarse-grained and\nfine-grained social interest. Our technique then concatenates SocialNet's\noutput with the original user representation to get the final user\nrepresentation that combines behavior interests and social interests. Offline\nexperiments on Tencent video's recommender system demonstrate its superiority\nover the baseline behavior-based model. The online experiment also shows a\nsignificant performance improvement in clicks and view time in the real-world\nrecommendation system. The source code is available at\nhttps://github.com/Social4Rec/SocialNet.\n","authors":["Xuanji Xiao","Huaqiang Dai","Qian Dong","Shuzi Niu","Yuzhen Liu","Pei Liu"],"pdf_url":"https://arxiv.org/pdf/2302.09971v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06431v1","updated":"2023-08-12T01:34:41Z","published":"2023-08-12T01:34:41Z","title":"Performance Prediction for Multi-hop Questions","summary":" We study the problem of Query Performance Prediction (QPP) for open-domain\nmulti-hop Question Answering (QA), where the task is to estimate the difficulty\nof evaluating a multi-hop question over a corpus. Despite the extensive\nresearch on predicting the performance of ad-hoc and QA retrieval models, there\nhas been a lack of study on the estimation of the difficulty of multi-hop\nquestions. The problem is challenging due to the multi-step nature of the\nretrieval process, potential dependency of the steps and the reasoning\ninvolved. To tackle this challenge, we propose multHP, a novel pre-retrieval\nmethod for predicting the performance of open-domain multi-hop questions. Our\nextensive evaluation on the largest multi-hop QA dataset using several modern\nQA systems shows that the proposed model is a strong predictor of the\nperformance, outperforming traditional single-hop QPP models. 
Additionally, we\ndemonstrate that our approach can be effectively used to optimize the\nparameters of QA systems, such as the number of documents to be retrieved,\nresulting in improved overall retrieval performance.\n","authors":["Mohammadreza Samadi","Davood Rafiei"],"pdf_url":"https://arxiv.org/pdf/2308.06431v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.08460v1","updated":"2023-08-12T23:08:15Z","published":"2023-08-12T23:08:15Z","title":"Stationary Algorithmic Balancing For Dynamic Email Re-Ranking Problem","summary":" Email platforms need to generate personalized rankings of emails that satisfy\nuser preferences, which may vary over time. We approach this as a\nrecommendation problem based on three criteria: closeness (how relevant the\nsender and topic are to the user), timeliness (how recent the email is), and\nconciseness (how brief the email is). We propose MOSR (Multi-Objective\nStationary Recommender), a novel online algorithm that uses an adaptive control\nmodel to dynamically balance these criteria and adapt to preference changes. We\nevaluate MOSR on the Enron Email Dataset, a large collection of real emails,\nand compare it with other baselines. The results show that MOSR achieves better\nperformance, especially under non-stationary preferences, where users value\ndifferent criteria more or less over time. We also test MOSR's robustness on a\nsmaller down-sampled dataset that exhibits high variance in email\ncharacteristics, and show that it maintains stable rankings across different\nsamples. Our work offers novel insights into how to design email re-ranking\nsystems that account for multiple objectives impacting user satisfaction.\n","authors":["Jiayi Liu","Jennifer Neville"],"pdf_url":"https://arxiv.org/pdf/2308.08460v1.pdf","comment":"Published in KDD'23"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.06464v1","updated":"2023-08-12T04:51:04Z","published":"2023-08-12T04:51:04Z","title":"A One-dimensional HEVC video steganalysis method using the Optimality of\n Predicted Motion Vectors","summary":" Among steganalysis techniques, detection against motion vector (MV)\ndomain-based video steganography in High Efficiency Video Coding (HEVC)\nstandard remains a hot and challenging issue. For the purpose of improving the\ndetection performance, this paper proposes a steganalysis feature based on the\noptimality of predicted MVs with a dimension of one. Firstly, we point out that\nthe motion vector prediction (MVP) of the prediction unit (PU) encoded using\nthe Advanced Motion Vector Prediction (AMVP) technique satisfies the local\noptimality in the cover video. Secondly, we analyze that in HEVC video, message\nembedding either using MVP index or motion vector differences (MVD) may destroy\nthe above optimality of MVP. And then, we define the optimal rate of MVP in\nHEVC video as a steganalysis feature. Finally, we conduct steganalysis\ndetection experiments on two general datasets for three popular steganography\nmethods and compare the performance with four state-of-the-art steganalysis\nmethods. The experimental results show that the proposed optimal rate of MVP\nfor all cover videos is 100\\%, while the optimal rate of MVP for all stego\nvideos is less than 100\\%. 
Therefore, the proposed steganography scheme can\naccurately distinguish between cover videos and stego videos, and it is\nefficiently applied to practical scenarios with no model training and low\ncomputational complexity.\n","authors":["Jun Li","Minqing Zhang","Ke Niu","Yingnan Zhang","Xiaoyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2308.06464v1.pdf","comment":"Submitted to TCSVT"}]},"2023-08-15T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.07922v1","updated":"2023-08-15T17:59:18Z","published":"2023-08-15T17:59:18Z","title":"RAVEN: In-Context Learning with Retrieval Augmented Encoder-Decoder\n Language Models","summary":" In this paper, we investigate the in-context learning ability of\nretrieval-augmented encoder-decoder language models. We first conduct a\ncomprehensive analysis of the state-of-the-art ATLAS model and identify its\nlimitations in in-context learning, primarily due to a mismatch between\npretraining and testing, as well as a restricted context length. To address\nthese issues, we propose RAVEN, a model that combines retrieval-augmented\nmasked language modeling and prefix language modeling. We further introduce\nFusion-in-Context Learning to enhance the few-shot performance by enabling the\nmodel to leverage more in-context examples without requiring additional\ntraining or model modifications. Through extensive experiments, we demonstrate\nthat RAVEN significantly outperforms ATLAS and achieves results comparable to\nthe most advanced language models in certain scenarios, despite having\nsubstantially fewer parameters. Our work underscores the potential of\nretrieval-augmented encoder-decoder language models for in-context learning and\nencourages further research in this direction.\n","authors":["Jie Huang","Wei Ping","Peng Xu","Mohammad Shoeybi","Kevin Chen-Chuan Chang","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2308.07922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07921v1","updated":"2023-08-15T17:58:45Z","published":"2023-08-15T17:58:45Z","title":"Solving Challenging Math Word Problems Using GPT-4 Code Interpreter with\n Code-based Self-Verification","summary":" Recent progress in large language models (LLMs) like GPT-4 and PaLM-2 has\nbrought significant advancements in addressing math reasoning problems. In\nparticular, OpenAI's latest version of GPT-4, known as GPT-4 Code Interpreter,\nshows remarkable performance on challenging math datasets. In this paper, we\nexplore the effect of code on enhancing LLMs' reasoning capability by\nintroducing different constraints on the \\textit{Code Usage Frequency} of GPT-4\nCode Interpreter. We found that its success can be largely attributed to its\npowerful skills in generating and executing code, evaluating the output of code\nexecution, and rectifying its solution when receiving unreasonable outputs.\nBased on this insight, we propose a novel and effective prompting method,\nexplicit \\uline{c}ode-based \\uline{s}elf-\\uline{v}erification~(CSV), to further\nboost the mathematical reasoning potential of GPT-4 Code Interpreter. This\nmethod employs a zero-shot prompt on GPT-4 Code Interpreter to encourage it to\nuse code to self-verify its answers. In instances where the verification state\nregisters as ``False'', the model shall automatically amend its solution,\nanalogous to our approach of rectifying errors during a mathematics\nexamination. 
Furthermore, we recognize that the states of the verification\nresult indicate the confidence of a solution, which can improve the\neffectiveness of majority voting. With GPT-4 Code Interpreter and CSV, we\nachieve an impressive zero-shot accuracy on MATH dataset \\textbf{(53.9\\% $\\to$\n84.3\\%)}.\n","authors":["Aojun Zhou","Ke Wang","Zimu Lu","Weikang Shi","Sichun Luo","Zipeng Qin","Shaoqing Lu","Anya Jia","Linqi Song","Mingjie Zhan","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.07921v1.pdf","comment":"Solving Challenging Math Word Problems Using GPT-4 Code Interpreter\n with Code-based Self-Verification"},{"id":"http://arxiv.org/abs/2308.07902v1","updated":"2023-08-15T17:40:34Z","published":"2023-08-15T17:40:34Z","title":"Through the Lens of Core Competency: Survey on Evaluation of Large\n Language Models","summary":" From pre-trained language model (PLM) to large language model (LLM), the\nfield of natural language processing (NLP) has witnessed steep performance\ngains and wide practical uses. The evaluation of a research field guides its\ndirection of improvement. However, LLMs are extremely hard to thoroughly\nevaluate for two reasons. First of all, traditional NLP tasks become inadequate\ndue to the excellent performance of LLM. Secondly, existing evaluation tasks\nare difficult to keep up with the wide range of applications in real-world\nscenarios. To tackle these problems, existing works proposed various benchmarks\nto better evaluate LLMs. To clarify the numerous evaluation tasks in both\nacademia and industry, we investigate multiple papers concerning LLM\nevaluations. We summarize 4 core competencies of LLM, including reasoning,\nknowledge, reliability, and safety. For every competency, we introduce its\ndefinition, corresponding benchmarks, and metrics. Under this competency\narchitecture, similar tasks are combined to reflect corresponding ability,\nwhile new tasks can also be easily added into the system. Finally, we give our\nsuggestions on the future direction of LLM's evaluation.\n","authors":["Ziyu Zhuang","Qiguang Chen","Longxuan Ma","Mingda Li","Yi Han","Yushan Qian","Haopeng Bai","Zixian Feng","Weinan Zhang","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2308.07902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07899v1","updated":"2023-08-15T17:40:10Z","published":"2023-08-15T17:40:10Z","title":"The Regular Expression Inference Challenge","summary":" We propose \\emph{regular expression inference (REI)} as a challenge for\ncode/language modelling, and the wider machine learning community. REI is a\nsupervised machine learning (ML) and program synthesis task, and poses the\nproblem of finding minimal regular expressions from examples: Given two finite\nsets of strings $P$ and $N$ and a cost function $\\text{cost}(\\cdot)$, the task\nis to generate an expression $r$ that accepts all strings in $P$ and rejects\nall strings in $N$, while no other such expression $r'$ exists with\n$\\text{cost}(r')<\\text{cost}(r)$.\n REI has advantages as a challenge problem: (i) regular expressions are\nwell-known, widely used, and a natural idealisation of code; (ii) REI's\nasymptotic worst-case complexity is well understood; (iii) REI has a small\nnumber of easy to understand parameters (e.g.~$P$ or $N$ cardinality, string\nlengths of examples, or the cost function); this lets us easily finetune\nREI-hardness; (iv) REI is an unsolved problem for deep learning based ML.\n Recently, an REI solver was implemented on GPUs, using program synthesis\ntechniques. 
This enabled, for the first time, fast generation of minimal\nexpressions for complex REI instances. Building on this advance, we generate\nand publish the first large-scale datasets for REI, and devise and evaluate\nseveral initial heuristic and machine learning baselines.\n We invite the community to participate and explore ML methods that learn to\nsolve REI problems. We believe that progress in REI directly translates to\ncode/language modelling.\n","authors":["Mojtaba Valizadeh","Philip John Gorinski","Ignacio Iacobacci","Martin Berger"],"pdf_url":"https://arxiv.org/pdf/2308.07899v1.pdf","comment":"7 pages, 3 pages appendix, 6 tables"},{"id":"http://arxiv.org/abs/2308.07891v1","updated":"2023-08-15T17:33:24Z","published":"2023-08-15T17:33:24Z","title":"Link-Context Learning for Multimodal LLMs","summary":" The ability to learn from context with novel concepts, and deliver\nappropriate responses are essential in human conversations. Despite current\nMultimodal Large Language Models (MLLMs) and Large Language Models (LLMs) being\ntrained on mega-scale datasets, recognizing unseen images or understanding\nnovel concepts in a training-free manner remains a challenge. In-Context\nLearning (ICL) explores training-free few-shot learning, where models are\nencouraged to ``learn to learn\" from limited tasks and generalize to unseen\ntasks. In this work, we propose link-context learning (LCL), which emphasizes\n\"reasoning from cause and effect\" to augment the learning capabilities of\nMLLMs. LCL goes beyond traditional ICL by explicitly strengthening the causal\nrelationship between the support set and the query set. By providing\ndemonstrations with causal links, LCL guides the model to discern not only the\nanalogy but also the underlying causal associations between data points, which\nempowers MLLMs to recognize unseen images and understand novel concepts more\neffectively. To facilitate the evaluation of this novel approach, we introduce\nthe ISEKAI dataset, comprising exclusively of unseen generated image-label\npairs designed for link-context learning. Extensive experiments show that our\nLCL-MLLM exhibits strong link-context learning capabilities to novel concepts\nover vanilla MLLMs. Code and data will be released at\nhttps://github.com/isekai-portal/Link-Context-Learning.\n","authors":["Yan Tai","Weichen Fan","Zhao Zhang","Feng Zhu","Rui Zhao","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.07891v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.07889v1","updated":"2023-08-15T17:30:57Z","published":"2023-08-15T17:30:57Z","title":"A Comprehensive Study on Knowledge Graph Embedding over Relational\n Patterns Based on Rule Learning","summary":" Knowledge Graph Embedding (KGE) has proven to be an effective approach to\nsolving the Knowledge Graph Completion (KGC) task. Relational patterns which\nrefer to relations with specific semantics exhibiting graph patterns are an\nimportant factor in the performance of KGE models. Though KGE models'\ncapabilities are analyzed over different relational patterns in theory and a\nrough connection between better relational patterns modeling and better\nperformance of KGC has been built, a comprehensive quantitative analysis on KGE\nmodels over relational patterns remains absent so it is uncertain how the\ntheoretical support of KGE to a relational pattern contributes to the\nperformance of triples associated to such a relational pattern. 
To address this\nchallenge, we evaluate the performance of 7 KGE models over 4 common relational\npatterns on 2 benchmarks, then conduct an analysis from three aspects, theory, entity frequency,\nand part-to-whole, and reach some counterintuitive conclusions.\nFinally, we introduce a training-free method, Score-based Patterns Adaptation\n(SPA), to enhance KGE models' performance over various relational patterns. This\napproach is simple yet effective and can be applied to KGE models without\nadditional training. Our experimental results demonstrate that our method\ngenerally enhances performance over specific relational patterns. Our source\ncode is available on GitHub at\nhttps://github.com/zjukg/Comprehensive-Study-over-Relational-Patterns.\n","authors":["Long Jin","Zhen Yao","Mingyang Chen","Huajun Chen","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07889v1.pdf","comment":"This paper is accepted by ISWC 2023"},{"id":"http://arxiv.org/abs/2305.18144v2","updated":"2023-08-15T17:10:57Z","published":"2023-05-29T15:15:53Z","title":"GripRank: Bridging the Gap between Retrieval and Generation via the\n Generative Knowledge Improved Passage Ranking","summary":" Retrieval-enhanced text generation has shown remarkable progress on\nknowledge-intensive language tasks, such as open-domain question answering and\nknowledge-enhanced dialogue generation, by leveraging passages retrieved from a\nlarge passage corpus for delivering a proper answer given the input query.\nHowever, the retrieved passages are not ideal for guiding answer generation\nbecause of the discrepancy between retrieval and generation, i.e., the\ncandidate passages are all treated equally during the retrieval procedure\nwithout considering their potential to generate a proper answer. This\ndiscrepancy makes a passage retriever deliver a sub-optimal collection of\ncandidate passages to generate the answer. In this paper, we propose the\nGeneRative Knowledge Improved Passage Ranking (GripRank) approach, addressing\nthe above challenge by distilling knowledge from a generative passage estimator\n(GPE) to a passage ranker, where the GPE is a generative language model used to\nmeasure how likely the candidate passages are to generate the proper answer. We\nrealize the distillation procedure by teaching the passage ranker to learn to\nrank the passages ordered by the GPE. Furthermore, we improve the distillation\nquality by devising a curriculum knowledge distillation mechanism, which allows\nthe knowledge provided by the GPE to be progressively distilled to the ranker\nthrough an easy-to-hard curriculum, enabling the passage ranker to correctly\nrecognize the provenance of the answer from many plausible candidates. We\nconduct extensive experiments on four datasets across three knowledge-intensive\nlanguage tasks. Experimental results show advantages over the state-of-the-art\nmethods for both passage ranking and answer generation on the KILT benchmark.\n","authors":["Jiaqi Bai","Hongcheng Guo","Jiaheng Liu","Jian Yang","Xinnian Liang","Zhao Yan","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2305.18144v2.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2308.07876v1","updated":"2023-08-15T16:41:53Z","published":"2023-08-15T16:41:53Z","title":"Synthesizing Political Zero-Shot Relation Classification via Codebook\n Knowledge, NLI, and ChatGPT","summary":" Recent supervised models for event coding vastly outperform pattern-matching\nmethods. 
However, their reliance solely on new annotations disregards the vast\nknowledge within expert databases, hindering their applicability to\nfine-grained classification. To address these limitations, we explore zero-shot\napproaches for political event ontology relation classification, by leveraging\nknowledge from established annotation codebooks. Our study encompasses both\nChatGPT and a novel natural language inference (NLI) based approach named ZSP.\nZSP adopts a tree-query framework that deconstructs the task into context,\nmodality, and class disambiguation levels. This framework improves\ninterpretability, efficiency, and adaptability to schema changes. By conducting\nextensive experiments on our newly curated datasets, we pinpoint the\ninstability issues within ChatGPT and highlight the superior performance of\nZSP. ZSP achieves an impressive 40% improvement in F1 score for fine-grained\nRootcode classification. ZSP demonstrates competitive performance compared to\nsupervised BERT models, positioning it as a valuable tool for event record\nvalidation and ontology development. Our work underscores the potential of\nleveraging transfer learning and existing expertise to enhance the efficiency\nand scalability of research in the field.\n","authors":["Yibo Hu","Erick Skorupa Parolin","Latifur Khan","Patrick T. Brandt","Javier Osorio","Vito J. D'Orazio"],"pdf_url":"https://arxiv.org/pdf/2308.07876v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2307.05695v3","updated":"2023-08-15T16:41:13Z","published":"2023-07-11T18:02:09Z","title":"Stack More Layers Differently: High-Rank Training Through Low-Rank\n Updates","summary":" Despite the dominance and effectiveness of scaling, resulting in large\nnetworks with hundreds of billions of parameters, the necessity to train\noverparametrized models remains poorly understood, and alternative approaches\ndo not necessarily make it cheaper to train high-performance models. In this\npaper, we explore low-rank training techniques as an alternative approach to\ntraining large neural networks. We introduce a novel method called ReLoRA,\nwhich utilizes low-rank updates to train high-rank networks. We apply ReLoRA to\npre-training transformer language models with up to 350M parameters and\ndemonstrate comparable performance to regular neural network training.\nFurthermore, we observe that the efficiency of ReLoRA increases with model\nsize, making it a promising approach for training multi-billion-parameter\nnetworks efficiently. Our findings shed light on the potential of low-rank\ntraining techniques and their implications for scaling laws.\n","authors":["Vladislav Lialin","Namrata Shivagunde","Sherin Muckatira","Anna Rumshisky"],"pdf_url":"https://arxiv.org/pdf/2307.05695v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07871v1","updated":"2023-08-15T16:39:10Z","published":"2023-08-15T16:39:10Z","title":"Emotion Embeddings $\\unicode{x2014}$ Learning Stable and Homogeneous\n Abstractions from Heterogeneous Affective Datasets","summary":" Human emotion is expressed in many communication modalities and media formats\nand so their computational study is equally diversified into natural language\nprocessing, audio signal analysis, computer vision, etc. Similarly, the large\nvariety of representation formats used in previous research to describe\nemotions (polarity scales, basic emotion categories, dimensional approaches,\nappraisal theory, etc.) 
have led to an ever proliferating diversity of\ndatasets, predictive models, and software tools for emotion analysis. Because\nof these two distinct types of heterogeneity, at the expressional and\nrepresentational level, there is a dire need to unify previous work on\nincreasingly diverging data and label types. This article presents such a\nunifying computational model. We propose a training procedure that learns a\nshared latent representation for emotions, so-called emotion embeddings,\nindependent of different natural languages, communication modalities, media or\nrepresentation label formats, and even disparate model architectures.\nExperiments on a wide range of heterogeneous affective datasets indicate that\nthis approach yields the desired interoperability for the sake of reusability,\ninterpretability and flexibility, without penalizing prediction quality. Code\nand data are archived under https://doi.org/10.5281/zenodo.7405327 .\n","authors":["Sven Buechel","Udo Hahn"],"pdf_url":"https://arxiv.org/pdf/2308.07871v1.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2306.08456v2","updated":"2023-08-15T16:21:47Z","published":"2023-06-14T11:57:31Z","title":"PoetryDiffusion: Towards Joint Semantic and Metrical Manipulation in\n Poetry Generation","summary":" Controllable text generation is a challenging and meaningful field in natural\nlanguage generation (NLG). Especially, poetry generation is a typical one with\nwell-defined and strict conditions for text generation which is an ideal\nplayground for the assessment of current methodologies. While prior works\nsucceeded in controlling either semantic or metrical aspects of poetry\ngeneration, simultaneously addressing both remains a challenge. In this paper,\nwe pioneer the use of the Diffusion model for generating sonnets and Chinese\nSongCi poetry to tackle such challenges. In terms of semantics, our\nPoetryDiffusion model, built upon the Diffusion model, generates entire\nsentences or poetry by comprehensively considering the entirety of sentence\ninformation. This approach enhances semantic expression, distinguishing it from\nautoregressive and large language models (LLMs). For metrical control, the\nseparation feature of diffusion generation and its constraint control module\nenable us to flexibly incorporate a novel metrical controller to manipulate and\nevaluate metrics (format and rhythm). The denoising process in PoetryDiffusion\nallows for gradual enhancement of semantics and flexible integration of the\nmetrical controller which can calculate and impose penalties on states that\nstray significantly from the target control distribution. Experimental results\non two datasets demonstrate that our model outperforms existing models in\nautomatic evaluation of semantic, metrical, and overall performance as well as\nhuman evaluation.\n","authors":["Zhiyuan Hu","Chumin Liu","Yue Feng","Anh Tuan Luu","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2306.08456v2.pdf","comment":"9 Pages"},{"id":"http://arxiv.org/abs/2305.17401v3","updated":"2023-08-15T16:02:43Z","published":"2023-05-27T07:59:49Z","title":"A Framework For Refining Text Classification and Object Recognition from\n Academic Articles","summary":" With the widespread use of the internet, it has become increasingly crucial\nto extract specific information from vast amounts of academic articles\nefficiently. 
Data mining techniques are generally employed to solve this issue.\nHowever, data mining for academic articles is challenging since it requires\nautomatically extracting specific patterns in complex and unstructured layout\ndocuments. Current data mining methods for academic articles employ\nrule-based(RB) or machine learning(ML) approaches. However, using rule-based\nmethods incurs a high coding cost for complex typesetting articles. On the\nother hand, simply using machine learning methods requires annotation work for\ncomplex content types within the paper, which can be costly. Furthermore, only\nusing machine learning can lead to cases where patterns easily recognized by\nrule-based methods are mistakenly extracted. To overcome these issues, from the\nperspective of analyzing the standard layout and typesetting used in the\nspecified publication, we emphasize implementing specific methods for specific\ncharacteristics in academic articles. We have developed a novel Text Block\nRefinement Framework (TBRF), a machine learning and rule-based scheme hybrid.\nWe used the well-known ACL proceeding articles as experimental data for the\nvalidation experiment. The experiment shows that our approach achieved over 95%\nclassification accuracy and 90% detection accuracy for tables and figures.\n","authors":["Jinghong Li","Koichi Ota","Wen Gu","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2305.17401v3.pdf","comment":"This paper has been accepted at 'The International Symposium on\n Innovations in Intelligent Systems and Applications 2023 (INISTA 2023)'"},{"id":"http://arxiv.org/abs/2305.04003v3","updated":"2023-08-15T15:09:02Z","published":"2023-05-06T10:36:39Z","title":"ANTONIO: Towards a Systematic Method of Generating NLP Benchmarks for\n Verification","summary":" Verification of machine learning models used in Natural Language Processing\n(NLP) is known to be a hard problem. In particular, many known neural network\nverification methods that work for computer vision and other numeric datasets\ndo not work for NLP. Here, we study technical reasons that underlie this\nproblem. Based on this analysis, we propose practical methods and heuristics\nfor preparing NLP datasets and models in a way that renders them amenable to\nknown verification methods based on abstract interpretation. We implement these\nmethods as a Python library called ANTONIO that links to the neural network\nverifiers ERAN and Marabou. We perform evaluation of the tool using an NLP\ndataset R-U-A-Robot suggested as a benchmark for verifying legally critical NLP\napplications. We hope that, thanks to its general applicability, this work will\nopen novel possibilities for including NLP verification problems into neural\nnetwork verification competitions, and will popularise NLP problems within this\ncommunity.\n","authors":["Marco Casadio","Luca Arnaboldi","Matthew L. Daggitt","Omri Isac","Tanvi Dinkar","Daniel Kienitz","Verena Rieser","Ekaterina Komendantskaya"],"pdf_url":"https://arxiv.org/pdf/2305.04003v3.pdf","comment":"To appear in proceedings of 6th Workshop on Formal Methods for\n ML-Enabled Autonomous Systems (Affiliated with CAV 2023)"},{"id":"http://arxiv.org/abs/2308.07791v1","updated":"2023-08-15T14:16:29Z","published":"2023-08-15T14:16:29Z","title":"Informed Named Entity Recognition Decoding for Generative Language\n Models","summary":" Ever-larger language models with ever-increasing capabilities are by now\nwell-established text processing tools. 
Alas, information extraction tasks such\nas named entity recognition are still largely unaffected by this progress as\nthey are primarily based on the previous generation of encoder-only transformer\nmodels. Here, we propose a simple yet effective approach, Informed Named Entity\nRecognition Decoding (iNERD), which treats named entity recognition as a\ngenerative process. It leverages the language understanding capabilities of\nrecent generative models in a future-proof manner and employs an informed\ndecoding scheme incorporating the restricted nature of information extraction\ninto open-ended text generation, improving performance and eliminating any risk\nof hallucinations. We coarse-tune our model on a merged named entity corpus to\nstrengthen its performance, evaluate five generative language models on eight\nnamed entity recognition datasets, and achieve remarkable results, especially\nin an environment with an unknown entity class set, demonstrating the\nadaptability of the approach.\n","authors":["Tobias Deußer","Lars Hillebrand","Christian Bauckhage","Rafet Sifa"],"pdf_url":"https://arxiv.org/pdf/2308.07791v1.pdf","comment":"12 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2307.00360v2","updated":"2023-08-15T13:59:42Z","published":"2023-07-01T15:10:01Z","title":"BatGPT: A Bidirectional Autoregessive Talker from Generative Pre-trained\n Transformer","summary":" BatGPT is a large-scale language model designed and trained jointly by Wuhan\nUniversity and Shanghai Jiao Tong University. It is capable of generating\nhighly natural and fluent text in response to various types of input, including\ntext prompts, images, and audio. In the modeling level, we employ a\nbidirectional autoregressive architecture that allows the model to efficiently\ncapture the complex dependencies of natural language, making it highly\neffective in tasks such as language generation, dialog systems, and question\nanswering. Moreover, the bidirectional autoregressive modeling not only\noperates from left to right but also from right to left, effectively reducing\nfixed memory effects and alleviating model hallucinations.\n In the training aspect, we propose a novel parameter expansion method for\nleveraging the pre-training of smaller models and employ reinforcement learning\nfrom both AI and human feedback, aimed at improving the model's alignment\nperformance. Overall, these approaches significantly improve the effectiveness\nof BatGPT, and the model can be utilized for a wide range of natural language\napplications.\n","authors":["Zuchao Li","Shitou Zhang","Hai Zhao","Yifei Yang","Dongjie Yang"],"pdf_url":"https://arxiv.org/pdf/2307.00360v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07777v1","updated":"2023-08-15T13:53:52Z","published":"2023-08-15T13:53:52Z","title":"Enhancing Visually-Rich Document Understanding via Layout Structure\n Modeling","summary":" In recent years, the use of multi-modal pre-trained Transformers has led to\nsignificant advancements in visually-rich document understanding. However,\nexisting models have mainly focused on features such as text and vision while\nneglecting the importance of layout relationship between text nodes. In this\npaper, we propose GraphLayoutLM, a novel document understanding model that\nleverages the modeling of layout structure graph to inject document layout\nknowledge into the model. GraphLayoutLM utilizes a graph reordering algorithm\nto adjust the text sequence based on the graph structure. 
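To make the "informed decoding" idea from the iNERD abstract above more tangible, here is a minimal, assumption-laden sketch (not the iNERD implementation): at each generation step the logits are restricted to tokens that can legally appear in the structured output, e.g. entity-type labels or tokens copied from the input. `constrain_logits` and the toy vocabulary are illustrative only.

```python
import torch

def constrain_logits(logits: torch.Tensor, allowed_token_ids: set) -> torch.Tensor:
    """Keep logits of allowed tokens; set all others to -inf so they cannot be sampled."""
    constrained = torch.full_like(logits, float("-inf"))
    idx = torch.tensor(sorted(allowed_token_ids))
    constrained[idx] = logits[idx]
    return constrained

vocab_size = 10
logits = torch.randn(vocab_size)
allowed = {2, 5, 7}  # e.g. ids for "PER", "ORG" and a token span copied from the input
next_id = int(torch.argmax(constrain_logits(logits, allowed)))
assert next_id in allowed
```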
Additionally, our\nmodel uses a layout-aware multi-head self-attention layer to learn document\nlayout knowledge. The proposed model enables the understanding of the spatial\narrangement of text elements, improving document comprehension. We evaluate our\nmodel on various benchmarks, including FUNSD, XFUND and CORD, and achieve\nstate-of-the-art results among these datasets. Our experimental results\ndemonstrate that our proposed method provides a significant improvement over\nexisting approaches and showcases the importance of incorporating layout\ninformation into document understanding models. We also conduct an ablation\nstudy to investigate the contribution of each component of our model. The\nresults show that both the graph reordering algorithm and the layout-aware\nmulti-head self-attention layer play a crucial role in achieving the best\nperformance.\n","authors":["Qiwei Li","Zuchao Li","Xiantao Cai","Bo Du","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.07777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16198v4","updated":"2023-08-15T13:31:15Z","published":"2022-11-28T16:48:41Z","title":"SuS-X: Training-Free Name-Only Transfer of Vision-Language Models","summary":" Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet\neffective way to train large-scale vision-language models. CLIP demonstrates\nimpressive zero-shot classification and retrieval on diverse downstream tasks.\nHowever, to leverage its full potential, fine-tuning still appears to be\nnecessary. Fine-tuning the entire CLIP model can be resource-intensive and\nunstable. Moreover, recent methods that aim to circumvent this need for\nfine-tuning still require access to images from the target distribution. In\nthis paper, we pursue a different approach and explore the regime of\ntraining-free \"name-only transfer\" in which the only knowledge we possess about\nthe downstream task comprises the names of downstream target categories. We\npropose a novel method, SuS-X, consisting of two key building blocks -- SuS and\nTIP-X, that requires neither intensive fine-tuning nor costly labelled data.\nSuS-X achieves state-of-the-art zero-shot classification results on 19\nbenchmark datasets. We further show the utility of TIP-X in the training-free\nfew-shot setting, where we again achieve state-of-the-art results over strong\ntraining-free baselines. Code is available at\nhttps://github.com/vishaal27/SuS-X.\n","authors":["Vishaal Udandarao","Ankush Gupta","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2211.16198v4.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2308.07758v1","updated":"2023-08-15T13:19:59Z","published":"2023-08-15T13:19:59Z","title":"Backward Reasoning in Large Language Models for Verification","summary":" Chain-of-Though (CoT) prompting has shown promising performance in various\nreasoning tasks. Recently, Self-Consistency \\citep{wang2023selfconsistency}\nproposes to sample a diverse set of reasoning chains which may lead to\ndifferent answers while the answer that receives the most votes is selected. In\nthis paper, we propose a novel method to use backward reasoning in verifying\ncandidate answers. 
We mask a token in the question by ${\\bf x}$ and ask the LLM\nto predict the masked token when a candidate answer is provided by \\textit{a\nsimple template}, i.e., ``\\textit{\\textbf{If we know the answer of the above\nquestion is \\{a candidate answer\\}, what is the value of unknown variable ${\\bf\nx}$?}}'' Intuitively, the LLM is expected to predict the masked token\nsuccessfully if the provided candidate answer is correct. We further propose\nFOBAR to combine forward and backward reasoning for estimating the probability\nof candidate answers. We conduct extensive experiments on six data sets and\nthree LLMs. Experimental results demonstrate that FOBAR achieves\nstate-of-the-art performance on various reasoning benchmarks.\n","authors":["Weisen Jiang","Han Shi","Longhui Yu","Zhengying Liu","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2308.07758v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.01413v2","updated":"2023-08-15T12:19:56Z","published":"2023-07-30T18:47:54Z","title":"LaFiCMIL: Rethinking Large File Classification from the Perspective of\n Correlated Multiple Instance Learning","summary":" Transformer-based models, such as BERT, have revolutionized various language\ntasks, but still struggle with large file classification due to their input\nlimit (e.g., 512 tokens). Despite several attempts to alleviate this\nlimitation, no method consistently excels across all benchmark datasets,\nprimarily because they can only extract partial essential information from the\ninput file. Additionally, they fail to adapt to the varied properties of\ndifferent types of large files. In this work, we tackle this problem from the\nperspective of correlated multiple instance learning. The proposed approach,\nLaFiCMIL, serves as a versatile framework applicable to various large file\nclassification tasks covering binary, multi-class, and multi-label\nclassification tasks, spanning various domains including Natural Language\nProcessing, Programming Language Processing, and Android Analysis. To evaluate\nits effectiveness, we employ eight benchmark datasets pertaining to Long\nDocument Classification, Code Defect Detection, and Android Malware Detection.\nLeveraging BERT-family models as feature extractors, our experimental results\ndemonstrate that LaFiCMIL achieves new state-of-the-art performance across all\nbenchmark datasets. This is largely attributable to its capability of scaling\nBERT up to nearly 20K tokens, running on a single Tesla V-100 GPU with 32G of\nmemory.\n","authors":["Tiezhu Sun","Weiguo Pian","Nadia Daoudi","Kevin Allix","Tegawendé F. Bissyandé","Jacques Klein"],"pdf_url":"https://arxiv.org/pdf/2308.01413v2.pdf","comment":"12 pages; update results; manuscript revision"},{"id":"http://arxiv.org/abs/2308.07107v2","updated":"2023-08-15T12:09:20Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. 
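The FOBAR abstract above is explicit about its backward-verification template, so a small sketch is easy to give; the way forward and backward evidence are mixed below (a weighted vote share) is an illustrative assumption, not the paper's exact estimator.

```python
def backward_prompt(question_with_mask: str, candidate_answer: str) -> str:
    """Build the backward-reasoning query: mask a token in the question as x,
    then ask the LLM to recover x given a candidate answer."""
    template = ("{q}\nIf we know the answer of the above question is "
                "{a}, what is the value of unknown variable x?")
    return template.format(q=question_with_mask, a=candidate_answer)

def combine_scores(forward_votes: dict, backward_success: dict, alpha: float = 0.5) -> str:
    """Pick the candidate with the best mix of forward vote share and
    backward-verification success rate (hypothetical combination rule)."""
    total = sum(forward_votes.values())
    scores = {a: alpha * forward_votes[a] / total
                 + (1 - alpha) * backward_success.get(a, 0.0)
              for a in forward_votes}
    return max(scores, key=scores.get)

print(backward_prompt("Tom has x apples and buys 3 more, ending with 8.", "5"))
print(combine_scores({"5": 7, "6": 3}, {"5": 0.9, "6": 0.2}))
```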
The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. Additionally, we explore promising directions within\nthis expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07711v1","updated":"2023-08-15T11:45:34Z","published":"2023-08-15T11:45:34Z","title":"SPM: Structured Pretraining and Matching Architectures for Relevance\n Modeling in Meituan Search","summary":" In e-commerce search, relevance between query and documents is an essential\nrequirement for satisfying user experience. Different from traditional\ne-commerce platforms that offer products, users search on life service\nplatforms such as Meituan mainly for product providers, which usually have\nabundant structured information, e.g. name, address, category, thousands of\nproducts. Modeling search relevance with these rich structured contents is\nchallenging due to the following issues: (1) there is language distribution\ndiscrepancy among different fields of structured document, making it difficult\nto directly adopt off-the-shelf pretrained language model based methods like\nBERT. (2) different fields usually have different importance and their length\nvary greatly, making it difficult to extract document information helpful for\nrelevance matching.\n To tackle these issues, in this paper we propose a novel two-stage\npretraining and matching architecture for relevance matching with rich\nstructured documents. At pretraining stage, we propose an effective pretraining\nmethod that employs both query and multiple fields of document as inputs,\nincluding an effective information compression method for lengthy fields. At\nrelevance matching stage, a novel matching method is proposed by leveraging\ndomain knowledge in search query to generate more effective document\nrepresentations for relevance scoring. Extensive offline experiments and online\nA/B tests on millions of users verify that the proposed architectures\neffectively improve the performance of relevance modeling. 
The model has\nalready been deployed online, serving the search traffic of Meituan for over a\nyear.\n","authors":["Wen Zan","Yaopeng Han","Xiaotian Jiang","Yao Xiao","Yang Yang","Dayao Chen","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07711v1.pdf","comment":"Accepted by CIKM '23"},{"id":"http://arxiv.org/abs/2308.07706v1","updated":"2023-08-15T11:28:21Z","published":"2023-08-15T11:28:21Z","title":"Exploring Transfer Learning in Medical Image Segmentation using\n Vision-Language Models","summary":" Medical Image Segmentation is crucial in various clinical applications within\nthe medical domain. While state-of-the-art segmentation models have proven\neffective, integrating textual guidance to enhance visual features for this\ntask remains an area with limited progress. Existing segmentation models that\nutilize textual guidance are primarily trained on open-domain images, raising\nconcerns about their direct applicability in the medical domain without manual\nintervention or fine-tuning.\n To address these challenges, we propose using multimodal vision-language\nmodels for capturing semantic information from image descriptions and images,\nenabling the segmentation of diverse medical images. This study comprehensively\nevaluates existing vision language models across multiple datasets to assess\ntheir transferability from the open domain to the medical field. Furthermore,\nwe introduce variations of image descriptions for previously unseen images in\nthe dataset, revealing notable variations in model performance based on the\ngenerated prompts.\n Our findings highlight the distribution shift between the open-domain images\nand the medical domain and show that the segmentation models trained on\nopen-domain images are not directly transferrable to the medical field. But\ntheir performance can be increased by finetuning them in the medical datasets.\nWe report the zero-shot and finetuned segmentation performance of 4 Vision\nLanguage Models (VLMs) on 11 medical datasets using 9 types of prompts derived\nfrom 14 attributes.\n","authors":["Kanchan Poudel","Manish Dhakal","Prasiddha Bhandari","Rabin Adhikari","Safal Thapaliya","Bishesh Khanal"],"pdf_url":"https://arxiv.org/pdf/2308.07706v1.pdf","comment":"25 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.07702v1","updated":"2023-08-15T11:08:30Z","published":"2023-08-15T11:08:30Z","title":"Better Zero-Shot Reasoning with Role-Play Prompting","summary":" Modern large language models (LLMs), such as ChatGPT, exhibit a remarkable\ncapacity for role-playing, enabling them to embody not only human characters\nbut also non-human entities like a Linux terminal. This versatility allows them\nto simulate complex human-like interactions and behaviors within various\ncontexts, as well as to emulate specific objects or systems. While these\ncapabilities have enhanced user engagement and introduced novel modes of\ninteraction, the influence of role-playing on LLMs' reasoning abilities remains\nunderexplored. In this study, we introduce a strategically designed role-play\nprompting methodology and assess its performance under the zero-shot setting\nacross twelve diverse reasoning benchmarks, encompassing arithmetic,\ncommonsense reasoning, symbolic reasoning, and more. Leveraging models such as\nChatGPT and Llama 2, our empirical results illustrate that role-play prompting\nconsistently surpasses the standard zero-shot approach across most datasets.\nNotably, accuracy on AQuA rises from 53.5% to 63.8%, and on Last Letter from\n23.8% to 84.2%. 
Beyond enhancing contextual understanding, we posit that\nrole-play prompting serves as an implicit Chain-of-Thought (CoT) trigger,\nthereby improving the quality of reasoning. By comparing our approach with the\nZero-Shot-CoT technique, which prompts the model to \"think step by step\", we\nfurther demonstrate that role-play prompting can generate a more effective CoT.\nThis highlights its potential to augment the reasoning capabilities of LLMs.\n","authors":["Aobo Kong","Shiwan Zhao","Hao Chen","Qicheng Li","Yong Qin","Ruiqi Sun","Xin Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.07702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07661v1","updated":"2023-08-15T09:24:38Z","published":"2023-08-15T09:24:38Z","title":"Attention Is Not All You Need Anymore","summary":" In recent years, the popular Transformer architecture has achieved great\nsuccess in many application areas, including natural language processing and\ncomputer vision. Many existing works aim to reduce the computational and memory\ncomplexity of the self-attention mechanism in the Transformer by trading off\nperformance. However, performance is key for the continuing success of the\nTransformer. In this paper, a drop-in replacement for the self-attention\nmechanism in the Transformer, called the Extractor, is proposed. Experimental\nresults show that replacing the self-attention mechanism with the Extractor\nimproves the performance of the Transformer. Furthermore, the proposed\nExtractor has the potential to run faster than the self-attention since it has\na much shorter critical path of computation. Additionally, the sequence\nprediction problem in the context of text generation is formulated using\nvariable-length discrete-time Markov chains, and the Transformer is reviewed\nbased on our understanding.\n","authors":["Zhe Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07654v1","updated":"2023-08-15T09:05:27Z","published":"2023-08-15T09:05:27Z","title":"SEER: Super-Optimization Explorer for HLS using E-graph Rewriting with\n MLIR","summary":" High-level synthesis (HLS) is a process that automatically translates a\nsoftware program in a high-level language into a low-level hardware\ndescription. However, the hardware designs produced by HLS tools still suffer\nfrom a significant performance gap compared to manual implementations. This is\nbecause the input HLS programs must still be written using hardware design\nprinciples.\n Existing techniques either leave the program source unchanged or perform a\nfixed sequence of source transformation passes, potentially missing\nopportunities to find the optimal design. We propose a super-optimization\napproach for HLS that automatically rewrites an arbitrary software program into\nefficient HLS code that can be used to generate an optimized hardware design.\nWe developed a toolflow named SEER, based on the e-graph data structure, to\nefficiently explore equivalent implementations of a program at scale. SEER\nprovides an extensible framework, orchestrating existing software compiler\npasses and hardware synthesis optimizers.\n Our work is the first attempt to exploit e-graph rewriting for large software\ncompiler frameworks, such as MLIR. Across a set of open-source benchmarks, we\nshow that SEER achieves up to 38x the performance within 1.4x the area of the\noriginal program. 
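For the role-play prompting study summarized just above, the core mechanism is simply to wrap the task question in a role-immersion exchange rather than issuing a bare zero-shot query. The sketch below illustrates that shape; the specific role text and message structure are made-up examples, not the prompts used in the paper.

```python
def role_play_prompt(question: str) -> list[dict]:
    """Wrap a task question in a short role-immersion dialogue (illustrative)."""
    return [
        {"role": "system", "content": "You are a meticulous math teacher who "
                                      "enjoys walking students through problems."},
        {"role": "assistant", "content": "Of course! Give me the problem and I "
                                         "will reason about it step by step."},
        {"role": "user", "content": question},
    ]

for msg in role_play_prompt("If a train travels 60 km in 1.5 hours, what is its average speed?"):
    print(f'{msg["role"]}: {msg["content"]}')
```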
Via an Intel-provided case study, SEER demonstrates the\npotential to outperform manually optimized designs produced by hardware\nexperts.\n","authors":["Jianyi Cheng","Samuel Coward","Lorenzo Chelini","Rafael Barbalho","Theo Drane"],"pdf_url":"https://arxiv.org/pdf/2308.07654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07645v1","updated":"2023-08-15T08:49:14Z","published":"2023-08-15T08:49:14Z","title":"Steering Language Generation: Harnessing Contrastive Expert Guidance and\n Negative Prompting for Coherent and Diverse Synthetic Data Generation","summary":" Large Language Models (LLMs) hold immense potential to generate synthetic\ndata of high quality and utility, which has numerous applications from\ndownstream model training to practical data utilisation. However, contemporary\nmodels, despite their impressive capacities, consistently struggle to produce\nboth coherent and diverse data. To address the coherency issue, we introduce\ncontrastive expert guidance, where the difference between the logit\ndistributions of fine-tuned and base language models is emphasised to ensure\ndomain adherence. In order to ensure diversity, we utilise existing real and\nsynthetic examples as negative prompts to the model. We deem this dual-pronged\napproach to logit reshaping as STEER: Semantic Text Enhancement via Embedding\nRepositioning. STEER operates at inference-time and systematically guides the\nLLMs to strike a balance between adherence to the data distribution (ensuring\nsemantic fidelity) and deviation from prior synthetic examples or existing real\ndatasets (ensuring diversity and authenticity). This delicate balancing act is\nachieved by dynamically moving towards or away from chosen representations in\nthe latent space. STEER demonstrates improved performance over previous\nsynthetic data generation techniques, exhibiting better balance between data\ndiversity and coherency across three distinct tasks: hypothesis generation,\ntoxic and non-toxic comment generation, and commonsense reasoning task\ngeneration. We demonstrate how STEER allows for fine-tuned control over the\ndiversity-coherency trade-off via its hyperparameters, highlighting its\nversatility.\n","authors":["Charles O'Neill","Yuan-Sen Ting","Ioana Ciuca","Roberta Raileanu","Jack Miller","Thang Bui"],"pdf_url":"https://arxiv.org/pdf/2308.07645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07635v1","updated":"2023-08-15T08:32:20Z","published":"2023-08-15T08:32:20Z","title":"LLM-Mini-CEX: Automatic Evaluation of Large Language Model for\n Diagnostic Conversation","summary":" There is an increasing interest in developing LLMs for medical diagnosis to\nimprove diagnosis efficiency. Despite their alluring technological potential,\nthere is no unified and comprehensive evaluation criterion, leading to the\ninability to evaluate the quality and potential risks of medical LLMs, further\nhindering the application of LLMs in medical treatment scenarios. Besides,\ncurrent evaluations heavily rely on labor-intensive interactions with LLMs to\nobtain diagnostic dialogues and human evaluation on the quality of diagnosis\ndialogue. To tackle the lack of unified and comprehensive evaluation criterion,\nwe first initially establish an evaluation criterion, termed LLM-specific\nMini-CEX to assess the diagnostic capabilities of LLMs effectively, based on\noriginal Mini-CEX. 
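The STEER abstract above describes contrastive expert guidance as emphasising the difference between the fine-tuned and base models' logit distributions, with real or synthetic examples acting as negative prompts. The following is a rough sketch of what such logit reshaping could look like; the functional form, `gamma`, and `beta` are assumptions for illustration, not the authors' formulation.

```python
import torch

def steer_logits(base_logits: torch.Tensor,
                 expert_logits: torch.Tensor,
                 negative_logits: torch.Tensor | None = None,
                 gamma: float = 1.5,
                 beta: float = 0.3) -> torch.Tensor:
    """Amplify the expert model's deviation from the base model, then push away
    from logits conditioned on negative prompts (hypothetical combination)."""
    guided = base_logits + gamma * (expert_logits - base_logits)
    if negative_logits is not None:
        guided = guided - beta * negative_logits
    return guided

vocab = 8
base, expert, negative = torch.randn(vocab), torch.randn(vocab), torch.randn(vocab)
probs = torch.softmax(steer_logits(base, expert, negative), dim=-1)
print(round(probs.sum().item(), 4))  # ~1.0
```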
To address the labor-intensive interaction problem, we\ndevelop a patient simulator to engage in automatic conversations with LLMs, and\nutilize ChatGPT for evaluating diagnosis dialogues automatically. Experimental\nresults show that the LLM-specific Mini-CEX is adequate and necessary to\nevaluate medical diagnosis dialogue. Besides, ChatGPT can replace manual\nevaluation on the metrics of humanistic qualities and provides reproducible and\nautomated comparisons between different LLMs.\n","authors":["Xiaoming Shi","Jie Xu","Jinru Ding","Jiali Pang","Sichen Liu","Shuqing Luo","Xingwei Peng","Lu Lu","Haihong Yang","Mingtao Hu","Tong Ruan","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07633v1","updated":"2023-08-15T08:31:05Z","published":"2023-08-15T08:31:05Z","title":"A Survey on Model Compression for Large Language Models","summary":" Large Language Models (LLMs) have revolutionized natural language processing\ntasks with remarkable success. However, their formidable size and computational\ndemands present significant challenges for practical deployment, especially in\nresource-constrained environments. As these challenges become increasingly\npertinent, the field of model compression has emerged as a pivotal research\narea to alleviate these limitations. This paper presents a comprehensive survey\nthat navigates the landscape of model compression techniques tailored\nspecifically for LLMs. Addressing the imperative need for efficient deployment,\nwe delve into various methodologies, encompassing quantization, pruning,\nknowledge distillation, and more. Within each of these techniques, we highlight\nrecent advancements and innovative approaches that contribute to the evolving\nlandscape of LLM research. Furthermore, we explore benchmarking strategies and\nevaluation metrics that are essential for assessing the effectiveness of\ncompressed LLMs. By providing insights into the latest developments and\npractical implications, this survey serves as an invaluable resource for both\nresearchers and practitioners. As LLMs continue to evolve, this survey aims to\nfacilitate enhanced efficiency and real-world applicability, establishing a\nfoundation for future advancements in the field.\n","authors":["Xunyu Zhu","Jian Li","Yong Liu","Can Ma","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15199v2","updated":"2023-08-15T08:30:45Z","published":"2023-07-27T21:14:46Z","title":"PromptStyler: Prompt-driven Style Generation for Source-free Domain\n Generalization","summary":" In a joint vision-language space, a text feature (e.g., from \"a photo of a\ndog\") could effectively represent its relevant image features (e.g., from dog\nphotos). Also, a recent study has demonstrated the cross-modal transferability\nphenomenon of this joint space. From these observations, we propose\nPromptStyler which simulates various distribution shifts in the joint space by\nsynthesizing diverse styles via prompts without using any images to deal with\nsource-free domain generalization. The proposed method learns to generate a\nvariety of style features (from \"a S* style of a\") via learnable style word\nvectors for pseudo-words S*. To ensure that learned styles do not distort\ncontent information, we force style-content features (from \"a S* style of a\n[class]\") to be located nearby their corresponding content features (from\n\"[class]\") in the joint vision-language space. 
After learning style word\nvectors, we train a linear classifier using synthesized style-content features.\nPromptStyler achieves the state of the art on PACS, VLCS, OfficeHome and\nDomainNet, even though it does not require any images for training.\n","authors":["Junhyeong Cho","Gilhyun Nam","Sungyeon Kim","Hunmin Yang","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2307.15199v2.pdf","comment":"Accepted to ICCV 2023, Project Page: https://promptstyler.github.io/"},{"id":"http://arxiv.org/abs/2308.03291v2","updated":"2023-08-15T08:20:30Z","published":"2023-08-07T04:20:38Z","title":"SynJax: Structured Probability Distributions for JAX","summary":" The development of deep learning software libraries enabled significant\nprogress in the field by allowing users to focus on modeling, while letting the\nlibrary to take care of the tedious and time-consuming task of optimizing\nexecution for modern hardware accelerators. However, this has benefited only\nparticular types of deep learning models, such as Transformers, whose\nprimitives map easily to the vectorized computation. The models that explicitly\naccount for structured objects, such as trees and segmentations, did not\nbenefit equally because they require custom algorithms that are difficult to\nimplement in a vectorized form.\n SynJax directly addresses this problem by providing an efficient vectorized\nimplementation of inference algorithms for structured distributions covering\nalignment, tagging, segmentation, constituency trees and spanning trees. With\nSynJax we can build large-scale differentiable models that explicitly model\nstructure in the data. The code is available at\nhttps://github.com/deepmind/synjax.\n","authors":["Miloš Stanojević","Laurent Sartran"],"pdf_url":"https://arxiv.org/pdf/2308.03291v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12449v2","updated":"2023-08-15T08:11:16Z","published":"2023-02-24T04:31:18Z","title":"SGL-PT: A Strong Graph Learner with Graph Prompt Tuning","summary":" Recently, much exertion has been paid to design graph self-supervised methods\nto obtain generalized pre-trained models, and adapt pre-trained models onto\ndownstream tasks through fine-tuning. However, there exists an inherent gap\nbetween pretext and downstream graph tasks, which insufficiently exerts the\nability of pre-trained models and even leads to negative transfer. Meanwhile,\nprompt tuning has seen emerging success in natural language processing by\naligning pre-training and fine-tuning with consistent training objectives. In\nthis paper, we identify the challenges for graph prompt tuning: The first is\nthe lack of a strong and universal pre-training task across sundry pre-training\nmethods in graph domain. The second challenge lies in the difficulty of\ndesigning a consistent training objective for both pre-training and downstream\ntasks. To overcome above obstacles, we propose a novel framework named SGL-PT\nwhich follows the learning strategy ``Pre-train, Prompt, and Predict''.\nSpecifically, we raise a strong and universal pre-training task coined as SGL\nthat acquires the complementary merits of generative and contrastive\nself-supervised graph learning. And aiming for graph classification task, we\nunify pre-training and fine-tuning by designing a novel verbalizer-free\nprompting function, which reformulates the downstream task in a similar format\nas pretext task. 
Empirical results show that our method surpasses other\nbaselines under unsupervised setting, and our prompt tuning method can greatly\nfacilitate models on biological datasets over fine-tuning methods.\n","authors":["Yun Zhu","Jianhao Guo","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2302.12449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07610v1","updated":"2023-08-15T07:40:21Z","published":"2023-08-15T07:40:21Z","title":"LogPrompt: Prompt Engineering Towards Zero-Shot and Interpretable Log\n Analysis","summary":" Automated log analysis is crucial in modern software-intensive systems for\nensuring reliability and resilience throughout software maintenance and\nengineering life cycles. Existing methods perform tasks such as log parsing and\nlog anomaly detection by providing a single prediction value without\ninterpretation. However, given the increasing volume of system events, the\nlimited interpretability of analysis results hinders analysts' trust and their\nability to take appropriate actions. Moreover, these methods require\nsubstantial in-domain training data, and their performance declines sharply (by\nup to 62.5%) in online scenarios involving unseen logs from new domains, a\ncommon occurrence due to rapid software updates. In this paper, we propose\nLogPrompt, a novel zero-shot and interpretable log analysis approach. LogPrompt\nemploys large language models (LLMs) to perform zero-shot log analysis tasks\nvia a suite of advanced prompt strategies tailored for log tasks, which\nenhances LLMs' performance by up to 107.5% compared with simple prompts.\nExperiments on nine publicly available evaluation datasets across two tasks\ndemonstrate that LogPrompt, despite using no training data, outperforms\nexisting approaches trained on thousands of logs by up to around 50%. We also\nconduct a human evaluation of LogPrompt's interpretability, with six\npractitioners possessing over 10 years of experience, who highly rated the\ngenerated content in terms of usefulness and readability (averagely 4.42/5).\nLogPrompt also exhibits remarkable compatibility with open-source and\nsmaller-scale LLMs, making it flexible for practical deployment.\n","authors":["Yilun Liu","Shimin Tao","Weibin Meng","Jingyu Wang","Wenbing Ma","Yanqing Zhao","Yuhang Chen","Hao Yang","Yanfei Jiang","Xun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07074v2","updated":"2023-08-15T07:37:32Z","published":"2023-08-14T11:16:28Z","title":"#InsTag: Instruction Tagging for Analyzing Supervised Fine-tuning of\n Large Language Models","summary":" Foundation language models obtain the instruction-following ability through\nsupervised fine-tuning (SFT). Diversity and complexity are considered critical\nfactors of a successful SFT dataset, while their definitions remain obscure and\nlack quantitative analyses. In this work, we propose InsTag, an open-set\nfine-grained tagger, to tag samples within SFT datasets based on semantics and\nintentions and define instruction diversity and complexity regarding tags. We\nobtain 6.6K tags to describe comprehensive user queries. Then we analyze\npopular open-sourced SFT datasets and find that the model ability grows with\nmore diverse and complex data. Based on this observation, we propose a data\nselector based on InsTag to select 6K diverse and complex samples from\nopen-source datasets and fine-tune models on InsTag-selected data. 
The\nresulting models, TagLM, outperform open-source models based on considerably\nlarger SFT data evaluated by MT-Bench, echoing the importance of query\ndiversity and complexity. We open-source InsTag in\nhttps://github.com/OFA-Sys/InsTag.\n","authors":["Keming Lu","Hongyi Yuan","Zheng Yuan","Runji Lin","Junyang Lin","Chuanqi Tan","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.07074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06546v2","updated":"2023-08-15T07:28:15Z","published":"2023-08-12T12:03:41Z","title":"MC-DRE: Multi-Aspect Cross Integration for Drug Event/Entity Extraction","summary":" Extracting meaningful drug-related information chunks, such as adverse drug\nevents (ADE), is crucial for preventing morbidity and saving many lives. Most\nADEs are reported via an unstructured conversation with the medical context, so\napplying a general entity recognition approach is not sufficient enough. In\nthis paper, we propose a new multi-aspect cross-integration framework for drug\nentity/event detection by capturing and aligning different\ncontext/language/knowledge properties from drug-related documents. We first\nconstruct multi-aspect encoders to describe semantic, syntactic, and medical\ndocument contextual information by conducting those slot tagging tasks, main\ndrug entity/event detection, part-of-speech tagging, and general medical named\nentity recognition. Then, each encoder conducts cross-integration with other\ncontextual information in three ways: the key-value cross, attention cross, and\nfeedforward cross, so the multi-encoders are integrated in depth. Our model\noutperforms all SOTA on two widely used tasks, flat entity detection and\ndiscontinuous event extraction.\n","authors":["Jie Yang","Soyeon Caren Han","Siqu Long","Josiah Poon","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.06546v2.pdf","comment":"Accepted at CIKM 2023"},{"id":"http://arxiv.org/abs/2308.07601v1","updated":"2023-08-15T07:10:41Z","published":"2023-08-15T07:10:41Z","title":"VBD-MT Chinese-Vietnamese Translation Systems for VLSP 2022","summary":" We present our systems participated in the VLSP 2022 machine translation\nshared task. In the shared task this year, we participated in both translation\ntasks, i.e., Chinese-Vietnamese and Vietnamese-Chinese translations. We build\nour systems based on the neural-based Transformer model with the powerful\nmultilingual denoising pre-trained model mBART. The systems are enhanced by a\nsampling method for backtranslation, which leverage large scale available\nmonolingual data. Additionally, several other methods are applied to improve\nthe translation quality including ensembling and postprocessing. 
We achieve\n38.9 BLEU on ChineseVietnamese and 38.0 BLEU on VietnameseChinese on the public\ntest sets, which outperform several strong baselines.\n","authors":["Hai Long Trieu","Song Kiet Bui","Tan Minh Tran","Van Khanh Tran","Hai An Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.07601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03025v2","updated":"2023-08-15T05:11:41Z","published":"2023-07-06T14:42:01Z","title":"Style Over Substance: Evaluation Biases for Large Language Models","summary":" As large language models (LLMs) continue to advance, accurately and\ncomprehensively evaluating their performance becomes increasingly challenging.\nHuman evaluations are conventionally considered the gold standard in natural\nlanguage generation, but recent advancements incorporate state-of-the-art LLMs\nas proxies for human judges in evaluation processes. However, the extent to\nwhich humans and LLMs are capable evaluators remains uncertain. This study\ninvestigates the behavior of crowd-sourced and expert annotators, as well as\nLLMs, when comparing outputs from different models. To achieve this, we curate\na dataset of intentionally flawed machine-generated answers. Our findings\nreveal a concerning bias in the evaluation process, as answers with factual\nerrors are rated more favorably than answers that are too short or contained\ngrammatical errors. To address this issue, we propose independently evaluating\nmachine-generated text across multiple dimensions, rather than merging all the\nevaluation aspects into a single score. We instantiate this idea with the Elo\nrating system, resulting in the Multi-Elo Rating System. Empirical results from\nour study reveal that this proposed approach significantly enhances the quality\nof LLM-based evaluations, particularly in terms of factual accuracy. However,\nthere is no significant improvement in crowd-sourced-based evaluations,\nindicating the need for further investigation and refinement.\n","authors":["Minghao Wu","Alham Fikri Aji"],"pdf_url":"https://arxiv.org/pdf/2307.03025v2.pdf","comment":"Work in progress, 17 pages, 4 tables, 12 figures"},{"id":"http://arxiv.org/abs/2306.00804v3","updated":"2023-08-15T04:36:14Z","published":"2023-06-01T15:33:30Z","title":"Adaptive Contextual Biasing for Transducer Based Streaming Speech\n Recognition","summary":" By incorporating additional contextual information, deep biasing methods have\nemerged as a promising solution for speech recognition of personalized words.\nHowever, for real-world voice assistants, always biasing on such personalized\nwords with high prediction scores can significantly degrade the performance of\nrecognizing common words. To address this issue, we propose an adaptive\ncontextual biasing method based on Context-Aware Transformer Transducer (CATT)\nthat utilizes the biased encoder and predictor embeddings to perform streaming\nprediction of contextual phrase occurrences. Such prediction is then used to\ndynamically switch the bias list on and off, enabling the model to adapt to\nboth personalized and common scenarios. Experiments on Librispeech and internal\nvoice assistant datasets show that our approach can achieve up to 6.7% and\n20.7% relative reduction in WER and CER compared to the baseline respectively,\nmitigating up to 96.7% and 84.9% of the relative WER and CER increase for\ncommon cases. 
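The Multi-Elo Rating System mentioned in the evaluation-bias abstract above amounts to running a standard Elo update separately per evaluation dimension. A minimal sketch follows; the K-factor, starting ratings, and dimension names are illustrative choices, not values from the paper.

```python
def elo_update(rating_a: float, rating_b: float, score_a: float, k: float = 32.0):
    """Standard Elo update. score_a is 1.0 if A wins, 0.5 for a tie, 0.0 if B wins."""
    expected_a = 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400.0))
    delta = k * (score_a - expected_a)
    return rating_a + delta, rating_b - delta

dimensions = ["factual accuracy", "fluency", "helpfulness"]
ratings = {m: {d: 1000.0 for d in dimensions} for m in ("model_A", "model_B")}

# One pairwise judgment: A beats B on accuracy, ties on fluency, loses on helpfulness.
outcomes = {"factual accuracy": 1.0, "fluency": 0.5, "helpfulness": 0.0}
for dim, score in outcomes.items():
    a, b = elo_update(ratings["model_A"][dim], ratings["model_B"][dim], score)
    ratings["model_A"][dim], ratings["model_B"][dim] = a, b
print(ratings)
```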
Furthermore, our approach has a minimal performance impact in\npersonalized scenarios while maintaining a streaming inference pipeline with\nnegligible RTF increase.\n","authors":["Tianyi Xu","Zhanheng Yang","Kaixun Huang","Pengcheng Guo","Ao Zhang","Biao Li","Changru Chen","Chao Li","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2306.00804v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06953v2","updated":"2023-08-15T04:34:09Z","published":"2023-08-14T06:09:51Z","title":"Thresh: A Unified, Customizable and Deployable Platform for Fine-Grained\n Text Evaluation","summary":" Fine-grained, span-level human evaluation has emerged as a reliable and\nrobust method for evaluating text generation tasks such as summarization,\nsimplification, machine translation and news generation, and the derived\nannotations have been useful for training automatic metrics and improving\nlanguage models. However, existing annotation tools implemented for these\nevaluation frameworks lack the adaptability to be extended to different domains\nor languages, or modify annotation settings according to user needs. And the\nabsence of a unified annotated data format inhibits the research in multi-task\nlearning. In this paper, we introduce Thresh, a unified, customizable and\ndeployable platform for fine-grained evaluation. By simply creating a YAML\nconfiguration file, users can build and test an annotation interface for any\nframework within minutes -- all in one web browser window. To facilitate\ncollaboration and sharing, Thresh provides a community hub that hosts a\ncollection of fine-grained frameworks and corresponding annotations made and\ncollected by the community, covering a wide range of NLP tasks. For deployment,\nThresh offers multiple options for any scale of annotation projects from small\nmanual inspections to large crowdsourcing ones. Additionally, we introduce a\nPython library to streamline the entire process from typology design and\ndeployment to annotation processing. Thresh is publicly accessible at\nhttps://thresh.tools.\n","authors":["David Heineman","Yao Dou","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2308.06953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07556v1","updated":"2023-08-15T03:49:59Z","published":"2023-08-15T03:49:59Z","title":"A User-Centered Evaluation of Spanish Text Simplification","summary":" We present an evaluation of text simplification (TS) in Spanish for a\nproduction system, by means of two corpora focused in both complex-sentence and\ncomplex-word identification. We compare the most prevalent Spanish-specific\nreadability scores with neural networks, and show that the latter are\nconsistently better at predicting user preferences regarding TS. As part of our\nanalysis, we find that multilingual models underperform against equivalent\nSpanish-only models on the same task, yet all models focus too often on\nspurious statistical features, such as sentence length. 
We release the corpora\nin our evaluation to the broader community with the hopes of pushing forward\nthe state-of-the-art in Spanish natural language processing.\n","authors":["Adrian de Wynter","Anthony Hevia","Si-Qing Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07556v1.pdf","comment":"Data at https://github.com/microsoft/BrevE-CLaro"},{"id":"http://arxiv.org/abs/2308.04306v2","updated":"2023-08-15T03:19:20Z","published":"2023-08-08T14:51:16Z","title":"Deep Learning-Based Knowledge Injection for Metaphor Detection: A\n Comprehensive Review","summary":" The history of metaphor research also marks the evolution of knowledge\ninfusion research. With the continued advancement of deep learning techniques\nin recent years, the natural language processing community has shown great\ninterest in applying knowledge to successful results in metaphor recognition\ntasks. Although there has been a gradual increase in the number of approaches\ninvolving knowledge injection in the field of metaphor recognition, there is a\nlack of a complete review article on knowledge injection based approaches.\nTherefore, the goal of this paper is to provide a comprehensive review of\nresearch advances in the application of deep learning for knowledge injection\nin metaphor recognition tasks. In this paper, we systematically summarize and\ngeneralize the mainstream knowledge and knowledge injection principles, as well\nas review the datasets, evaluation metrics, and benchmark models used in\nmetaphor recognition tasks. Finally, we explore the current issues facing\nknowledge injection methods and provide an outlook on future research\ndirections.\n","authors":["Cheng Yang","Wenye Zhao","Zhiyue Liu","Qingbao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.04306v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2308.07540v1","updated":"2023-08-15T02:57:00Z","published":"2023-08-15T02:57:00Z","title":"CALYPSO: LLMs as Dungeon Masters' Assistants","summary":" The role of a Dungeon Master, or DM, in the game Dungeons & Dragons is to\nperform multiple tasks simultaneously. The DM must digest information about the\ngame setting and monsters, synthesize scenes to present to other players, and\nrespond to the players' interactions with the scene. Doing all of these tasks\nwhile maintaining consistency within the narrative and story world is no small\nfeat of human cognition, making the task tiring and unapproachable to new\nplayers. Large language models (LLMs) like GPT-3 and ChatGPT have shown\nremarkable abilities to generate coherent natural language text. In this paper,\nwe conduct a formative evaluation with DMs to establish the use cases of LLMs\nin D&D and tabletop gaming generally. We introduce CALYPSO, a system of\nLLM-powered interfaces that support DMs with information and inspiration\nspecific to their own scenario. CALYPSO distills game context into bite-sized\nprose and helps brainstorm ideas without distracting the DM from the game. When\ngiven access to CALYPSO, DMs reported that it generated high-fidelity text\nsuitable for direct presentation to players, and low-fidelity ideas that the DM\ncould develop further while maintaining their creative agency. We see CALYPSO\nas exemplifying a paradigm of AI-augmented tools that provide synchronous\ncreative assistance within established game worlds, and tabletop gaming more\nbroadly.\n","authors":["Andrew Zhu","Lara J. Martin","Andrew Head","Chris Callison-Burch"],"pdf_url":"https://arxiv.org/pdf/2308.07540v1.pdf","comment":"11 pages, 4 figures. 
AIIDE 2023"},{"id":"http://arxiv.org/abs/2308.07308v2","updated":"2023-08-15T01:33:35Z","published":"2023-08-14T17:54:10Z","title":"LLM Self Defense: By Self Examination, LLMs Know They Are Being Tricked","summary":" Large language models (LLMs) have skyrocketed in popularity in recent years\ndue to their ability to generate high-quality text in response to human\nprompting. However, these models have been shown to have the potential to\ngenerate harmful content in response to user prompting (e.g., giving users\ninstructions on how to commit crimes). There has been a focus in the literature\non mitigating these risks, through methods like aligning models with human\nvalues through reinforcement learning. However, it has been shown that even\naligned language models are susceptible to adversarial attacks that bypass\ntheir restrictions on generating harmful text. We propose a simple approach to\ndefending against these attacks by having a large language model filter its own\nresponses. Our current results show that even if a model is not fine-tuned to\nbe aligned with human values, it is possible to stop it from presenting harmful\ncontent to users by validating the content using a language model.\n","authors":["Alec Helbling","Mansi Phute","Matthew Hull","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2308.07308v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07522v1","updated":"2023-08-15T01:25:34Z","published":"2023-08-15T01:25:34Z","title":"Finding Stakeholder-Material Information from 10-K Reports using\n Fine-Tuned BERT and LSTM Models","summary":" All public companies are required by federal securities law to disclose their\nbusiness and financial activities in their annual 10-K reports. Each report\ntypically spans hundreds of pages, making it difficult for human readers to\nidentify and extract the material information efficiently. To solve the\nproblem, I have fine-tuned BERT models and RNN models with LSTM layers to\nidentify stakeholder-material information, defined as statements that carry\ninformation about a company's influence on its stakeholders, including\ncustomers, employees, investors, and the community and natural environment. The\nexisting practice uses keyword search to identify such information, which is my\nbaseline model. Using business expert-labeled training data of nearly 6,000\nsentences from 62 10-K reports published in 2022, the best model has achieved\nan accuracy of 0.904 and an F1 score of 0.899 in test data, significantly above\nthe baseline model's 0.781 and 0.749 respectively. Furthermore, the same work\nwas replicated on more granular taxonomies, based on which four distinct groups\nof stakeholders (i.e., customers, investors, employees, and the community and\nnatural environment) are tested separately. Similarly, fined-tuned BERT models\noutperformed LSTM and the baseline. The implications for industry application\nand ideas for future extensions are discussed.\n","authors":["Victor Zitian Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07505v1","updated":"2023-08-15T00:08:43Z","published":"2023-08-15T00:08:43Z","title":"Data Race Detection Using Large Language Models","summary":" Large language models (LLMs) are demonstrating significant promise as an\nalternate strategy to facilitate analyses and optimizations of high-performance\ncomputing programs, circumventing the need for resource-intensive manual tool\ncreation. 
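The LLM Self Defense abstract above proposes having a language model screen its own candidate response before it reaches the user. A minimal sketch of that filtering loop is shown below; `ask_llm` is a placeholder for whatever chat-completion call is available (not a real API), and the toy stand-in model exists only so the snippet runs.

```python
def is_harmful(response_text: str, ask_llm) -> bool:
    """Ask the model itself whether a candidate response contains harmful content."""
    verdict = ask_llm(
        "Does the following text contain harmful content? Answer 'yes' or 'no'.\n\n"
        + response_text
    )
    return verdict.strip().lower().startswith("yes")

def guarded_reply(prompt: str, ask_llm) -> str:
    """Generate a reply, then withhold it if the self-check flags it as harmful."""
    candidate = ask_llm(prompt)
    return "[response withheld]" if is_harmful(candidate, ask_llm) else candidate

# Toy stand-in model for demonstration purposes only.
fake_llm = lambda text: "no" if text.startswith("Does") else "Here is a safe answer."
print(guarded_reply("How do I bake bread?", fake_llm))
```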
In this paper, we explore a novel LLM-based data race detection\napproach combining prompt engineering and fine-tuning techniques. We create\na dedicated dataset named DRB-ML, which is derived from DataRaceBench, with\nfine-grained labels showing the presence of data race pairs and their associated\nvariables, line numbers, and read/write information. DRB-ML is then used to\nevaluate representative LLMs and fine-tune open-source ones. Our experiment\nshows that LLMs can be a viable approach to data race detection. However, they\nstill cannot compete with traditional data race detection tools when we need\ndetailed information about variable pairs causing data races.\n","authors":["Le Chen","Xianzhong Ding","Murali Emani","Tristan Vanderbruggen","Pei-hung Lin","Chuanhua Liao"],"pdf_url":"https://arxiv.org/pdf/2308.07505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05342v2","updated":"2023-08-15T22:58:04Z","published":"2023-08-10T05:10:17Z","title":"Metacognitive Prompting Improves Understanding in Large Language Models","summary":" In Large Language Models (LLMs), there have been consistent advancements in\ntask-specific performance, largely influenced by effective prompt design. While\nrecent research on prompting has enhanced the reasoning capabilities of LLMs, a\ngap remains in further improving their understanding abilities. In this study,\nwe introduce Metacognitive Prompting (MP), a strategy inspired by human\nintrospective reasoning processes. Using MP, LLMs undergo a systematic series\nof structured, self-aware evaluations, drawing on both their vast inherent\nknowledge and new insights. Our experiments involve five prevalent LLMs:\nLlama2, Vicuna, PaLM, GPT-3.5, and GPT-4, all of which span various general\nnatural language understanding (NLU) tasks from the GLUE and SuperGLUE\nbenchmarks. Results indicate that, although GPT-4 consistently excels in most\ntasks, PaLM, when equipped with MP, approaches its performance level.\nFurthermore, across models and datasets, MP consistently outperforms existing\nprompting methods, including standard and chain-of-thought prompting. This\nstudy underscores the potential to amplify the understanding abilities of LLMs\nand highlights the benefits of mirroring human introspective reasoning in NLU\ntasks.\n","authors":["Yuqing Wang","Yun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.05342v2.pdf","comment":"9 pages, in submission"},{"id":"http://arxiv.org/abs/2308.08061v1","updated":"2023-08-15T22:26:58Z","published":"2023-08-15T22:26:58Z","title":"The Costly Dilemma: Generalization, Evaluation and Cost-Optimal\n Deployment of Large Language Models","summary":" When deploying machine learning models in production for any\nproduct/application, there are three properties that are commonly desired.\nFirst, the models should be generalizable, in that we can extend them to further\nuse cases as our knowledge of the domain area develops. Second, they should be\nevaluable, so that there are clear metrics for performance and the calculation\nof those metrics in production settings is feasible. Finally, the deployment\nshould be cost-optimal as far as possible. In this paper we propose that these\nthree objectives (i.e. generalization, evaluation and cost-optimality) can\noften be relatively orthogonal and that for large language models, despite\ntheir performance over conventional NLP models, enterprises need to carefully\nassess all the three factors before making substantial investments in this\ntechnology. 
We propose a framework for generalization, evaluation and\ncost-modeling specifically tailored to large language models, offering insights\ninto the intricacies of development, deployment and management for these large\nlanguage models.\n","authors":["Abi Aryan","Aakash Kumar Nain","Andrew McMahon","Lucas Augusto Meyer","Harpreet Singh Sahota"],"pdf_url":"https://arxiv.org/pdf/2308.08061v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2303.16166v4","updated":"2023-08-15T21:53:56Z","published":"2023-03-28T17:28:52Z","title":"When Good and Reproducible Results are a Giant with Feet of Clay: The\n Importance of Software Quality in NLP","summary":" Despite its crucial role in research experiments, code correctness is often\npresumed only on the basis of the perceived quality of results. This assumption\ncomes with the risk of erroneous outcomes and potentially misleading findings.\nTo address this issue, we posit that the current focus on reproducibility\nshould go hand in hand with the emphasis on software quality. We present a case\nstudy in which we identify and fix three bugs in widely used implementations of\nthe state-of-the-art Conformer architecture. Through experiments on speech\nrecognition and translation in various languages, we demonstrate that the\npresence of bugs does not prevent the achievement of good and reproducible\nresults, which however can lead to incorrect conclusions that potentially\nmisguide future research. As a countermeasure, we propose a Code-quality\nChecklist and release pangoliNN, a library dedicated to testing neural models,\nwith the goal of promoting coding best practices and improving research\nsoftware quality within the NLP community.\n","authors":["Sara Papi","Marco Gaido","Andrea Pilzer","Matteo Negri"],"pdf_url":"https://arxiv.org/pdf/2303.16166v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02982v2","updated":"2023-08-15T21:19:13Z","published":"2022-11-05T22:06:50Z","title":"Event and Entity Extraction from Generated Video Captions","summary":" Annotation of multimedia data by humans is time-consuming and costly, while\nreliable automatic generation of semantic metadata is a major challenge. We\npropose a framework to extract semantic metadata from automatically generated\nvideo captions. As metadata, we consider entities, the entities' properties,\nrelations between entities, and the video category. We employ two\nstate-of-the-art dense video captioning models with masked transformer (MT) and\nparallel decoding (PVDC) to generate captions for videos of the ActivityNet\nCaptions dataset. Our experiments show that it is possible to extract entities,\ntheir properties, relations between entities, and the video category from the\ngenerated captions. We observe that the quality of the extracted information is\nmainly influenced by the quality of the event localization in the video as well\nas the performance of the event caption generation.\n","authors":["Johannes Scherer","Ansgar Scherp","Deepayan Bhowmik"],"pdf_url":"https://arxiv.org/pdf/2211.02982v2.pdf","comment":"Paper accepted at CD-MAKE 2023"},{"id":"http://arxiv.org/abs/2308.08043v1","updated":"2023-08-15T21:14:09Z","published":"2023-08-15T21:14:09Z","title":"DiagGPT: An LLM-based Chatbot with Automatic Topic Management for\n Task-Oriented Dialogue","summary":" Large Language Models (LLMs), such as ChatGPT, are becoming increasingly\nsophisticated, demonstrating capabilities that closely resemble those of\nhumans. 
These AI models are playing an essential role in assisting humans with\na wide array of tasks in daily life. A significant application of AI is its use\nas a chat agent, responding to human inquiries across various domains. Current\nLLMs have shown proficiency in answering general questions. However, basic\nquestion-answering dialogue often falls short in complex diagnostic scenarios,\nsuch as legal or medical consultations. These scenarios typically necessitate\nTask-Oriented Dialogue (TOD), wherein an AI chat agent needs to proactively\npose questions and guide users towards specific task completion. Previous\nfine-tuning models have underperformed in TOD, and current LLMs do not\ninherently possess this capability. In this paper, we introduce DiagGPT\n(Dialogue in Diagnosis GPT), an innovative method that extends LLMs to TOD\nscenarios. Our experiments reveal that DiagGPT exhibits outstanding performance\nin conducting TOD with users, demonstrating its potential for practical\napplications.\n","authors":["Lang Cao"],"pdf_url":"https://arxiv.org/pdf/2308.08043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08032v1","updated":"2023-08-15T20:47:51Z","published":"2023-08-15T20:47:51Z","title":"Using Artificial Populations to Study Psychological Phenomena in Neural\n Models","summary":" The recent proliferation of research into transformer based natural language\nprocessing has led to a number of studies which attempt to detect the presence\nof human-like cognitive behavior in the models. We contend that, as is true of\nhuman psychology, the investigation of cognitive behavior in language models\nmust be conducted in an appropriate population of an appropriate size for the\nresults to be meaningful. We leverage work in uncertainty estimation in a novel\napproach to efficiently construct experimental populations. The resultant tool,\nPopulationLM, has been made open source. We provide theoretical grounding in\nthe uncertainty estimation literature and motivation from current cognitive\nwork regarding language models. We discuss the methodological lessons from\nother scientific communities and attempt to demonstrate their application to\ntwo artificial population studies. Through population based experimentation we\nfind that language models exhibit behavior consistent with typicality effects\namong categories highly represented in training. However, we find that language\nmodels don't tend to exhibit structural priming effects. Generally, our results\nshow that single models tend to overestimate the presence of cognitive\nbehaviors in neural models.\n","authors":["Jesse Roberts","Kyle Moore","Drew Wilenzick","Doug Fisher"],"pdf_url":"https://arxiv.org/pdf/2308.08032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08027v1","updated":"2023-08-15T20:33:25Z","published":"2023-08-15T20:33:25Z","title":"End-to-End Open Vocabulary Keyword Search With Multilingual Neural\n Representations","summary":" Conventional keyword search systems operate on automatic speech recognition\n(ASR) outputs, which causes them to have a complex indexing and search\npipeline. This has led to interest in ASR-free approaches to simplify the\nsearch procedure. We recently proposed a neural ASR-free keyword search model\nwhich achieves competitive performance while maintaining an efficient and\nsimplified pipeline, where queries and documents are encoded with a pair of\nrecurrent neural network encoders and the encodings are combined with a\ndot-product. 
In this article, we extend this work with multilingual pretraining\nand detailed analysis of the model. Our experiments show that the proposed\nmultilingual training significantly improves the model performance and that\ndespite not matching a strong ASR-based conventional keyword search system for\nshort queries and queries comprising in-vocabulary words, the proposed model\noutperforms the ASR-based system for long queries and queries that do not\nappear in the training data.\n","authors":["Bolaji Yusuf","Jan Cernocky","Murat Saraclar"],"pdf_url":"https://arxiv.org/pdf/2308.08027v1.pdf","comment":"Accepted by IEEE/ACM Transactions on Audio, Speech and Language\n Processing (TASLP), 2023"},{"id":"http://arxiv.org/abs/2306.07384v2","updated":"2023-08-15T18:40:20Z","published":"2023-06-12T19:20:18Z","title":"Probing Quantifier Comprehension in Large Language Models: Another\n Example of Inverse Scaling","summary":" With their increasing size, large language models (LLMs) are becoming\nincreasingly good at language understanding tasks. But even with high\nperformance on specific downstream tasks, LLMs fail at simple linguistic tests\nfor negation or quantifier understanding. Previous work on quantifier\nunderstanding in LLMs shows inverse scaling in understanding few-type\nquantifiers. In this paper, we question the claims of previous work and show\nthat it is a result of inappropriate testing methodology. We also present\nalternate methods to measure quantifier comprehension in LLMs and show that\nLLMs are able to better understand the difference between the meaning of\nfew-type and most-type quantifiers as their size increases, although they are\nnot particularly good at it. We also observe inverse scaling for most-type\nquantifier understanding, which is contrary to human psycho-linguistic\nexperiments and previous work, where the model's understanding of most-type\nquantifier gets worse as the model size increases. We do this evaluation on\nmodels ranging from 125M-175B parameters, which suggests that LLMs do not do as\nwell as expected with quantifiers. We also discuss the possible reasons for\nthis and the relevance of quantifier understanding in evaluating language\nunderstanding in LLMs.\n","authors":["Akshat Gupta"],"pdf_url":"https://arxiv.org/pdf/2306.07384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07984v1","updated":"2023-08-15T18:34:26Z","published":"2023-08-15T18:34:26Z","title":"Anaphoric Structure Emerges Between Neural Networks","summary":" Pragmatics is core to natural language, enabling speakers to communicate\nefficiently with structures like ellipsis and anaphora that can shorten\nutterances without loss of meaning. These structures require a listener to\ninterpret an ambiguous form - like a pronoun - and infer the speaker's intended\nmeaning - who that pronoun refers to. Despite potential to introduce ambiguity,\nanaphora is ubiquitous across human language. In an effort to better understand\nthe origins of anaphoric structure in natural language, we look to see if\nanalogous structures can emerge between artificial neural networks trained to\nsolve a communicative task. We show that: first, despite the potential for\nincreased ambiguity, languages with anaphoric structures are learnable by\nneural models. Second, anaphoric structures emerge between models 'naturally'\nwithout need for additional constraints. Finally, introducing an explicit\nefficiency pressure on the speaker increases the prevalence of these\nstructures. 
We conclude that certain pragmatic structures straightforwardly\nemerge between neural networks, without explicit efficiency pressures, but that\nthe competing needs of speakers and listeners condition the degree and nature\nof their emergence.\n","authors":["Nicholas Edwards","Hannah Rohde","Henry Conklin"],"pdf_url":"https://arxiv.org/pdf/2308.07984v1.pdf","comment":"Published as a conference paper at the Annual Meeting of the\n Cognitive Science Society 2023: 6 Pages, 3 Figures, code available at\n https://github.com/hcoxec/emerge"},{"id":"http://arxiv.org/abs/2308.07973v1","updated":"2023-08-15T18:21:26Z","published":"2023-08-15T18:21:26Z","title":"\"Beware of deception\": Detecting Half-Truth and Debunking it through\n Controlled Claim Editing","summary":" The prevalence of half-truths, which are statements containing some truth but\nthat are ultimately deceptive, has risen with the increasing use of the\ninternet. To help combat this problem, we have created a comprehensive pipeline\nconsisting of a half-truth detection model and a claim editing model. Our\napproach utilizes the T5 model for controlled claim editing; \"controlled\" here\nmeans precise adjustments to select parts of a claim. Our methodology achieves\nan average BLEU score of 0.88 (on a scale of 0-1) and a disinfo-debunk score of\n85% on edited claims. Significantly, our T5-based approach outperforms other\nLanguage Models such as GPT2, RoBERTa, PEGASUS, and Tailor, with average\nimprovements of 82%, 57%, 42%, and 23% in disinfo-debunk scores, respectively.\nBy extending the LIAR PLUS dataset, we achieve an F1 score of 82% for the\nhalf-truth detection model, setting a new benchmark in the field. While\nprevious attempts have been made at half-truth detection, our approach is, to\nthe best of our knowledge, the first to attempt to debunk half-truths.\n","authors":["Sandeep Singamsetty","Nishtha Madaan","Sameep Mehta","Varad Bhatnagar","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2308.07973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07971v1","updated":"2023-08-15T18:18:34Z","published":"2023-08-15T18:18:34Z","title":"MultiSChuBERT: Effective Multimodal Fusion for Scholarly Document\n Quality Prediction","summary":" Automatic assessment of the quality of scholarly documents is a difficult\ntask with high potential impact. Multimodality, in particular the addition of\nvisual information next to text, has been shown to improve the performance on\nscholarly document quality prediction (SDQP) tasks. We propose the multimodal\npredictive model MultiSChuBERT. It combines a textual model based on chunking\nfull paper text and aggregating computed BERT chunk-encodings (SChuBERT), with\na visual model based on Inception V3. Our work contributes to the current\nstate-of-the-art in SDQP in three ways. First, we show that the method of\ncombining visual and textual embeddings can substantially influence the\nresults. Second, we demonstrate that gradual-unfreezing of the weights of the\nvisual sub-model reduces its tendency to overfit the data, improving results.\nThird, we show the retained benefit of multimodality when replacing standard\nBERT$_{\\textrm{BASE}}$ embeddings with more recent state-of-the-art text\nembedding models.\n Using BERT$_{\\textrm{BASE}}$ embeddings, on the (log) number of citations\nprediction task with the ACL-BiblioMetry dataset, our MultiSChuBERT\n(text+visual) model obtains an $R^{2}$ score of 0.454 compared to 0.432 for the\nSChuBERT (text only) model. 
Similar improvements are obtained on the PeerRead\naccept/reject prediction task. In our experiments using SciBERT, scincl,\nSPECTER and SPECTER2.0 embeddings, we show that each of these tailored\nembeddings adds further improvements over the standard BERT$_{\\textrm{BASE}}$\nembeddings, with the SPECTER2.0 embeddings performing best.\n","authors":["Gideon Maillette de Buy Wenniger","Thomas van Dongen","Lambert Schomaker"],"pdf_url":"https://arxiv.org/pdf/2308.07971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07968v1","updated":"2023-08-15T18:06:23Z","published":"2023-08-15T18:06:23Z","title":"Teach LLMs to Personalize -- An Approach inspired by Writing Education","summary":" Personalized text generation is an emerging research area that has attracted\nmuch attention in recent years. Most studies in this direction focus on a\nparticular domain by designing bespoke features or models. In this work, we\npropose a general approach for personalized text generation using large\nlanguage models (LLMs). Inspired by the practice of writing education, we\ndevelop a multistage and multitask framework to teach LLMs for personalized\ngeneration. In writing instruction, the task of writing from sources is often\ndecomposed into multiple steps that involve finding, evaluating, summarizing,\nsynthesizing, and integrating information. Analogously, our approach to\npersonalized text generation consists of multiple stages: retrieval, ranking,\nsummarization, synthesis, and generation. In addition, we introduce a multitask\nsetting that helps the model improve its generation ability further, which is\ninspired by the observation in education that a student's reading proficiency\nand writing ability are often correlated. We evaluate our approach on three\npublic datasets, each of which covers a different and representative domain.\nOur results show significant improvements over a variety of baselines.\n","authors":["Cheng Li","Mingyang Zhang","Qiaozhu Mei","Yaqing Wang","Spurthi Amba Hombaiah","Yi Liang","Michael Bendersky"],"pdf_url":"https://arxiv.org/pdf/2308.07968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08449v1","updated":"2023-08-15T03:31:47Z","published":"2023-08-15T03:31:47Z","title":"Improving CTC-AED model with integrated-CTC and auxiliary loss\n regularization","summary":" Connectionist temporal classification (CTC) and attention-based encoder\ndecoder (AED) joint training has been widely applied in automatic speech\nrecognition (ASR). Unlike most hybrid models that separately calculate the CTC\nand AED losses, our proposed integrated-CTC utilizes the attention mechanism of\nAED to guide the output of CTC. In this paper, we employ two fusion methods,\nnamely direct addition of logits (DAL) and preserving the maximum probability\n(PMP). We achieve dimensional consistency by adaptively affine transforming the\nattention results to match the dimensions of CTC. To accelerate model\nconvergence and improve accuracy, we introduce auxiliary loss regularization\nfor accelerated convergence. 
Experimental results demonstrate that the DAL\nmethod performs better in attention rescoring, while the PMP method excels in\nCTC prefix beam search and greedy search.\n","authors":["Daobin Zhu","Xiangdong Su","Hongbin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08449v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.07926v1","updated":"2023-08-15T17:59:56Z","published":"2023-08-15T17:59:56Z","title":"CoDeF: Content Deformation Fields for Temporally Consistent Video\n Processing","summary":" We present the content deformation field CoDeF as a new type of video\nrepresentation, which consists of a canonical content field aggregating the\nstatic contents in the entire video and a temporal deformation field recording\nthe transformations from the canonical image (i.e., rendered from the canonical\ncontent field) to each individual frame along the time axis.Given a target\nvideo, these two fields are jointly optimized to reconstruct it through a\ncarefully tailored rendering pipeline.We advisedly introduce some\nregularizations into the optimization process, urging the canonical content\nfield to inherit semantics (e.g., the object shape) from the video.With such a\ndesign, CoDeF naturally supports lifting image algorithms for video processing,\nin the sense that one can apply an image algorithm to the canonical image and\neffortlessly propagate the outcomes to the entire video with the aid of the\ntemporal deformation field.We experimentally show that CoDeF is able to lift\nimage-to-image translation to video-to-video translation and lift keypoint\ndetection to keypoint tracking without any training.More importantly, thanks to\nour lifting strategy that deploys the algorithms on only one image, we achieve\nsuperior cross-frame consistency in processed videos compared to existing\nvideo-to-video translation approaches, and even manage to track non-rigid\nobjects like water and smog.Project page can be found at\nhttps://qiuyu96.github.io/CoDeF/.\n","authors":["Hao Ouyang","Qiuyu Wang","Yuxi Xiao","Qingyan Bai","Juntao Zhang","Kecheng Zheng","Xiaowei Zhou","Qifeng Chen","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2308.07926v1.pdf","comment":"Project Webpage: https://qiuyu96.github.io/CoDeF/, Code:\n https://github.com/qiuyu96/CoDeF"},{"id":"http://arxiv.org/abs/2308.07921v1","updated":"2023-08-15T17:58:45Z","published":"2023-08-15T17:58:45Z","title":"Solving Challenging Math Word Problems Using GPT-4 Code Interpreter with\n Code-based Self-Verification","summary":" Recent progress in large language models (LLMs) like GPT-4 and PaLM-2 has\nbrought significant advancements in addressing math reasoning problems. In\nparticular, OpenAI's latest version of GPT-4, known as GPT-4 Code Interpreter,\nshows remarkable performance on challenging math datasets. In this paper, we\nexplore the effect of code on enhancing LLMs' reasoning capability by\nintroducing different constraints on the \\textit{Code Usage Frequency} of GPT-4\nCode Interpreter. We found that its success can be largely attributed to its\npowerful skills in generating and executing code, evaluating the output of code\nexecution, and rectifying its solution when receiving unreasonable outputs.\nBased on this insight, we propose a novel and effective prompting method,\nexplicit \\uline{c}ode-based \\uline{s}elf-\\uline{v}erification~(CSV), to further\nboost the mathematical reasoning potential of GPT-4 Code Interpreter. 
This\nmethod employs a zero-shot prompt on GPT-4 Code Interpreter to encourage it to\nuse code to self-verify its answers. In instances where the verification state\nregisters as ``False'', the model shall automatically amend its solution,\nanalogous to our approach of rectifying errors during a mathematics\nexamination. Furthermore, we recognize that the states of the verification\nresult indicate the confidence of a solution, which can improve the\neffectiveness of majority voting. With GPT-4 Code Interpreter and CSV, we\nachieve an impressive zero-shot accuracy on MATH dataset \\textbf{(53.9\\% $\\to$\n84.3\\%)}.\n","authors":["Aojun Zhou","Ke Wang","Zimu Lu","Weikang Shi","Sichun Luo","Zipeng Qin","Shaoqing Lu","Anya Jia","Linqi Song","Mingjie Zhan","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.07921v1.pdf","comment":"Solving Challenging Math Word Problems Using GPT-4 Code Interpreter\n with Code-based Self-Verification"},{"id":"http://arxiv.org/abs/2308.07918v1","updated":"2023-08-15T17:58:11Z","published":"2023-08-15T17:58:11Z","title":"Helping Hands: An Object-Aware Ego-Centric Video Recognition Model","summary":" We introduce an object-aware decoder for improving the performance of\nspatio-temporal representations on ego-centric videos. The key idea is to\nenhance object-awareness during training by tasking the model to predict hand\npositions, object positions, and the semantic label of the objects using paired\ncaptions when available. At inference time the model only requires RGB frames\nas inputs, and is able to track and ground objects (although it has not been\ntrained explicitly for this). We demonstrate the performance of the\nobject-aware representations learnt by our model, by: (i) evaluating it for\nstrong transfer, i.e. through zero-shot testing, on a number of downstream\nvideo-text retrieval and classification benchmarks; and (ii) by using the\nrepresentations learned as input for long-term video understanding tasks (e.g.\nEpisodic Memory in Ego4D). In all cases the performance improves over the state\nof the art -- even compared to networks trained with far larger batch sizes. We\nalso show that by using noisy image-level detection as pseudo-labels in\ntraining, the model learns to provide better bounding boxes using video\nconsistency, as well as grounding the words in the associated text\ndescriptions. Overall, we show that the model can act as a drop-in replacement\nfor an ego-centric video model to improve performance through visual-text\ngrounding.\n","authors":["Chuhan Zhang","Ankush Gupta","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2308.07918v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.07903v1","updated":"2023-08-15T17:42:39Z","published":"2023-08-15T17:42:39Z","title":"Relightable and Animatable Neural Avatar from Sparse-View Video","summary":" This paper tackles the challenge of creating relightable and animatable\nneural avatars from sparse-view (or even monocular) videos of dynamic humans\nunder unknown illumination. Compared to studio environments, this setting is\nmore practical and accessible but poses an extremely challenging ill-posed\nproblem. Previous neural human reconstruction methods are able to reconstruct\nanimatable avatars from sparse views using deformed Signed Distance Fields\n(SDF) but cannot recover material parameters for relighting. 
While\ndifferentiable inverse rendering-based methods have succeeded in material\nrecovery of static objects, it is not straightforward to extend them to dynamic\nhumans as it is computationally intensive to compute pixel-surface intersection\nand light visibility on deformed SDFs for inverse rendering. To solve this\nchallenge, we propose a Hierarchical Distance Query (HDQ) algorithm to\napproximate the world space distances under arbitrary human poses.\nSpecifically, we estimate coarse distances based on a parametric human model\nand compute fine distances by exploiting the local deformation invariance of\nSDF. Based on the HDQ algorithm, we leverage sphere tracing to efficiently\nestimate the surface intersection and light visibility. This allows us to\ndevelop the first system to recover animatable and relightable neural avatars\nfrom sparse view (or monocular) inputs. Experiments demonstrate that our\napproach is able to produce superior results compared to state-of-the-art\nmethods. Our code will be released for reproducibility.\n","authors":["Zhen Xu","Sida Peng","Chen Geng","Linzhan Mou","Zihan Yan","Jiaming Sun","Hujun Bao","Xiaowei Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.07903v1.pdf","comment":"Project page: https://zju3dv.github.io/relightable_avatar"},{"id":"http://arxiv.org/abs/2308.07898v1","updated":"2023-08-15T17:39:52Z","published":"2023-08-15T17:39:52Z","title":"A Foundation LAnguage-Image model of the Retina (FLAIR): Encoding expert\n knowledge in text supervision","summary":" Foundation vision-language models are currently transforming computer vision,\nand are on the rise in medical imaging fueled by their very promising\ngeneralization capabilities. However, the initial attempts to transfer this new\nparadigm to medical imaging have shown less impressive performances than those\nobserved in other domains, due to the significant domain shift and the complex,\nexpert domain knowledge inherent to medical-imaging tasks. Motivated by the\nneed for domain-expert foundation models, we present FLAIR, a pre-trained\nvision-language model for universal retinal fundus image understanding. To this\nend, we compiled 37 open-access, mostly categorical fundus imaging datasets\nfrom various sources, with up to 97 different target conditions and 284,660\nimages. We integrate the expert's domain knowledge in the form of descriptive\ntextual prompts, during both pre-training and zero-shot inference, enhancing\nthe less-informative categorical supervision of the data. Such a textual\nexpert's knowledge, which we compiled from the relevant clinical literature and\ncommunity standards, describes the fine-grained features of the pathologies as\nwell as the hierarchies and dependencies between them. We report comprehensive\nevaluations, which illustrate the benefit of integrating expert knowledge and\nthe strong generalization capabilities of FLAIR under difficult scenarios with\ndomain shifts or unseen categories. When adapted with a lightweight linear\nprobe, FLAIR outperforms fully-trained, dataset-focused models, more so in the\nfew-shot regimes. 
Interestingly, FLAIR outperforms by a large margin more\ngeneralist, larger-scale image-language models, which emphasizes the potential\nof embedding experts' domain knowledge and the limitations of generalist models\nin medical imaging.\n","authors":["Julio Silva-Rodriguez","Hadi Chakor","Riadh Kobbi","Jose Dolz","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2308.07898v1.pdf","comment":"The pre-trained model is available at:\n https://github.com/jusiro/FLAIR"},{"id":"http://arxiv.org/abs/2308.01246v2","updated":"2023-08-15T17:39:05Z","published":"2023-08-02T16:00:39Z","title":"Tirtha -- An Automated Platform to Crowdsource Images and Create 3D\n Models of Heritage Sites","summary":" Digital preservation of Cultural Heritage (CH) sites is crucial to protect\nthem against damage from natural disasters or human activities. Creating 3D\nmodels of CH sites has become a popular method of digital preservation thanks\nto advancements in computer vision and photogrammetry. However, the process is\ntime-consuming, expensive, and typically requires specialized equipment and\nexpertise, posing challenges in resource-limited developing countries.\nAdditionally, the lack of an open repository for 3D models hinders research and\npublic engagement with their heritage. To address these issues, we propose\nTirtha, a web platform for crowdsourcing images of CH sites and creating their\n3D models. Tirtha utilizes state-of-the-art Structure from Motion (SfM) and\nMulti-View Stereo (MVS) techniques. It is modular, extensible and\ncost-effective, allowing for the incorporation of new techniques as\nphotogrammetry advances. Tirtha is accessible through a web interface at\nhttps://tirtha.niser.ac.in and can be deployed on-premise or in a cloud\nenvironment. In our case studies, we demonstrate the pipeline's effectiveness\nby creating 3D models of temples in Odisha, India, using crowdsourced images.\nThese models are available for viewing, interaction, and download on the Tirtha\nwebsite. Our work aims to provide a dataset of crowdsourced images and 3D\nreconstructions for research in computer vision, heritage conservation, and\nrelated domains. Overall, Tirtha is a step towards democratizing digital\npreservation, primarily in resource-limited developing countries.\n","authors":["Jyotirmaya Shivottam","Subhankar Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.01246v2.pdf","comment":"Accepted at The 28th International ACM Conference on 3D Web\n Technology (Web3D 2023)"},{"id":"http://arxiv.org/abs/2308.07893v1","updated":"2023-08-15T17:34:54Z","published":"2023-08-15T17:34:54Z","title":"Memory-and-Anticipation Transformer for Online Action Understanding","summary":" Most existing forecasting systems are memory-based methods, which attempt to\nmimic human forecasting ability by employing various memory mechanisms and have\nprogressed in temporal modeling for memory dependency. Nevertheless, an obvious\nweakness of this paradigm is that it can only model limited historical\ndependence and can not transcend the past. In this paper, we rethink the\ntemporal dependence of event evolution and propose a novel\nmemory-anticipation-based paradigm to model an entire temporal structure,\nincluding the past, present, and future. Based on this idea, we present\nMemory-and-Anticipation Transformer (MAT), a memory-anticipation-based\napproach, to address the online action detection and anticipation tasks. 
In\naddition, owing to the inherent superiority of MAT, it can process online\naction detection and anticipation tasks in a unified manner. The proposed MAT\nmodel is tested on four challenging benchmarks TVSeries, THUMOS'14, HDD, and\nEPIC-Kitchens-100, for online action detection and anticipation tasks, and it\nsignificantly outperforms all existing methods. Code is available at\nhttps://github.com/Echo0125/Memory-and-Anticipation-Transformer.\n","authors":["Jiahao Wang","Guo Chen","Yifei Huang","Limin Wang","Tong Lu"],"pdf_url":"https://arxiv.org/pdf/2308.07893v1.pdf","comment":"ICCV 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2211.15377v4","updated":"2023-08-15T17:33:53Z","published":"2022-11-23T09:57:17Z","title":"Whose Emotion Matters? Speaking Activity Localisation without Prior\n Knowledge","summary":" The task of emotion recognition in conversations (ERC) benefits from the\navailability of multiple modalities, as provided, for example, in the\nvideo-based Multimodal EmotionLines Dataset (MELD). However, only a few\nresearch approaches use both acoustic and visual information from the MELD\nvideos. There are two reasons for this: First, label-to-video alignments in\nMELD are noisy, making those videos an unreliable source of emotional speech\ndata. Second, conversations can involve several people in the same scene, which\nrequires the localisation of the utterance source. In this paper, we introduce\nMELD with Fixed Audiovisual Information via Realignment (MELD-FAIR) by using\nrecent active speaker detection and automatic speech recognition models, we are\nable to realign the videos of MELD and capture the facial expressions from\nspeakers in 96.92% of the utterances provided in MELD. Experiments with a\nself-supervised voice recognition model indicate that the realigned MELD-FAIR\nvideos more closely match the transcribed utterances given in the MELD dataset.\nFinally, we devise a model for emotion recognition in conversations trained on\nthe realigned MELD-FAIR videos, which outperforms state-of-the-art models for\nERC based on vision alone. This indicates that localising the source of\nspeaking activities is indeed effective for extracting facial expressions from\nthe uttering speakers and that faces provide more informative visual cues than\nthe visual features state-of-the-art models have been using so far. The\nMELD-FAIR realignment data, and the code of the realignment procedure and of\nthe emotional recognition, are available at\nhttps://github.com/knowledgetechnologyuhh/MELD-FAIR.\n","authors":["Hugo Carneiro","Cornelius Weber","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2211.15377v4.pdf","comment":"17 pages, 8 figures, 7 tables, Published in Neurocomputing"},{"id":"http://arxiv.org/abs/2308.07891v1","updated":"2023-08-15T17:33:24Z","published":"2023-08-15T17:33:24Z","title":"Link-Context Learning for Multimodal LLMs","summary":" The ability to learn from context with novel concepts, and deliver\nappropriate responses are essential in human conversations. Despite current\nMultimodal Large Language Models (MLLMs) and Large Language Models (LLMs) being\ntrained on mega-scale datasets, recognizing unseen images or understanding\nnovel concepts in a training-free manner remains a challenge. In-Context\nLearning (ICL) explores training-free few-shot learning, where models are\nencouraged to ``learn to learn\" from limited tasks and generalize to unseen\ntasks. 
In this work, we propose link-context learning (LCL), which emphasizes\n\"reasoning from cause and effect\" to augment the learning capabilities of\nMLLMs. LCL goes beyond traditional ICL by explicitly strengthening the causal\nrelationship between the support set and the query set. By providing\ndemonstrations with causal links, LCL guides the model to discern not only the\nanalogy but also the underlying causal associations between data points, which\nempowers MLLMs to recognize unseen images and understand novel concepts more\neffectively. To facilitate the evaluation of this novel approach, we introduce\nthe ISEKAI dataset, comprising exclusively of unseen generated image-label\npairs designed for link-context learning. Extensive experiments show that our\nLCL-MLLM exhibits strong link-context learning capabilities to novel concepts\nover vanilla MLLMs. Code and data will be released at\nhttps://github.com/isekai-portal/Link-Context-Learning.\n","authors":["Yan Tai","Weichen Fan","Zhao Zhang","Feng Zhu","Rui Zhao","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.07891v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.07885v1","updated":"2023-08-15T17:22:42Z","published":"2023-08-15T17:22:42Z","title":"The Challenge of Fetal Cardiac MRI Reconstruction Using Deep Learning","summary":" Dynamic free-breathing fetal cardiac MRI is one of the most challenging\nmodalities, which requires high temporal and spatial resolution to depict rapid\nchanges in a small fetal heart. The ability of deep learning methods to recover\nundersampled data could help to optimise the kt-SENSE acquisition strategy and\nimprove non-gated kt-SENSE reconstruction quality. In this work, we explore\nsupervised deep learning networks for reconstruction of kt-SENSE style acquired\ndata using an extensive in vivo dataset. Having access to fully-sampled\nlow-resolution multi-coil fetal cardiac MRI, we study the performance of the\nnetworks to recover fully-sampled data from undersampled data. We consider\nmodel architectures together with training strategies taking into account their\napplication in the real clinical setup used to collect the dataset to enable\nnetworks to recover prospectively undersampled data. We explore a set of\nmodifications to form a baseline performance evaluation for dynamic fetal\ncardiac MRI on real data. We systematically evaluate the models on\ncoil-combined data to reveal the effect of the suggested changes to the\narchitecture in the context of fetal heart properties. We show that the\nbest-performers recover a detailed depiction of the maternal anatomy on a large\nscale, but the dynamic properties of the fetal heart are under-represented.\nTraining directly on multi-coil data improves the performance of the models,\nallows their prospective application to undersampled data and makes them\noutperform CTFNet introduced for adult cardiac cine MRI. However, these models\ndeliver similar qualitative performances recovering the maternal body very well\nbut underestimating the dynamic properties of fetal heart. 
This dynamic feature\nof fast change of fetal heart that is highly localised suggests both more\ntargeted training and evaluation methods might be needed for fetal heart\napplication.\n","authors":["Denis Prokopenko","Kerstin Hammernik","Thomas Roberts","David F A Lloyd","Daniel Rueckert","Joseph V Hajnal"],"pdf_url":"https://arxiv.org/pdf/2308.07885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07180v2","updated":"2023-08-15T16:58:42Z","published":"2023-08-14T14:39:06Z","title":"SEMI-CenterNet: A Machine Learning Facilitated Approach for\n Semiconductor Defect Inspection","summary":" Continual shrinking of pattern dimensions in the semiconductor domain is\nmaking it increasingly difficult to inspect defects due to factors such as the\npresence of stochastic noise and the dynamic behavior of defect patterns and\ntypes. Conventional rule-based methods and non-parametric supervised machine\nlearning algorithms like KNN mostly fail at the requirements of semiconductor\ndefect inspection at these advanced nodes. Deep Learning (DL)-based methods\nhave gained popularity in the semiconductor defect inspection domain because\nthey have been proven robust towards these challenging scenarios. In this\nresearch work, we have presented an automated DL-based approach for efficient\nlocalization and classification of defects in SEM images. We have proposed\nSEMI-CenterNet (SEMI-CN), a customized CN architecture trained on SEM images of\nsemiconductor wafer defects. The use of the proposed CN approach allows\nimproved computational efficiency compared to previously studied DL models.\nSEMI-CN gets trained to output the center, class, size, and offset of a defect\ninstance. This is different from the approach of most object detection models\nthat use anchors for bounding box prediction. Previous methods predict\nredundant bounding boxes, most of which are discarded in postprocessing. CN\nmitigates this by only predicting boxes for likely defect center points. We\ntrain SEMI-CN on two datasets and benchmark two ResNet backbones for the\nframework. Initially, ResNet models pretrained on the COCO dataset undergo\ntraining using two datasets separately. Primarily, SEMI-CN shows significant\nimprovement in inference time against previous research works. Finally,\ntransfer learning (using weights of custom SEM dataset) is applied from ADI\ndataset to AEI dataset and vice-versa, which reduces the required training time\nfor both backbones to reach the best mAP against conventional training method.\n","authors":["Vic De Ridder","Bappaditya Dey","Enrique Dehaerne","Sandip Halder","Stefan De Gendt","Bartel Van Waeyenberge"],"pdf_url":"https://arxiv.org/pdf/2308.07180v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06198v2","updated":"2023-08-15T16:42:07Z","published":"2023-08-11T15:43:37Z","title":"DIG In: Evaluating Disparities in Image Generations with Indicators for\n Geographic Diversity","summary":" The unprecedented photorealistic results achieved by recent text-to-image\ngenerative systems and their increasing use as plug-and-play content creation\nsolutions make it crucial to understand their potential biases. In this work,\nwe introduce three indicators to evaluate the realism, diversity and\nprompt-generation consistency of text-to-image generative systems when prompted\nto generate objects from across the world. 
Our indicators complement\nqualitative analysis of the broader impact of such systems by enabling\nautomatic and efficient benchmarking of geographic disparities, an important\nstep towards building responsible visual content creation systems. We use our\nproposed indicators to analyze potential geographic biases in state-of-the-art\nvisual content creation systems and find that: (1) models have less realism and\ndiversity of generations when prompting for Africa and West Asia than Europe,\n(2) prompting with geographic information comes at a cost to prompt-consistency\nand diversity of generated images, and (3) models exhibit more region-level\ndisparities for some objects than others. Perhaps most interestingly, our\nindicators suggest that progress in image generation quality has come at the\ncost of real-world geographic representation. Our comprehensive evaluation\nconstitutes a crucial step towards ensuring a positive experience of visual\ncontent creation for everyone.\n","authors":["Melissa Hall","Candace Ross","Adina Williams","Nicolas Carion","Michal Drozdzal","Adriana Romero Soriano"],"pdf_url":"https://arxiv.org/pdf/2308.06198v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07874v1","updated":"2023-08-15T16:40:46Z","published":"2023-08-15T16:40:46Z","title":"SEDA: Self-Ensembling ViT with Defensive Distillation and Adversarial\n Training for robust Chest X-rays Classification","summary":" Deep Learning methods have recently seen increased adoption in medical\nimaging applications. However, elevated vulnerabilities have been explored in\nrecent Deep Learning solutions, which can hinder future adoption. Particularly,\nthe vulnerability of Vision Transformer (ViT) to adversarial, privacy, and\nconfidentiality attacks raise serious concerns about their reliability in\nmedical settings. This work aims to enhance the robustness of self-ensembling\nViTs for the tuberculosis chest x-ray classification task. We propose\nSelf-Ensembling ViT with defensive Distillation and Adversarial training\n(SEDA). SEDA utilizes efficient CNN blocks to learn spatial features with\nvarious levels of abstraction from feature representations extracted from\nintermediate ViT blocks, that are largely unaffected by adversarial\nperturbations. Furthermore, SEDA leverages adversarial training in combination\nwith defensive distillation for improved robustness against adversaries.\nTraining using adversarial examples leads to better model generalizability and\nimproves its ability to handle perturbations. Distillation using soft\nprobabilities introduces uncertainty and variation into the output\nprobabilities, making it more difficult for adversarial and privacy attacks.\nExtensive experiments performed with the proposed architecture and training\nparadigm on publicly available Tuberculosis x-ray dataset shows SOTA efficacy\nof SEDA compared to SEViT in terms of computational efficiency with 70x times\nlighter framework and enhanced robustness of +9%.\n","authors":["Raza Imam","Ibrahim Almakky","Salma Alrashdi","Baketah Alrashdi","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2308.07874v1.pdf","comment":"Accepted at DART (Domain Adaptation and Representation Transfer)\n Workshop, MICCAI, 2023. 
Code: https://github.com/Razaimam45/SEDA"},{"id":"http://arxiv.org/abs/2308.07871v1","updated":"2023-08-15T16:39:10Z","published":"2023-08-15T16:39:10Z","title":"Emotion Embeddings $\\unicode{x2014}$ Learning Stable and Homogeneous\n Abstractions from Heterogeneous Affective Datasets","summary":" Human emotion is expressed in many communication modalities and media formats\nand so their computational study is equally diversified into natural language\nprocessing, audio signal analysis, computer vision, etc. Similarly, the large\nvariety of representation formats used in previous research to describe\nemotions (polarity scales, basic emotion categories, dimensional approaches,\nappraisal theory, etc.) have led to an ever proliferating diversity of\ndatasets, predictive models, and software tools for emotion analysis. Because\nof these two distinct types of heterogeneity, at the expressional and\nrepresentational level, there is a dire need to unify previous work on\nincreasingly diverging data and label types. This article presents such a\nunifying computational model. We propose a training procedure that learns a\nshared latent representation for emotions, so-called emotion embeddings,\nindependent of different natural languages, communication modalities, media or\nrepresentation label formats, and even disparate model architectures.\nExperiments on a wide range of heterogeneous affective datasets indicate that\nthis approach yields the desired interoperability for the sake of reusability,\ninterpretability and flexibility, without penalizing prediction quality. Code\nand data are archived under https://doi.org/10.5281/zenodo.7405327 .\n","authors":["Sven Buechel","Udo Hahn"],"pdf_url":"https://arxiv.org/pdf/2308.07871v1.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.07868v1","updated":"2023-08-15T16:35:40Z","published":"2023-08-15T16:35:40Z","title":"ObjectSDF++: Improved Object-Compositional Neural Implicit Surfaces","summary":" In recent years, neural implicit surface reconstruction has emerged as a\npopular paradigm for multi-view 3D reconstruction. Unlike traditional\nmulti-view stereo approaches, the neural implicit surface-based methods\nleverage neural networks to represent 3D scenes as signed distance functions\n(SDFs). However, they tend to disregard the reconstruction of individual\nobjects within the scene, which limits their performance and practical\napplications. To address this issue, previous work ObjectSDF introduced a nice\nframework of object-composition neural implicit surfaces, which utilizes 2D\ninstance masks to supervise individual object SDFs. In this paper, we propose a\nnew framework called ObjectSDF++ to overcome the limitations of ObjectSDF.\nFirst, in contrast to ObjectSDF whose performance is primarily restricted by\nits converted semantic field, the core component of our model is an\nocclusion-aware object opacity rendering formulation that directly\nvolume-renders object opacity to be supervised with instance masks. Second, we\ndesign a novel regularization term for object distinction, which can\neffectively mitigate the issue that ObjectSDF may result in unexpected\nreconstruction in invisible regions due to the lack of constraint to prevent\ncollisions. Our extensive experiments demonstrate that our novel framework not\nonly produces superior object reconstruction results but also significantly\nimproves the quality of scene reconstruction. 
Code and more resources can be\nfound in \\url{https://qianyiwu.github.io/objectsdf++}\n","authors":["Qianyi Wu","Kaisiyuan Wang","Kejie Li","Jianmin Zheng","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2308.07868v1.pdf","comment":"ICCV 2023. Project Page: https://qianyiwu.github.io/objectsdf++ Code:\n https://github.com/QianyiWu/objectsdf_plus"},{"id":"http://arxiv.org/abs/2308.07863v1","updated":"2023-08-15T16:30:49Z","published":"2023-08-15T16:30:49Z","title":"StyleDiffusion: Controllable Disentangled Style Transfer via Diffusion\n Models","summary":" Content and style (C-S) disentanglement is a fundamental problem and critical\nchallenge of style transfer. Existing approaches based on explicit definitions\n(e.g., Gram matrix) or implicit learning (e.g., GANs) are neither interpretable\nnor easy to control, resulting in entangled representations and less satisfying\nresults. In this paper, we propose a new C-S disentangled framework for style\ntransfer without using previous assumptions. The key insight is to explicitly\nextract the content information and implicitly learn the complementary style\ninformation, yielding interpretable and controllable C-S disentanglement and\nstyle transfer. A simple yet effective CLIP-based style disentanglement loss\ncoordinated with a style reconstruction prior is introduced to disentangle C-S\nin the CLIP image space. By further leveraging the powerful style removal and\ngenerative ability of diffusion models, our framework achieves superior results\nthan state of the art and flexible C-S disentanglement and trade-off control.\nOur work provides new insights into the C-S disentanglement in style transfer\nand demonstrates the potential of diffusion models for learning\nwell-disentangled C-S characteristics.\n","authors":["Zhizhong Wang","Lei Zhao","Wei Xing"],"pdf_url":"https://arxiv.org/pdf/2308.07863v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2305.17401v3","updated":"2023-08-15T16:02:43Z","published":"2023-05-27T07:59:49Z","title":"A Framework For Refining Text Classification and Object Recognition from\n Academic Articles","summary":" With the widespread use of the internet, it has become increasingly crucial\nto extract specific information from vast amounts of academic articles\nefficiently. Data mining techniques are generally employed to solve this issue.\nHowever, data mining for academic articles is challenging since it requires\nautomatically extracting specific patterns in complex and unstructured layout\ndocuments. Current data mining methods for academic articles employ\nrule-based(RB) or machine learning(ML) approaches. However, using rule-based\nmethods incurs a high coding cost for complex typesetting articles. On the\nother hand, simply using machine learning methods requires annotation work for\ncomplex content types within the paper, which can be costly. Furthermore, only\nusing machine learning can lead to cases where patterns easily recognized by\nrule-based methods are mistakenly extracted. To overcome these issues, from the\nperspective of analyzing the standard layout and typesetting used in the\nspecified publication, we emphasize implementing specific methods for specific\ncharacteristics in academic articles. We have developed a novel Text Block\nRefinement Framework (TBRF), a machine learning and rule-based scheme hybrid.\nWe used the well-known ACL proceeding articles as experimental data for the\nvalidation experiment. 
The experiment shows that our approach achieved over 95%\nclassification accuracy and 90% detection accuracy for tables and figures.\n","authors":["Jinghong Li","Koichi Ota","Wen Gu","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2305.17401v3.pdf","comment":"This paper has been accepted at 'The International Symposium on\n Innovations in Intelligent Systems and Applications 2023 (INISTA 2023)'"},{"id":"http://arxiv.org/abs/2308.03202v3","updated":"2023-08-15T15:47:34Z","published":"2023-08-06T20:19:06Z","title":"Source-free Domain Adaptive Human Pose Estimation","summary":" Human Pose Estimation (HPE) is widely used in various fields, including\nmotion analysis, healthcare, and virtual reality. However, the great expenses\nof labeled real-world datasets present a significant challenge for HPE. To\novercome this, one approach is to train HPE models on synthetic datasets and\nthen perform domain adaptation (DA) on real-world data. Unfortunately, existing\nDA methods for HPE neglect data privacy and security by using both source and\ntarget data in the adaptation process. To this end, we propose a new task,\nnamed source-free domain adaptive HPE, which aims to address the challenges of\ncross-domain learning of HPE without access to source data during the\nadaptation process. We further propose a novel framework that consists of three\nmodels: source model, intermediate model, and target model, which explores the\ntask from both source-protect and target-relevant perspectives. The\nsource-protect module preserves source information more effectively while\nresisting noise, and the target-relevant module reduces the sparsity of spatial\nrepresentations by building a novel spatial probability space, and\npose-specific contrastive learning and information maximization are proposed on\nthe basis of this space. Comprehensive experiments on several domain adaptive\nHPE benchmarks show that the proposed method outperforms existing approaches by\na considerable margin. The codes are available at\nhttps://github.com/davidpengucf/SFDAHPE.\n","authors":["Qucheng Peng","Ce Zheng","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03202v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07837v1","updated":"2023-08-15T15:27:42Z","published":"2023-08-15T15:27:42Z","title":"CCD-3DR: Consistent Conditioning in Diffusion for Single-Image 3D\n Reconstruction","summary":" In this paper, we present a novel shape reconstruction method leveraging\ndiffusion model to generate 3D sparse point cloud for the object captured in a\nsingle RGB image. Recent methods typically leverage global embedding or local\nprojection-based features as the condition to guide the diffusion model.\nHowever, such strategies fail to consistently align the denoised point cloud\nwith the given image, leading to unstable conditioning and inferior\nperformance. In this paper, we present CCD-3DR, which exploits a novel centered\ndiffusion probabilistic model for consistent local feature conditioning. We\nconstrain the noise and sampled point cloud from the diffusion model into a\nsubspace where the point cloud center remains unchanged during the forward\ndiffusion process and reverse process. The stable point cloud center further\nserves as an anchor to align each point with its corresponding local\nprojection-based features. Extensive experiments on synthetic benchmark\nShapeNet-R2N2 demonstrate that CCD-3DR outperforms all competitors by a large\nmargin, with over 40% improvement. 
We also provide results on real-world\ndataset Pix3D to thoroughly demonstrate the potential of CCD-3DR in real-world\napplications. Codes will be released soon\n","authors":["Yan Di","Chenyangguang Zhang","Pengyuan Wang","Guangyao Zhai","Ruida Zhang","Fabian Manhardt","Benjamin Busam","Xiangyang Ji","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2308.07837v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2308.07827v1","updated":"2023-08-15T15:11:13Z","published":"2023-08-15T15:11:13Z","title":"Learning Better Keypoints for Multi-Object 6DoF Pose Estimation","summary":" We investigate the impact of pre-defined keypoints for pose estimation, and\nfound that accuracy and efficiency can be improved by training a graph network\nto select a set of disperse keypoints with similarly distributed votes. These\nvotes, learned by a regression network to accumulate evidence for the keypoint\nlocations, can be regressed more accurately compared to previous heuristic\nkeypoint algorithms. The proposed KeyGNet, supervised by a combined loss\nmeasuring both Wassserstein distance and dispersion, learns the color and\ngeometry features of the target objects to estimate optimal keypoint locations.\nExperiments demonstrate the keypoints selected by KeyGNet improved the accuracy\nfor all evaluation metrics of all seven datasets tested, for three keypoint\nvoting methods. The challenging Occlusion LINEMOD dataset notably improved\nADD(S) by +16.4% on PVN3D, and all core BOP datasets showed an AR improvement\nfor all objects, of between +1% and +21.5%. There was also a notable increase\nin performance when transitioning from single object to multiple object\ntraining using KeyGNet keypoints, essentially eliminating the SISO-MIMO gap for\nOcclusion LINEMOD.\n","authors":["Yangzheng Wu","Michael Greenspan"],"pdf_url":"https://arxiv.org/pdf/2308.07827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.00992v2","updated":"2023-08-15T15:06:47Z","published":"2022-10-03T14:58:17Z","title":"Feature Embedding by Template Matching as a ResNet Block","summary":" Convolution blocks serve as local feature extractors and are the key to\nsuccess of the neural networks. To make local semantic feature embedding rather\nexplicit, we reformulate convolution blocks as feature selection according to\nthe best matching kernel. In this manner, we show that typical ResNet blocks\nindeed perform local feature embedding via template matching once batch\nnormalization (BN) followed by a rectified linear unit (ReLU) is interpreted as\narg-max optimizer. Following this perspective, we tailor a residual block that\nexplicitly forces semantically meaningful local feature embedding through using\nlabel information. Specifically, we assign a feature vector to each local\nregion according to the classes that the corresponding region matches. We\nevaluate our method on three popular benchmark datasets with several\narchitectures for image classification and consistently show that our approach\nsubstantially improves the performance of the baseline architectures.\n","authors":["Ada Gorgun","Yeti Z. Gurbuz","A. 
Aydin Alatan"],"pdf_url":"https://arxiv.org/pdf/2210.00992v2.pdf","comment":"Accepted at the British Machine Vision Conference 2022 (BMVC 2022)"},{"id":"http://arxiv.org/abs/2307.06542v2","updated":"2023-08-15T15:06:30Z","published":"2023-07-13T03:11:09Z","title":"Quantum Image Denoising: A Framework via Boltzmann Machines, QUBO, and\n Quantum Annealing","summary":" We investigate a framework for binary image denoising via restricted\nBoltzmann machines (RBMs) that introduces a denoising objective in quadratic\nunconstrained binary optimization (QUBO) form and is well-suited for quantum\nannealing. The denoising objective is attained by balancing the distribution\nlearned by a trained RBM with a penalty term for derivations from the noisy\nimage. We derive the statistically optimal choice of the penalty parameter\nassuming the target distribution has been well-approximated, and further\nsuggest an empirically supported modification to make the method robust to that\nidealistic assumption. We also show under additional assumptions that the\ndenoised images attained by our method are, in expectation, strictly closer to\nthe noise-free images than the noisy images are. While we frame the model as an\nimage denoising model, it can be applied to any binary data. As the QUBO\nformulation is well-suited for implementation on quantum annealers, we test the\nmodel on a D-Wave Advantage machine, and also test on data too large for\ncurrent quantum annealers by approximating QUBO solutions through classical\nheuristics.\n","authors":["Phillip Kerger","Ryoji Miyazaki"],"pdf_url":"https://arxiv.org/pdf/2307.06542v2.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2303.18125v3","updated":"2023-08-15T15:06:24Z","published":"2023-03-31T15:09:18Z","title":"Towards Nonlinear-Motion-Aware and Occlusion-Robust Rolling Shutter\n Correction","summary":" This paper addresses the problem of rolling shutter correction in complex\nnonlinear and dynamic scenes with extreme occlusion. Existing methods suffer\nfrom two main drawbacks. Firstly, they face challenges in estimating the\naccurate correction field due to the uniform velocity assumption, leading to\nsignificant image correction errors under complex motion. Secondly, the drastic\nocclusion in dynamic scenes prevents current solutions from achieving better\nimage quality because of the inherent difficulties in aligning and aggregating\nmultiple frames. To tackle these challenges, we model the curvilinear\ntrajectory of pixels analytically and propose a geometry-based Quadratic\nRolling Shutter (QRS) motion solver, which precisely estimates the high-order\ncorrection field of individual pixels. Besides, to reconstruct high-quality\nocclusion frames in dynamic scenes, we present a 3D video architecture that\neffectively Aligns and Aggregates multi-frame context, namely, RSA2-Net. We\nevaluate our method across a broad range of cameras and video sequences,\ndemonstrating its significant superiority. Specifically, our method surpasses\nthe state-of-the-art by +4.98, +0.77, and +4.33 of PSNR on Carla-RS, Fastec-RS,\nand BS-RSC datasets, respectively. 
Code is available at\nhttps://github.com/DelinQu/qrsc.\n","authors":["Delin Qu","Yizhen Lao","Zhigang Wang","Dong Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2303.18125v3.pdf","comment":"accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07815v1","updated":"2023-08-15T14:46:32Z","published":"2023-08-15T14:46:32Z","title":"ImbSAM: A Closer Look at Sharpness-Aware Minimization in\n Class-Imbalanced Recognition","summary":" Class imbalance is a common challenge in real-world recognition tasks, where\nthe majority of classes have few samples, also known as tail classes. We\naddress this challenge with the perspective of generalization and empirically\nfind that the promising Sharpness-Aware Minimization (SAM) fails to address\ngeneralization issues under the class-imbalanced setting. Through investigating\nthis specific type of task, we identify that its generalization bottleneck\nprimarily lies in the severe overfitting for tail classes with limited training\ndata. To overcome this bottleneck, we leverage class priors to restrict the\ngeneralization scope of the class-agnostic SAM and propose a class-aware\nsmoothness optimization algorithm named Imbalanced-SAM (ImbSAM). With the\nguidance of class priors, our ImbSAM specifically improves generalization\ntargeting tail classes. We also verify the efficacy of ImbSAM on two\nprototypical applications of class-imbalanced recognition: long-tailed\nclassification and semi-supervised anomaly detection, where our ImbSAM\ndemonstrates remarkable performance improvements for tail classes and anomaly.\nOur code implementation is available at\nhttps://github.com/cool-xuan/Imbalanced_SAM.\n","authors":["Yixuan Zhou","Yi Qu","Xing Xu","Hengtao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.07815v1.pdf","comment":"Accepted by International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2308.07807v1","updated":"2023-08-15T14:33:17Z","published":"2023-08-15T14:33:17Z","title":"Grasp Transfer based on Self-Aligning Implicit Representations of Local\n Surfaces","summary":" Objects we interact with and manipulate often share similar parts, such as\nhandles, that allow us to transfer our actions flexibly due to their shared\nfunctionality. This work addresses the problem of transferring a grasp\nexperience or a demonstration to a novel object that shares shape similarities\nwith objects the robot has previously encountered. Existing approaches for\nsolving this problem are typically restricted to a specific object category or\na parametric shape. Our approach, however, can transfer grasps associated with\nimplicit models of local surfaces shared across object categories.\nSpecifically, we employ a single expert grasp demonstration to learn an\nimplicit local surface representation model from a small dataset of object\nmeshes. At inference time, this model is used to transfer grasps to novel\nobjects by identifying the most geometrically similar surfaces to the one on\nwhich the expert grasp is demonstrated. Our model is trained entirely in\nsimulation and is evaluated on simulated and real-world objects that are not\nseen during training. Evaluations indicate that grasp transfer to unseen object\ncategories using this approach can be successfully performed both in simulation\nand real-world experiments. 
The simulation results also show that the proposed\napproach leads to better spatial precision and grasp accuracy compared to a\nbaseline approach.\n","authors":["Ahmet Tekden","Marc Peter Deisenroth","Yasemin Bekiroglu"],"pdf_url":"https://arxiv.org/pdf/2308.07807v1.pdf","comment":"Accepted by IEEE RAL. 8 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2307.16694v2","updated":"2023-08-15T14:28:02Z","published":"2023-07-31T14:09:03Z","title":"Investigating and Improving Latent Density Segmentation Models for\n Aleatoric Uncertainty Quantification in Medical Imaging","summary":" Data uncertainties, such as sensor noise or occlusions, can introduce\nirreducible ambiguities in images, which result in varying, yet plausible,\nsemantic hypotheses. In Machine Learning, this ambiguity is commonly referred\nto as aleatoric uncertainty. Latent density models can be utilized to address\nthis problem in image segmentation. The most popular approach is the\nProbabilistic U-Net (PU-Net), which uses latent Normal densities to optimize\nthe conditional data log-likelihood Evidence Lower Bound. In this work, we\ndemonstrate that the PU-Net latent space is severely inhomogeneous. As a\nresult, the effectiveness of gradient descent is inhibited and the model\nbecomes extremely sensitive to the localization of the latent space samples,\nresulting in defective predictions. To address this, we present the Sinkhorn\nPU-Net (SPU-Net), which uses the Sinkhorn Divergence to promote homogeneity\nacross all latent dimensions, effectively improving gradient-descent updates\nand model robustness. Our results show that by applying this on public datasets\nof various clinical segmentation problems, the SPU-Net receives up to 11%\nperformance gains compared against preceding latent variable models for\nprobabilistic segmentation on the Hungarian-Matched metric. The results\nindicate that by encouraging a homogeneous latent space, one can significantly\nimprove latent density modeling for medical image segmentation.\n","authors":["M. M. Amaan Valiuddin","Christiaan G. A. Viviers","Ruud J. G. van Sloun","Peter H. N. de With","Fons van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2307.16694v2.pdf","comment":"12 pages incl. references, 11 figures"},{"id":"http://arxiv.org/abs/2308.07802v1","updated":"2023-08-15T14:27:46Z","published":"2023-08-15T14:27:46Z","title":"Neuromorphic Seatbelt State Detection for In-Cabin Monitoring with Event\n Cameras","summary":" Neuromorphic vision sensors, or event cameras, differ from conventional\ncameras in that they do not capture images at a specified rate. Instead, they\nasynchronously log local brightness changes at each pixel. As a result, event\ncameras only record changes in a given scene, and do so with very high temporal\nresolution, high dynamic range, and low power requirements. Recent research has\ndemonstrated how these characteristics make event cameras extremely practical\nsensors in driver monitoring systems (DMS), enabling the tracking of high-speed\neye motion and blinks. This research provides a proof of concept to expand\nevent-based DMS techniques to include seatbelt state detection. Using an event\nsimulator, a dataset of 108,691 synthetic neuromorphic frames of car occupants\nwas generated from a near-infrared (NIR) dataset, and split into training,\nvalidation, and test sets for a seatbelt state detection algorithm based on a\nrecurrent convolutional neural network (CNN). In addition, a smaller set of\nreal event data was collected and reserved for testing. 
In a binary\nclassification task, the fastened/unfastened frames were identified with an F1\nscore of 0.989 and 0.944 on the simulated and real test sets respectively. When\nthe problem extended to also classify the action of fastening/unfastening the\nseatbelt, respective F1 scores of 0.964 and 0.846 were achieved.\n","authors":["Paul Kielty","Cian Ryan","Mehdi Sefidgar Dilmaghani","Waseem Shariff","Joe Lemley","Peter Corcoran"],"pdf_url":"https://arxiv.org/pdf/2308.07802v1.pdf","comment":"4 pages, 3 figures, IMVIP 2023"},{"id":"http://arxiv.org/abs/2308.07799v1","updated":"2023-08-15T14:25:53Z","published":"2023-08-15T14:25:53Z","title":"Handwritten Stenography Recognition and the LION Dataset","summary":" Purpose: In this paper, we establish a baseline for handwritten stenography\nrecognition, using the novel LION dataset, and investigate the impact of\nincluding selected aspects of stenographic theory into the recognition process.\nWe make the LION dataset publicly available with the aim of encouraging future\nresearch in handwritten stenography recognition.\n Methods: A state-of-the-art text recognition model is trained to establish a\nbaseline. Stenographic domain knowledge is integrated by applying four\ndifferent encoding methods that transform the target sequence into\nrepresentations, which approximate selected aspects of the writing system.\nResults are further improved by integrating a pre-training scheme, based on\nsynthetic data.\n Results: The baseline model achieves an average test character error rate\n(CER) of 29.81% and a word error rate (WER) of 55.14%. Test error rates are\nreduced significantly by combining stenography-specific target sequence\nencodings with pre-training and fine-tuning, yielding CERs in the range of\n24.5% - 26% and WERs of 44.8% - 48.2%.\n Conclusion: The obtained results demonstrate the challenging nature of\nstenography recognition. Integrating stenography-specific knowledge, in\nconjunction with pre-training and fine-tuning on synthetic data, yields\nconsiderable improvements. Together with our precursor study on the subject,\nthis is the first work to apply modern handwritten text recognition to\nstenography. The dataset and our code are publicly available via Zenodo.\n","authors":["Raphaela Heil","Malin Nauwerck"],"pdf_url":"https://arxiv.org/pdf/2308.07799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07795v1","updated":"2023-08-15T14:21:24Z","published":"2023-08-15T14:21:24Z","title":"Learning to Identify Critical States for Reinforcement Learning from\n Videos","summary":" Recent work on deep reinforcement learning (DRL) has pointed out that\nalgorithmic information about good policies can be extracted from offline data\nwhich lack explicit information about executed actions. For example, videos of\nhumans or robots may convey a lot of implicit information about rewarding\naction sequences, but a DRL machine that wants to profit from watching such\nvideos must first learn by itself to identify and recognize relevant\nstates/actions/rewards. Without relying on ground-truth annotations, our new\nmethod called Deep State Identifier learns to predict returns from episodes\nencoded as videos. Then it uses a kind of mask-based sensitivity analysis to\nextract/identify important critical states. Extensive experiments showcase our\nmethod's potential for understanding and improving agent behavior. 
The source\ncode and the generated datasets are available at\nhttps://github.com/AI-Initiative-KAUST/VideoRLCS.\n","authors":["Haozhe Liu","Mingchen Zhuge","Bing Li","Yuhui Wang","Francesco Faccio","Bernard Ghanem","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2308.07795v1.pdf","comment":"This paper was accepted to ICCV23"},{"id":"http://arxiv.org/abs/2305.12250v2","updated":"2023-08-15T14:13:22Z","published":"2023-05-20T17:43:09Z","title":"DAC: Detector-Agnostic Spatial Covariances for Deep Local Features","summary":" Current deep visual local feature detectors do not model the spatial\nuncertainty of detected features, producing suboptimal results in downstream\napplications. In this work, we propose two post-hoc covariance estimates that\ncan be plugged into any pretrained deep feature detector: a simple, isotropic\ncovariance estimate that uses the predicted score at a given pixel location,\nand a full covariance estimate via the local structure tensor of the learned\nscore maps. Both methods are easy to implement and can be applied to any deep\nfeature detector. We show that these covariances are directly related to errors\nin feature matching, leading to improvements in downstream tasks, including\nsolving the perspective-n-point problem and motion-only bundle adjustment. Code\nis available at https://github.com/javrtg/DAC\n","authors":["Javier Tirado-Garín","Frederik Warburg","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2305.12250v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07787v1","updated":"2023-08-15T14:07:41Z","published":"2023-08-15T14:07:41Z","title":"DiffV2S: Diffusion-based Video-to-Speech Synthesis with Vision-guided\n Speaker Embedding","summary":" Recent research has demonstrated impressive results in video-to-speech\nsynthesis which involves reconstructing speech solely from visual input.\nHowever, previous works have struggled to accurately synthesize speech due to a\nlack of sufficient guidance for the model to infer the correct content with the\nappropriate sound. To resolve the issue, they have adopted an extra speaker\nembedding as a speaking style guidance from a reference auditory information.\nNevertheless, it is not always possible to obtain the audio information from\nthe corresponding video input, especially during the inference time. In this\npaper, we present a novel vision-guided speaker embedding extractor using a\nself-supervised pre-trained model and prompt tuning technique. In doing so, the\nrich speaker embedding information can be produced solely from input visual\ninformation, and the extra audio information is not necessary during the\ninference time. Using the extracted vision-guided speaker embedding\nrepresentations, we further develop a diffusion-based video-to-speech synthesis\nmodel, so called DiffV2S, conditioned on those speaker embeddings and the\nvisual representation extracted from the input video. The proposed DiffV2S not\nonly maintains phoneme details contained in the input video frames, but also\ncreates a highly intelligible mel-spectrogram in which the speaker identities\nof the multiple speakers are all preserved. 
Our experimental results show that\nDiffV2S achieves the state-of-the-art performance compared to the previous\nvideo-to-speech synthesis technique.\n","authors":["Jeongsoo Choi","Joanna Hong","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2308.07787v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07783v1","updated":"2023-08-15T14:04:50Z","published":"2023-08-15T14:04:50Z","title":"Future Video Prediction from a Single Frame for Video Anomaly Detection","summary":" Video anomaly detection (VAD) is an important but challenging task in\ncomputer vision. The main challenge arises due to the rarity of training samples\nto model all anomaly cases. Hence, semi-supervised anomaly detection methods\nhave gotten more attention, since they focus on modeling normals and they\ndetect anomalies by measuring the deviations from normal patterns. Despite\nimpressive advances of these methods in modeling normal motion and appearance,\nlong-term motion modeling has not been effectively explored so far. Inspired by\nthe abilities of the future frame prediction proxy-task, we introduce the task\nof future video prediction from a single frame, as a novel proxy-task for video\nanomaly detection. This proxy-task alleviates the challenges of previous\nmethods in learning longer motion patterns. Moreover, we replace the initial\nand future raw frames with their corresponding semantic segmentation map, which\nnot only makes the method aware of object class but also makes the prediction\ntask less complex for the model. Extensive experiments on the benchmark\ndatasets (ShanghaiTech, UCSD-Ped1, and UCSD-Ped2) show the effectiveness of the\nmethod and the superiority of its performance compared to SOTA prediction-based\nVAD methods.\n","authors":["Mohammad Baradaran","Robert Bergevin"],"pdf_url":"https://arxiv.org/pdf/2308.07783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07781v1","updated":"2023-08-15T13:59:47Z","published":"2023-08-15T13:59:47Z","title":"Learning Image Deraining Transformer Network with Dynamic Dual\n Self-Attention","summary":" Recently, Transformer-based architecture has been introduced into the single\nimage deraining task due to its advantage in modeling non-local information.\nHowever, existing approaches tend to integrate global features based on a dense\nself-attention strategy since it tends to use all similarities of the tokens\nbetween the queries and keys. In fact, this strategy leads to ignoring the most\nrelevant information and inducing a blurry effect by the irrelevant\nrepresentations during the feature aggregation. To this end, this paper\nproposes an effective image deraining Transformer with dynamic dual\nself-attention (DDSA), which combines both dense and sparse attention\nstrategies to better facilitate clear image reconstruction. Specifically, we\nonly select the most useful similarity values based on top-k approximate\ncalculation to achieve sparse attention. In addition, we also develop a novel\nspatial-enhanced feed-forward network (SEFN) to further obtain a more accurate\nrepresentation for achieving high-quality derained results. 
Extensive\nexperiments on benchmark datasets demonstrate the effectiveness of our proposed\nmethod.\n","authors":["Zhentao Fan","Hongming Chen","Yufeng Li"],"pdf_url":"https://arxiv.org/pdf/2308.07781v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.07778v1","updated":"2023-08-15T13:54:50Z","published":"2023-08-15T13:54:50Z","title":"An Interpretable Machine Learning Model with Deep Learning-based Imaging\n Biomarkers for Diagnosis of Alzheimer's Disease","summary":" Machine learning methods have shown large potential for the automatic early\ndiagnosis of Alzheimer's Disease (AD). However, some machine learning methods\nbased on imaging data have poor interpretability because it is usually unclear\nhow they make their decisions. Explainable Boosting Machines (EBMs) are\ninterpretable machine learning models based on the statistical framework of\ngeneralized additive modeling, but have so far only been used for tabular data.\nTherefore, we propose a framework that combines the strength of EBM with\nhigh-dimensional imaging data using deep learning-based feature extraction. The\nproposed framework is interpretable because it provides the importance of each\nfeature. We validated the proposed framework on the Alzheimer's Disease\nNeuroimaging Initiative (ADNI) dataset, achieving accuracy of 0.883 and\narea-under-the-curve (AUC) of 0.970 on AD and control classification.\nFurthermore, we validated the proposed framework on an external testing set,\nachieving accuracy of 0.778 and AUC of 0.887 on AD and subjective cognitive\ndecline (SCD) classification. The proposed framework significantly outperformed\nan EBM model using volume biomarkers instead of deep learning-based features,\nas well as an end-to-end convolutional neural network (CNN) with optimized\narchitecture.\n","authors":["Wenjie Kang","Bo Li","Janne M. Papma","Lize C. Jiskoot","Peter Paul De Deyn","Geert Jan Biessels","Jurgen A. H. R. Claassen","Huub A. M. Middelkoop","Wiesje M. van der Flier","Inez H. G. B. Ramakers","Stefan Klein","Esther E. Bron"],"pdf_url":"https://arxiv.org/pdf/2308.07778v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.07771v1","updated":"2023-08-15T13:45:45Z","published":"2023-08-15T13:45:45Z","title":"Dual-path TokenLearner for Remote Photoplethysmography-based\n Physiological Measurement with Facial Videos","summary":" Remote photoplethysmography (rPPG) based physiological measurement is an\nemerging yet crucial vision task, whose challenge lies in exploring accurate\nrPPG prediction from facial videos accompanied by noises of illumination\nvariations, facial occlusions, head movements, \\etc, in a non-contact manner.\nExisting mainstream CNN-based models make efforts to detect physiological\nsignals by capturing subtle color changes in facial regions of interest (ROI)\ncaused by heartbeats. However, such models are constrained by the limited local\nspatial or temporal receptive fields in the neural units. Unlike them, a native\nTransformer-based framework called Dual-path TokenLearner (Dual-TL) is proposed\nin this paper, which utilizes the concept of learnable tokens to integrate both\nspatial and temporal informative contexts from the global perspective of the\nvideo. Specifically, the proposed Dual-TL uses a Spatial TokenLearner (S-TL) to\nexplore associations in different facial ROIs, which promises the rPPG\nprediction far away from noisy ROI disturbances. 
Complementarily, a Temporal\nTokenLearner (T-TL) is designed to infer the quasi-periodic pattern of\nheartbeats, which eliminates temporal disturbances such as head movements. The\ntwo TokenLearners, S-TL and T-TL, are executed in a dual-path mode. This\nenables the model to reduce noise disturbances for final rPPG signal\nprediction. Extensive experiments on four physiological measurement benchmark\ndatasets are conducted. The Dual-TL achieves state-of-the-art performances in\nboth intra- and cross-dataset testings, demonstrating its immense potential as\na basic backbone for rPPG measurement. The source code is available at\n\\href{https://github.com/VUT-HFUT/Dual-TL}{https://github.com/VUT-HFUT/Dual-TL}\n","authors":["Wei Qian","Dan Guo","Kun Li","Xilan Tian","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07770v1","updated":"2023-08-15T13:43:48Z","published":"2023-08-15T13:43:48Z","title":"Multi-scale Promoted Self-adjusting Correlation Learning for Facial\n Action Unit Detection","summary":" Facial Action Unit (AU) detection is a crucial task in affective computing\nand social robotics as it helps to identify emotions expressed through facial\nexpressions. Anatomically, there are innumerable correlations between AUs,\nwhich contain rich information and are vital for AU detection. Previous methods\nused fixed AU correlations based on expert experience or statistical rules on\nspecific benchmarks, but it is challenging to comprehensively reflect complex\ncorrelations between AUs via hand-crafted settings. There are alternative\nmethods that employ a fully connected graph to learn these dependencies\nexhaustively. However, these approaches can result in a computational explosion\nand high dependency with a large dataset. To address these challenges, this\npaper proposes a novel self-adjusting AU-correlation learning (SACL) method\nwith less computation for AU detection. This method adaptively learns and\nupdates AU correlation graphs by efficiently leveraging the characteristics of\ndifferent levels of AU motion and emotion representation information extracted\nin different stages of the network. Moreover, this paper explores the role of\nmulti-scale learning in correlation information extraction, and design a simple\nyet effective multi-scale feature learning (MSFL) method to promote better\nperformance in AU detection. By integrating AU correlation information with\nmulti-scale features, the proposed method obtains a more robust feature\nrepresentation for the final AU detection. Extensive experiments show that the\nproposed method outperforms the state-of-the-art methods on widely used AU\ndetection benchmark datasets, with only 28.7\\% and 12.0\\% of the parameters and\nFLOPs of the best method, respectively. 
The code for this method is available\nat \\url{https://github.com/linuxsino/Self-adjusting-AU}.\n","authors":["Xin Liu","Kaishen Yuan","Xuesong Niu","Jingang Shi","Zitong Yu","Huanjing Yue","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2308.07770v1.pdf","comment":"13pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.07766v1","updated":"2023-08-15T13:35:29Z","published":"2023-08-15T13:35:29Z","title":"Whale Detection Enhancement through Synthetic Satellite Images","summary":" With a number of marine populations in rapid decline, collecting and\nanalyzing data about marine populations has become increasingly important to\ndevelop effective conservation policies for a wide range of marine animals,\nincluding whales. Modern computer vision algorithms allow us to detect whales\nin images in a wide range of domains, further speeding up and enhancing the\nmonitoring process. However, these algorithms heavily rely on large training\ndatasets, which are challenging and time-consuming to collect particularly in\nmarine or aquatic environments. Recent advances in AI however have made it\npossible to synthetically create datasets for training machine learning\nalgorithms, thus enabling new solutions that were not possible before. In this\nwork, we present a solution - SeaDroneSim2 benchmark suite, which addresses\nthis challenge by generating aerial, and satellite synthetic image datasets to\nimprove the detection of whales and reduce the effort required for training\ndata collection. We show that we can achieve a 15% performance boost on whale\ndetection compared to using the real data alone for training, by augmenting a\n10% real data. We open source both the code of the simulation platform\nSeaDroneSim2 and the dataset generated through it.\n","authors":["Akshaj Gaur","Cheng Liu","Xiaomin Lin","Nare Karapetyan","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2308.07766v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06097v2","updated":"2023-08-15T13:34:25Z","published":"2023-08-11T12:17:24Z","title":"RIGID: Recurrent GAN Inversion and Editing of Real Face Videos","summary":" GAN inversion is indispensable for applying the powerful editability of GAN\nto real images. However, existing methods invert video frames individually\noften leading to undesired inconsistent results over time. In this paper, we\npropose a unified recurrent framework, named \\textbf{R}ecurrent v\\textbf{I}deo\n\\textbf{G}AN \\textbf{I}nversion and e\\textbf{D}iting (RIGID), to explicitly and\nsimultaneously enforce temporally coherent GAN inversion and facial editing of\nreal videos. Our approach models the temporal relations between current and\nprevious frames from three aspects. To enable a faithful real video\nreconstruction, we first maximize the inversion fidelity and consistency by\nlearning a temporal compensated latent code. Second, we observe incoherent\nnoises lie in the high-frequency domain that can be disentangled from the\nlatent space. Third, to remove the inconsistency after attribute manipulation,\nwe propose an \\textit{in-between frame composition constraint} such that the\narbitrary frame must be a direct composite of its neighboring frames. 
Our\nunified framework learns the inherent coherence between input frames in an\nend-to-end manner, and therefore it is agnostic to a specific attribute and can\nbe applied to arbitrary editing of the same video without re-training.\nExtensive experiments demonstrate that RIGID outperforms state-of-the-art\nmethods qualitatively and quantitatively in both inversion and editing tasks.\nThe deliverables can be found in \\url{https://cnnlstm.github.io/RIGID}\n","authors":["Yangyang Xu","Shengfeng He","Kwan-Yee K. Wong","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2308.06097v2.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2211.16198v4","updated":"2023-08-15T13:31:15Z","published":"2022-11-28T16:48:41Z","title":"SuS-X: Training-Free Name-Only Transfer of Vision-Language Models","summary":" Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet\neffective way to train large-scale vision-language models. CLIP demonstrates\nimpressive zero-shot classification and retrieval on diverse downstream tasks.\nHowever, to leverage its full potential, fine-tuning still appears to be\nnecessary. Fine-tuning the entire CLIP model can be resource-intensive and\nunstable. Moreover, recent methods that aim to circumvent this need for\nfine-tuning still require access to images from the target distribution. In\nthis paper, we pursue a different approach and explore the regime of\ntraining-free \"name-only transfer\" in which the only knowledge we possess about\nthe downstream task comprises the names of downstream target categories. We\npropose a novel method, SuS-X, consisting of two key building blocks -- SuS and\nTIP-X, that requires neither intensive fine-tuning nor costly labelled data.\nSuS-X achieves state-of-the-art zero-shot classification results on 19\nbenchmark datasets. We further show the utility of TIP-X in the training-free\nfew-shot setting, where we again achieve state-of-the-art results over strong\ntraining-free baselines. Code is available at\nhttps://github.com/vishaal27/SuS-X.\n","authors":["Vishaal Udandarao","Ankush Gupta","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2211.16198v4.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2308.07751v1","updated":"2023-08-15T13:09:33Z","published":"2023-08-15T13:09:33Z","title":"CASPNet++: Joint Multi-Agent Motion Prediction","summary":" The prediction of road users' future motion is a critical task in supporting\nadvanced driver-assistance systems (ADAS). It plays an even more crucial role\nfor autonomous driving (AD) in enabling the planning and execution of safe\ndriving maneuvers. Based on our previous work, Context-Aware Scene Prediction\nNetwork (CASPNet), an improved system, CASPNet++, is proposed. In this work, we\nfocus on further enhancing the interaction modeling and scene understanding to\nsupport the joint prediction of all road users in a scene using spatiotemporal\ngrids to model future occupancy. Moreover, an instance-based output head is\nintroduced to provide multi-modal trajectories for agents of interest. In\nextensive quantitative and qualitative analysis, we demonstrate the scalability\nof CASPNet++ in utilizing and fusing diverse environmental input sources such\nas HD maps, Radar detection, and Lidar segmentation. Tested on the\nurban-focused prediction dataset nuScenes, CASPNet++ reaches state-of-the-art\nperformance. 
The model has been deployed in a testing vehicle, running in\nreal-time with moderate computational resources.\n","authors":["Maximilian Schäfer","Kun Zhao","Anton Kummert"],"pdf_url":"https://arxiv.org/pdf/2308.07751v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2104.05642v7","updated":"2023-08-15T13:03:12Z","published":"2021-04-12T17:03:42Z","title":"Common Limitations of Image Processing Metrics: A Picture Story","summary":" While the importance of automatic image analysis is continuously increasing,\nrecent meta-research revealed major flaws with respect to algorithm validation.\nPerformance metrics are particularly key for meaningful, objective, and\ntransparent performance assessment and validation of the used automatic\nalgorithms, but relatively little attention has been given to the practical\npitfalls when using specific metrics for a given image analysis task. These are\ntypically related to (1) the disregard of inherent metric properties, such as\nthe behaviour in the presence of class imbalance or small target structures,\n(2) the disregard of inherent data set properties, such as the non-independence\nof the test cases, and (3) the disregard of the actual biomedical domain\ninterest that the metrics should reflect. This living dynamically document has\nthe purpose to illustrate important limitations of performance metrics commonly\napplied in the field of image analysis. In this context, it focuses on\nbiomedical image analysis problems that can be phrased as image-level\nclassification, semantic segmentation, instance segmentation, or object\ndetection task. The current version is based on a Delphi process on metrics\nconducted by an international consortium of image analysis experts from more\nthan 60 institutions worldwide.\n","authors":["Annika Reinke","Minu D. Tizabi","Carole H. Sudre","Matthias Eisenmann","Tim Rädsch","Michael Baumgartner","Laura Acion","Michela Antonelli","Tal Arbel","Spyridon Bakas","Peter Bankhead","Arriel Benis","Matthew Blaschko","Florian Büttner","M. Jorge Cardoso","Jianxu Chen","Veronika Cheplygina","Evangelia Christodoulou","Beth Cimini","Gary S. Collins","Sandy Engelhardt","Keyvan Farahani","Luciana Ferrer","Adrian Galdran","Bram van Ginneken","Ben Glocker","Patrick Godau","Robert Haase","Fred Hamprecht","Daniel A. Hashimoto","Doreen Heckmann-Nötzel","Peter Hirsch","Michael M. Hoffman","Merel Huisman","Fabian Isensee","Pierre Jannin","Charles E. Kahn","Dagmar Kainmueller","Bernhard Kainz","Alexandros Karargyris","Alan Karthikesalingam","A. Emre Kavur","Hannes Kenngott","Jens Kleesiek","Andreas Kleppe","Sven Kohler","Florian Kofler","Annette Kopp-Schneider","Thijs Kooi","Michal Kozubek","Anna Kreshuk","Tahsin Kurc","Bennett A. Landman","Geert Litjens","Amin Madani","Klaus Maier-Hein","Anne L. Martel","Peter Mattson","Erik Meijering","Bjoern Menze","David Moher","Karel G. M. Moons","Henning Müller","Brennan Nichyporuk","Felix Nickel","M. Alican Noyan","Jens Petersen","Gorkem Polat","Susanne M. Rafelski","Nasir Rajpoot","Mauricio Reyes","Nicola Rieke","Michael Riegler","Hassan Rivaz","Julio Saez-Rodriguez","Clara I. Sánchez","Julien Schroeter","Anindo Saha","M. Alper Selver","Lalith Sharan","Shravya Shetty","Maarten van Smeden","Bram Stieltjes","Ronald M. Summers","Abdel A. Taha","Aleksei Tiulpin","Sotirios A. Tsaftaris","Ben Van Calster","Gaël Varoquaux","Manuel Wiesenfarth","Ziv R. 
Yaniv","Paul Jäger","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2104.05642v7.pdf","comment":"Shared first authors: Annika Reinke and Minu D. Tizabi. This is a\n dynamic paper on limitations of commonly used metrics. It discusses metrics\n for image-level classification, semantic and instance segmentation, and\n object detection. For missing use cases, comments or questions, please\n contact a.reinke@dkfz.de. Substantial contributions to this document will be\n acknowledged with a co-authorship"},{"id":"http://arxiv.org/abs/2308.07749v1","updated":"2023-08-15T13:00:42Z","published":"2023-08-15T13:00:42Z","title":"Dancing Avatar: Pose and Text-Guided Human Motion Videos Synthesis with\n Image Diffusion Model","summary":" The rising demand for creating lifelike avatars in the digital realm has led\nto an increased need for generating high-quality human videos guided by textual\ndescriptions and poses. We propose Dancing Avatar, designed to fabricate human\nmotion videos driven by poses and textual cues. Our approach employs a\npretrained T2I diffusion model to generate each video frame in an\nautoregressive fashion. The crux of innovation lies in our adept utilization of\nthe T2I diffusion model for producing video frames successively while\npreserving contextual relevance. We surmount the hurdles posed by maintaining\nhuman character and clothing consistency across varying poses, along with\nupholding the background's continuity amidst diverse human movements. To ensure\nconsistent human appearances across the entire video, we devise an intra-frame\nalignment module. This module assimilates text-guided synthesized human\ncharacter knowledge into the pretrained T2I diffusion model, synergizing\ninsights from ChatGPT. For preserving background continuity, we put forth a\nbackground alignment pipeline, amalgamating insights from segment anything and\nimage inpainting techniques. Furthermore, we propose an inter-frame alignment\nmodule that draws inspiration from an auto-regressive pipeline to augment\ntemporal consistency between adjacent frames, where the preceding frame guides\nthe synthesis process of the current frame. Comparisons with state-of-the-art\nmethods demonstrate that Dancing Avatar exhibits the capacity to generate human\nvideos with markedly superior quality, both in terms of human and background\nfidelity, as well as temporal coherence compared to existing state-of-the-art\napproaches.\n","authors":["Bosheng Qin","Wentao Ye","Qifan Yu","Siliang Tang","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2308.07749v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2212.11613v4","updated":"2023-08-15T12:58:33Z","published":"2022-12-22T11:17:57Z","title":"DDColor: Towards Photo-Realistic Image Colorization via Dual Decoders","summary":" Image colorization is a challenging problem due to multi-modal uncertainty\nand high ill-posedness. Directly training a deep neural network usually leads\nto incorrect semantic colors and low color richness. While transformer-based\nmethods can deliver better results, they often rely on manually designed\npriors, suffer from poor generalization ability, and introduce color bleeding\neffects. To address these issues, we propose DDColor, an end-to-end method with\ndual decoders for image colorization. Our approach includes a pixel decoder and\na query-based color decoder. 
The former restores the spatial resolution of the\nimage, while the latter utilizes rich visual features to refine color queries,\nthus avoiding hand-crafted priors. Our two decoders work together to establish\ncorrelations between color and multi-scale semantic representations via\ncross-attention, significantly alleviating the color bleeding effect.\nAdditionally, a simple yet effective colorfulness loss is introduced to enhance\nthe color richness. Extensive experiments demonstrate that DDColor achieves\nsuperior performance to existing state-of-the-art works both quantitatively and\nqualitatively. The codes and models are publicly available at\nhttps://github.com/piddnad/DDColor.\n","authors":["Xiaoyang Kang","Tao Yang","Wenqi Ouyang","Peiran Ren","Lingzhi Li","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2212.11613v4.pdf","comment":"ICCV 2023; Code: https://github.com/piddnad/DDColor"},{"id":"http://arxiv.org/abs/2308.07748v1","updated":"2023-08-15T12:58:06Z","published":"2023-08-15T12:58:06Z","title":"Exploiting Sparsity in Automotive Radar Object Detection Networks","summary":" Having precise perception of the environment is crucial for ensuring the\nsecure and reliable functioning of autonomous driving systems. Radar object\ndetection networks are one fundamental part of such systems. CNN-based object\ndetectors showed good performance in this context, but they require large\ncompute resources. This paper investigates sparse convolutional object\ndetection networks, which combine powerful grid-based detection with low\ncompute resources. We investigate radar specific challenges and propose sparse\nkernel point pillars (SKPP) and dual voxel point convolutions (DVPC) as\nremedies for the grid rendering and sparse backbone architectures. We evaluate\nour SKPP-DPVCN architecture on nuScenes, which outperforms the baseline by\n5.89% and the previous state of the art by 4.19% in Car AP4.0. Moreover,\nSKPP-DPVCN reduces the average scale error (ASE) by 21.41% over the baseline.\n","authors":["Marius Lippke","Maurice Quach","Sascha Braun","Daniel Köhler","Michael Ulrich","Bastian Bischoff","Wei Yap Tan"],"pdf_url":"https://arxiv.org/pdf/2308.07748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07743v1","updated":"2023-08-15T12:50:06Z","published":"2023-08-15T12:50:06Z","title":"ChartDETR: A Multi-shape Detection Network for Visual Chart Recognition","summary":" Visual chart recognition systems are gaining increasing attention due to the\ngrowing demand for automatically identifying table headers and values from\nchart images. Current methods rely on keypoint detection to estimate data\nelement shapes in charts but suffer from grouping errors in post-processing. To\naddress this issue, we propose ChartDETR, a transformer-based multi-shape\ndetector that localizes keypoints at the corners of regular shapes to\nreconstruct multiple data elements in a single chart image. Our method predicts\nall data element shapes at once by introducing query groups in set prediction,\neliminating the need for further postprocessing. This property allows ChartDETR\nto serve as a unified framework capable of representing various chart types\nwithout altering the network architecture, effectively detecting data elements\nof diverse shapes. 
We evaluated ChartDETR on three datasets, achieving\ncompetitive results across all chart types without any additional enhancements.\nFor example, ChartDETR achieved an F1 score of 0.98 on Adobe Synthetic,\nsignificantly outperforming the previous best model with a 0.71 F1 score.\nAdditionally, we obtained a new state-of-the-art result of 0.97 on\nExcelChart400k. The code will be made publicly available.\n","authors":["Wenyuan Xue","Dapeng Chen","Baosheng Yu","Yifei Chen","Sai Zhou","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2308.07743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03251v2","updated":"2023-08-15T12:31:33Z","published":"2023-04-06T17:36:23Z","title":"SALUDA: Surface-based Automotive Lidar Unsupervised Domain Adaptation","summary":" Learning models on one labeled dataset that generalize well on another domain\nis a difficult task, as several shifts might happen between the data domains.\nThis is notably the case for lidar data, for which models can exhibit large\nperformance discrepancies due for instance to different lidar patterns or\nchanges in acquisition conditions. This paper addresses the corresponding\nUnsupervised Domain Adaptation (UDA) task for semantic segmentation. To\nmitigate this problem, we introduce an unsupervised auxiliary task of learning\nan implicit underlying surface representation simultaneously on source and\ntarget data. As both domains share the same latent representation, the model is\nforced to accommodate discrepancies between the two sources of data. This novel\nstrategy differs from classical minimization of statistical divergences or\nlidar-specific domain adaptation techniques. Our experiments demonstrate that\nour method achieves a better performance than the current state of the art,\nboth in real-to-real and synthetic-to-real scenarios.\n","authors":["Bjoern Michele","Alexandre Boulch","Gilles Puy","Tuan-Hung Vu","Renaud Marlet","Nicolas Courty"],"pdf_url":"https://arxiv.org/pdf/2304.03251v2.pdf","comment":"Project repository: github.com/valeoai/SALUDA"},{"id":"http://arxiv.org/abs/2308.07737v1","updated":"2023-08-15T12:30:22Z","published":"2023-08-15T12:30:22Z","title":"Identity-Consistent Aggregation for Video Object Detection","summary":" In Video Object Detection (VID), a common practice is to leverage the rich\ntemporal contexts from the video to enhance the object representations in each\nframe. Existing methods treat the temporal contexts obtained from different\nobjects indiscriminately and ignore their different identities. While\nintuitively, aggregating local views of the same object in different frames may\nfacilitate a better understanding of the object. Thus, in this paper, we aim to\nenable the model to focus on the identity-consistent temporal contexts of each\nobject to obtain more comprehensive object representations and handle the rapid\nobject appearance variations such as occlusion, motion blur, etc. However,\nrealizing this goal on top of existing VID models faces low-efficiency problems\ndue to their redundant region proposals and nonparallel frame-wise prediction\nmanner. To aid this, we propose ClipVID, a VID model equipped with\nIdentity-Consistent Aggregation (ICA) layers specifically designed for mining\nfine-grained and identity-consistent temporal contexts. It effectively reduces\nthe redundancies through the set prediction strategy, making the ICA layers\nvery efficient and further allowing us to design an architecture that makes\nparallel clip-wise predictions for the whole video clip. 
Extensive experimental\nresults demonstrate the superiority of our method: a state-of-the-art (SOTA)\nperformance (84.7% mAP) on the ImageNet VID dataset while running at a speed\nabout 7x faster (39.3 fps) than previous SOTAs.\n","authors":["Chaorui Deng","Da Chen","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.07737v1.pdf","comment":"to be appeared at ICCV2023"},{"id":"http://arxiv.org/abs/2308.07733v1","updated":"2023-08-15T12:17:46Z","published":"2023-08-15T12:17:46Z","title":"Dynamic Low-Rank Instance Adaptation for Universal Neural Image\n Compression","summary":" The latest advancements in neural image compression show great potential in\nsurpassing the rate-distortion performance of conventional standard codecs.\nNevertheless, there exists an indelible domain gap between the datasets\nutilized for training (i.e., natural images) and those utilized for inference\n(e.g., artistic images). Our proposal involves a low-rank adaptation approach\naimed at addressing the rate-distortion drop observed in out-of-domain\ndatasets. Specifically, we perform low-rank matrix decomposition to update\ncertain adaptation parameters of the client's decoder. These updated\nparameters, along with image latents, are encoded into a bitstream and\ntransmitted to the decoder in practical scenarios. Due to the low-rank\nconstraint imposed on the adaptation parameters, the resulting bit rate\noverhead is small. Furthermore, the bit rate allocation of low-rank adaptation\nis \\emph{non-trivial}, considering the diverse inputs require varying\nadaptation bitstreams. We thus introduce a dynamic gating network on top of the\nlow-rank adaptation method, in order to decide which decoder layer should\nemploy adaptation. The dynamic adaptation network is optimized end-to-end using\nrate-distortion loss. Our proposed method exhibits universality across diverse\nimage datasets. Extensive results demonstrate that this paradigm significantly\nmitigates the domain gap, surpassing non-adaptive methods with an average\nBD-rate improvement of approximately $19\\%$ across out-of-domain images.\nFurthermore, it outperforms the most advanced instance adaptive methods by\nroughly $5\\%$ BD-rate. Ablation studies confirm our method's ability to\nuniversally enhance various image compression architectures.\n","authors":["Yue Lv","Jinxi Xiang","Jun Zhang","Wenming Yang","Xiao Han","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2308.07733v1.pdf","comment":"Accepted by ACM MM 2023, 13 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.07732v1","updated":"2023-08-15T12:13:44Z","published":"2023-08-15T12:13:44Z","title":"UniTR: A Unified and Efficient Multi-Modal Transformer for\n Bird's-Eye-View Representation","summary":" Jointly processing information from multiple sensors is crucial to achieving\naccurate and robust perception for reliable autonomous driving systems.\nHowever, current 3D perception research follows a modality-specific paradigm,\nleading to additional computation overheads and inefficient collaboration\nbetween different sensor data. In this paper, we present an efficient\nmulti-modal backbone for outdoor 3D perception named UniTR, which processes a\nvariety of modalities with unified modeling and shared parameters. 
Unlike\nprevious works, UniTR introduces a modality-agnostic transformer encoder to\nhandle these view-discrepant sensor data for parallel modal-wise representation\nlearning and automatic cross-modal interaction without additional fusion steps.\nMore importantly, to make full use of these complementary sensor types, we\npresent a novel multi-modal integration strategy by both considering\nsemantic-abundant 2D perspective and geometry-aware 3D sparse neighborhood\nrelations. UniTR is also a fundamentally task-agnostic backbone that naturally\nsupports different 3D perception tasks. It sets a new state-of-the-art\nperformance on the nuScenes benchmark, achieving +1.1 NDS higher for 3D object\ndetection and +12.0 higher mIoU for BEV map segmentation with lower inference\nlatency. Code will be available at https://github.com/Haiyang-W/UniTR .\n","authors":["Haiyang Wang","Hao Tang","Shaoshuai Shi","Aoxue Li","Zhenguo Li","Bernt Schiele","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07732v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.07731v1","updated":"2023-08-15T12:11:33Z","published":"2023-08-15T12:11:33Z","title":"Context-Aware Pseudo-Label Refinement for Source-Free Domain Adaptive\n Fundus Image Segmentation","summary":" In the domain adaptation problem, source data may be unavailable to the\ntarget client side due to privacy or intellectual property issues. Source-free\nunsupervised domain adaptation (SF-UDA) aims at adapting a model trained on the\nsource side to align the target distribution with only the source model and\nunlabeled target data. The source model usually produces noisy and\ncontext-inconsistent pseudo-labels on the target domain, i.e., neighbouring\nregions that have a similar visual appearance are annotated with different\npseudo-labels. This observation motivates us to refine pseudo-labels with\ncontext relations. Another observation is that features of the same class tend\nto form a cluster despite the domain gap, which implies context relations can\nbe readily calculated from feature distances. To this end, we propose a\ncontext-aware pseudo-label refinement method for SF-UDA. Specifically, a\ncontext-similarity learning module is developed to learn context relations.\nNext, pseudo-label revision is designed utilizing the learned context\nrelations. Further, we propose calibrating the revised pseudo-labels to\ncompensate for wrong revision caused by inaccurate context relations.\nAdditionally, we adopt a pixel-level and class-level denoising scheme to select\nreliable pseudo-labels for domain adaptation. Experiments on cross-domain\nfundus images indicate that our approach yields the state-of-the-art results.\nCode is available at https://github.com/xmed-lab/CPR.\n","authors":["Zheang Huai","Xinpeng Ding","Yi Li","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2308.07731v1.pdf","comment":"Accepted by MICCAI 2023, 11 pages"},{"id":"http://arxiv.org/abs/2308.07717v1","updated":"2023-08-15T11:50:57Z","published":"2023-08-15T11:50:57Z","title":"Real-time Automatic M-mode Echocardiography Measurement with Panel\n Attention from Local-to-Global Pixels","summary":" Motion mode (M-mode) recording is an essential part of echocardiography to\nmeasure cardiac dimension and function. 
However, the current diagnosis cannot\nbuild an automatic scheme, as there are three fundamental obstructs: Firstly,\nthere is no open dataset available to build the automation for ensuring\nconstant results and bridging M-mode echocardiography with real-time instance\nsegmentation (RIS); Secondly, the examination is involving the time-consuming\nmanual labelling upon M-mode echocardiograms; Thirdly, as objects in\nechocardiograms occupy a significant portion of pixels, the limited receptive\nfield in existing backbones (e.g., ResNet) composed from multiple convolution\nlayers are inefficient to cover the period of a valve movement. Existing\nnon-local attentions (NL) compromise being unable real-time with a high\ncomputation overhead or losing information from a simplified version of the\nnon-local block. Therefore, we proposed RAMEM, a real-time automatic M-mode\nechocardiography measurement scheme, contributes three aspects to answer the\nproblems: 1) provide MEIS, a dataset of M-mode echocardiograms for instance\nsegmentation, to enable consistent results and support the development of an\nautomatic scheme; 2) propose panel attention, local-to-global efficient\nattention by pixel-unshuffling, embedding with updated UPANets V2 in a RIS\nscheme toward big object detection with global receptive field; 3) develop and\nimplement AMEM, an efficient algorithm of automatic M-mode echocardiography\nmeasurement enabling fast and accurate automatic labelling among diagnosis. The\nexperimental results show that RAMEM surpasses existing RIS backbones (with\nnon-local attention) in PASCAL 2012 SBD and human performances in real-time\nMEIS tested. The code of MEIS and dataset are available at\nhttps://github.com/hanktseng131415go/RAME.\n","authors":["Ching-Hsun Tseng","Shao-Ju Chien","Po-Shen Wang","Shin-Jye Lee","Wei-Huan Hu","Bin Pu","Xiao-jun Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.07717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07706v1","updated":"2023-08-15T11:28:21Z","published":"2023-08-15T11:28:21Z","title":"Exploring Transfer Learning in Medical Image Segmentation using\n Vision-Language Models","summary":" Medical Image Segmentation is crucial in various clinical applications within\nthe medical domain. While state-of-the-art segmentation models have proven\neffective, integrating textual guidance to enhance visual features for this\ntask remains an area with limited progress. Existing segmentation models that\nutilize textual guidance are primarily trained on open-domain images, raising\nconcerns about their direct applicability in the medical domain without manual\nintervention or fine-tuning.\n To address these challenges, we propose using multimodal vision-language\nmodels for capturing semantic information from image descriptions and images,\nenabling the segmentation of diverse medical images. This study comprehensively\nevaluates existing vision language models across multiple datasets to assess\ntheir transferability from the open domain to the medical field. Furthermore,\nwe introduce variations of image descriptions for previously unseen images in\nthe dataset, revealing notable variations in model performance based on the\ngenerated prompts.\n Our findings highlight the distribution shift between the open-domain images\nand the medical domain and show that the segmentation models trained on\nopen-domain images are not directly transferrable to the medical field. 
But\ntheir performance can be increased by finetuning them in the medical datasets.\nWe report the zero-shot and finetuned segmentation performance of 4 Vision\nLanguage Models (VLMs) on 11 medical datasets using 9 types of prompts derived\nfrom 14 attributes.\n","authors":["Kanchan Poudel","Manish Dhakal","Prasiddha Bhandari","Rabin Adhikari","Safal Thapaliya","Bishesh Khanal"],"pdf_url":"https://arxiv.org/pdf/2308.07706v1.pdf","comment":"25 pages, 9 figures"},{"id":"http://arxiv.org/abs/2307.15958v2","updated":"2023-08-15T11:26:36Z","published":"2023-07-29T11:18:23Z","title":"XMem++: Production-level Video Segmentation From Few Annotated Frames","summary":" Despite advancements in user-guided video segmentation, extracting complex\nobjects consistently for highly complex scenes is still a labor-intensive task,\nespecially for production. It is not uncommon that a majority of frames need to\nbe annotated. We introduce a novel semi-supervised video object segmentation\n(SSVOS) model, XMem++, that improves existing memory-based models, with a\npermanent memory module. Most existing methods focus on single frame\nannotations, while our approach can effectively handle multiple user-selected\nframes with varying appearances of the same object or region. Our method can\nextract highly consistent results while keeping the required number of frame\nannotations low. We further introduce an iterative and attention-based frame\nsuggestion mechanism, which computes the next best frame for annotation. Our\nmethod is real-time and does not require retraining after each user input. We\nalso introduce a new dataset, PUMaVOS, which covers new challenging use cases\nnot found in previous benchmarks. We demonstrate SOTA performance on\nchallenging (partial and multi-class) segmentation scenarios as well as long\nvideos, while ensuring significantly fewer frame annotations than any existing\nmethod. Project page: https://max810.github.io/xmem2-project-page/\n","authors":["Maksym Bekuzarov","Ariana Bermudez","Joon-Young Lee","Hao Li"],"pdf_url":"https://arxiv.org/pdf/2307.15958v2.pdf","comment":"Accepted to ICCV 2023. 18 pages, 16 figures"},{"id":"http://arxiv.org/abs/2212.13925v2","updated":"2023-08-15T10:49:26Z","published":"2022-12-25T14:49:37Z","title":"Quality at the Tail","summary":" Benchmarking and evaluating deep learning models and systems necessitate a\nmeticulous approach to ensure comprehensive assessment. In practical\napplications, it is paramount to consider both the inference quality and the\ninference time, particularly within critical contexts, where stringent\nrequirements demand the simultaneous satisfaction of both metrics. Neglecting\neither aspect can result in severe and irreversible consequences, including\nloss of human life and property damage. Unfortunately, many studies lack a\ncomprehensive consideration of these metrics, often conducted under ideal or\npermissive conditions, thereby leading to incomplete or non-intuitive\nevaluation methodologies.\n This study reveals that deep learning inference quality exhibits\nfluctuations, which further introduces complications and challenges to the\nbenchmarking and evaluation. To better characterize the phenomenon, the concept\nof \"tail quality\" is introduced, which indicates the quality at the tail of\ndistributions. \"Tail quality\" can offer a more objective evaluation, overcoming\nthe limitations of conventional inference quality and inference time metrics in\ncapturing the quality fluctuation phenomenon. 
To capture the phenomenon, this\npaper also proposes a pioneering evaluation framework for comprehensive\nassessment and analysis of various factors affecting inference time and\nquality. Leveraging this framework enables the anticipation of the potential\ndistribution of inference time and inference quality, thus capturing \"tail\nquality\" before practically applying deep learning. The effectiveness of the\nevaluation framework is validated through experiments conducted on deep\nlearning models for three different tasks across four systems. Furthermore,\nemploying this evaluation framework, the experiments conducted a preliminary\nanalysis of several factors influencing inference quality and inference time.\n","authors":["Zhengxin Yang","Wanling Gao","Chunjie Luo","Lei Wang","Fei Tang","Xu Wen","Jianfeng Zhan"],"pdf_url":"https://arxiv.org/pdf/2212.13925v2.pdf","comment":"11 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.07688v1","updated":"2023-08-15T10:37:13Z","published":"2023-08-15T10:37:13Z","title":"Enhancing Network Initialization for Medical AI Models Using\n Large-Scale, Unlabeled Natural Images","summary":" Pre-training datasets, like ImageNet, have become the gold standard in\nmedical image analysis. However, the emergence of self-supervised learning\n(SSL), which leverages unlabeled data to learn robust features, presents an\nopportunity to bypass the intensive labeling process. In this study, we\nexplored if SSL for pre-training on non-medical images can be applied to chest\nradiographs and how it compares to supervised pre-training on non-medical\nimages and on medical images. We utilized a vision transformer and initialized\nits weights based on (i) SSL pre-training on natural images (DINOv2), (ii) SL\npre-training on natural images (ImageNet dataset), and (iii) SL pre-training on\nchest radiographs from the MIMIC-CXR database. We tested our approach on over\n800,000 chest radiographs from six large global datasets, diagnosing more than\n20 different imaging findings. Our SSL pre-training on curated images not only\noutperformed ImageNet-based pre-training (P<0.001 for all datasets) but, in\ncertain cases, also exceeded SL on the MIMIC-CXR dataset. Our findings suggest\nthat selecting the right pre-training strategy, especially with SSL, can be\npivotal for improving artificial intelligence (AI)'s diagnostic accuracy in\nmedical imaging. By demonstrating the promise of SSL in chest radiograph\nanalysis, we underline a transformative shift towards more efficient and\naccurate AI models in medical imaging.\n","authors":["Soroosh Tayebi Arasteh","Leo Misera","Jakob Nikolas Kather","Daniel Truhn","Sven Nebelung"],"pdf_url":"https://arxiv.org/pdf/2308.07688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07687v1","updated":"2023-08-15T10:37:04Z","published":"2023-08-15T10:37:04Z","title":"DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using\n Pre-trained Diffusion Models","summary":" Given a classifier, the inherent property of semantic Out-of-Distribution\n(OOD) samples is that their contents differ from all legal classes in terms of\nsemantics, namely semantic mismatch. There is a recent work that directly\napplies it to OOD detection, which employs a conditional Generative Adversarial\nNetwork (cGAN) to enlarge semantic mismatch in the image space. 
While achieving\nremarkable OOD detection performance on small datasets, it is not applicable to\nImageNet-scale datasets due to the difficulty in training cGANs with both input\nimages and labels as conditions. As diffusion models are much easier to train\nand amenable to various conditions compared to cGANs, in this work, we propose\nto directly use pre-trained diffusion models for semantic mismatch-guided OOD\ndetection, named DiffGuard. Specifically, given an OOD input image and the\npredicted label from the classifier, we try to enlarge the semantic difference\nbetween the reconstructed OOD image under these conditions and the original\ninput image. We also present several test-time techniques to further strengthen\nsuch differences. Experimental results show that DiffGuard is effective on both\nCifar-10 and hard cases of the large-scale ImageNet, and it can be easily\ncombined with existing OOD detection techniques to achieve state-of-the-art OOD\ndetection results.\n","authors":["Ruiyuan Gao","Chenchen Zhao","Lanqing Hong","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.07687v1.pdf","comment":"Accepted by ICCV2023, with supplementary materials"},{"id":"http://arxiv.org/abs/2308.07686v1","updated":"2023-08-15T10:37:03Z","published":"2023-08-15T10:37:03Z","title":"Boosting Multi-modal Model Performance with Adaptive Gradient Modulation","summary":" While the field of multi-modal learning keeps growing fast, the deficiency of\nthe standard joint training paradigm has become clear through recent studies.\nThey attribute the sub-optimal performance of the jointly trained model to the\nmodality competition phenomenon. Existing works attempt to improve the jointly\ntrained model by modulating the training process. Despite their effectiveness,\nthose methods can only apply to late fusion models. More importantly, the\nmechanism of the modality competition remains unexplored. In this paper, we\nfirst propose an adaptive gradient modulation method that can boost the\nperformance of multi-modal models with various fusion strategies. Extensive\nexperiments show that our method surpasses all existing modulation methods.\nFurthermore, to have a quantitative understanding of the modality competition\nand the mechanism behind the effectiveness of our modulation method, we\nintroduce a novel metric to measure the competition strength. This metric is\nbuilt on the mono-modal concept, a function that is designed to represent the\ncompetition-less state of a modality. Through systematic investigation, our\nresults confirm the intuition that the modulation encourages the model to rely\non the more informative modality. In addition, we find that the jointly trained\nmodel typically has a preferred modality on which the competition is weaker\nthan other modalities. However, this preferred modality need not dominate\nothers. Our code will be available at\nhttps://github.com/lihong2303/AGM_ICCV2023.\n","authors":["Hong Li","Xingyu Li","Pengbo Hu","Yinuo Lei","Chunxiao Li","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.07686v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2303.04418v2","updated":"2023-08-15T09:58:08Z","published":"2023-03-08T07:45:06Z","title":"FUSQA: Fetal Ultrasound Segmentation Quality Assessment","summary":" Deep learning models have been effective for various fetal ultrasound\nsegmentation tasks. However, generalization to new unseen data has raised\nquestions about their effectiveness for clinical adoption. 
Normally, a\ntransition to new unseen data requires time-consuming and costly quality\nassurance processes to validate the segmentation performance post-transition.\nSegmentation quality assessment efforts have focused on natural images, where\nthe problem has been typically formulated as a Dice score regression task. In\nthis paper, we propose a simplified Fetal Ultrasound Segmentation Quality\nAssessment (FUSQA) model to tackle the segmentation quality assessment when no\nmasks exist to compare with. We formulate the segmentation quality assessment\nprocess as an automated classification task to distinguish between good and\npoor-quality segmentation masks for more accurate gestational age estimation.\nWe validate the performance of our proposed approach on two datasets we collect\nfrom two hospitals using different ultrasound machines. We compare different\narchitectures, with our best-performing architecture achieving over 90%\nclassification accuracy on distinguishing between good and poor-quality\nsegmentation masks from an unseen dataset. Additionally, there was only a\n1.45-day difference between the gestational age reported by doctors and\nestimated based on CRL measurements using well-segmented masks. On the other\nhand, this difference increased and reached up to 7.73 days when we calculated\nCRL from the poorly segmented masks. As a result, AI-based approaches can\npotentially aid fetal ultrasound segmentation quality assessment and might\ndetect poor segmentation in real-time screening in the future.\n","authors":["Sevim Cengiz","Ibrahim Almakky","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2303.04418v2.pdf","comment":"13 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.07673v1","updated":"2023-08-15T09:43:10Z","published":"2023-08-15T09:43:10Z","title":"A Review of Adversarial Attacks in Computer Vision","summary":" Deep neural networks have been widely used in various downstream tasks,\nespecially in safety-critical scenarios such as autonomous driving, but deep\nnetworks are often threatened by adversarial samples. Such adversarial attacks\ncan be invisible to human eyes, yet they lead to DNN misclassification, often\ntransfer between deep learning and machine learning models, and are achievable\nin the real world. Adversarial attacks can be divided into white-box attacks,\nin which the attacker knows the parameters and gradients of the model, and\nblack-box attacks, in which the attacker can only obtain the inputs and outputs\nof the model. In terms of the attacker's purpose, attacks can be divided into\ntargeted attacks, where the attacker wants the model to misclassify the\noriginal sample into a specified class (a more practical setting), and\nnon-targeted attacks, which only require the model to misclassify the sample.\nThe black-box setting is the scenario most often encountered in practice.\n","authors":["Yutong Zhang","Yao Li","Yin Li","Zhichang Guo"],"pdf_url":"https://arxiv.org/pdf/2308.07673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07126v2","updated":"2023-08-15T09:39:00Z","published":"2023-08-14T13:13:50Z","title":"A Time-aware tensor decomposition for tracking evolving patterns","summary":" Time-evolving data sets can often be arranged as a higher-order tensor with\none of the modes being the time mode. 
While tensor factorizations have been\nsuccessfully used to capture the underlying patterns in such higher-order data\nsets, the temporal aspect is often ignored, allowing for the reordering of time\npoints. In recent studies, temporal regularizers are incorporated in the time\nmode to tackle this issue. Nevertheless, existing approaches still do not allow\nunderlying patterns to change in time (e.g., spatial changes in the brain,\ncontextual changes in topics). In this paper, we propose temporal PARAFAC2\n(tPARAFAC2): a PARAFAC2-based tensor factorization method with temporal\nregularization to extract gradually evolving patterns from temporal data.\nThrough extensive experiments on synthetic data, we demonstrate that tPARAFAC2\ncan capture the underlying evolving patterns accurately performing better than\nPARAFAC2 and coupled matrix factorization with temporal smoothness\nregularization.\n","authors":["Christos Chatzis","Max Pfeffer","Pedro Lind","Evrim Acar"],"pdf_url":"https://arxiv.org/pdf/2308.07126v2.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.05424v2","updated":"2023-08-15T09:37:05Z","published":"2023-05-09T13:15:52Z","title":"Echo from noise: synthetic ultrasound image generation using diffusion\n models for real image segmentation","summary":" We propose a novel pipeline for the generation of synthetic ultrasound images\nvia Denoising Diffusion Probabilistic Models (DDPMs) guided by cardiac semantic\nlabel maps. We show that these synthetic images can serve as a viable\nsubstitute for real data in the training of deep-learning models for ultrasound\nimage analysis tasks such as cardiac segmentation. To demonstrate the\neffectiveness of this approach, we generated synthetic 2D echocardiograms and\ntrained a neural network for segmenting the left ventricle and left atrium. The\nperformance of the network trained on exclusively synthetic images was\nevaluated on an unseen dataset of real images and yielded mean Dice scores of\n88.6 $\\pm 4.91$ , 91.9 $\\pm 4.22$, 85.2 $\\pm 4.83$ \\% for left ventricular\nendocardium, epicardium and left atrial segmentation respectively. This\nrepresents a relative increase of $9.2$, $3.3$ and $13.9$ \\% in Dice scores\ncompared to the previous state-of-the-art. The proposed pipeline has potential\nfor application to a wide range of other tasks across various medical imaging\nmodalities.\n","authors":["David Stojanovski","Uxio Hermida","Pablo Lamata","Arian Beqiri","Alberto Gomez"],"pdf_url":"https://arxiv.org/pdf/2305.05424v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07665v1","updated":"2023-08-15T09:27:57Z","published":"2023-08-15T09:27:57Z","title":"Inversion-by-Inversion: Exemplar-based Sketch-to-Photo Synthesis via\n Stochastic Differential Equations without Training","summary":" Exemplar-based sketch-to-photo synthesis allows users to generate\nphoto-realistic images based on sketches. Recently, diffusion-based methods\nhave achieved impressive performance on image generation tasks, enabling\nhighly-flexible control through text-driven generation or energy functions.\nHowever, generating photo-realistic images with color and texture from sketch\nimages remains challenging for diffusion models. Sketches typically consist of\nonly a few strokes, with most regions left blank, making it difficult for\ndiffusion-based methods to produce photo-realistic images. In this work, we\npropose a two-stage method named ``Inversion-by-Inversion\" for exemplar-based\nsketch-to-photo synthesis. 
This approach includes shape-enhancing inversion and\nfull-control inversion. During the shape-enhancing inversion process, an\nuncolored photo is generated with the guidance of a shape-energy function. This\nstep is essential to ensure control over the shape of the generated photo. In\nthe full-control inversion process, we propose an appearance-energy function to\ncontrol the color and texture of the final generated photo. Importantly, our\nInversion-by-Inversion pipeline is training-free and can accept different types\nof exemplars for color and texture control. We conducted extensive experiments\nto evaluate our proposed method, and the results demonstrate its effectiveness.\n","authors":["Ximing Xing","Chuang Wang","Haitao Zhou","Zhihao Hu","Chongxuan Li","Dong Xu","Qian Yu"],"pdf_url":"https://arxiv.org/pdf/2308.07665v1.pdf","comment":"15 pages, preprint version"},{"id":"http://arxiv.org/abs/2308.07662v1","updated":"2023-08-15T09:25:11Z","published":"2023-08-15T09:25:11Z","title":"Gradient-Based Post-Training Quantization: Challenging the Status Quo","summary":" Quantization has become a crucial step for the efficient deployment of deep\nneural networks, where floating point operations are converted to simpler fixed\npoint operations. In its most naive form, it simply consists of a combination\nof scaling and rounding transformations, leading to either a limited\ncompression rate or a significant accuracy drop. Recently, gradient-based\npost-training quantization (GPTQ) methods appear to constitute a suitable\ntrade-off between such simple methods and more powerful, yet expensive\nQuantization-Aware Training (QAT) approaches, particularly when attempting to\nquantize LLMs, where scalability of the quantization process is of paramount\nimportance. GPTQ essentially consists of learning the rounding operation using\na small calibration set. In this work, we challenge common choices in GPTQ\nmethods. In particular, we show that the process is, to a certain extent,\nrobust to a number of variables (weight selection, feature augmentation, choice\nof calibration set). More importantly, we derive a number of best practices for\ndesigning more efficient and scalable GPTQ methods, regarding the problem\nformulation (loss, degrees of freedom, use of non-uniform quantization schemes)\nor optimization process (choice of variable and optimizer). Lastly, we propose\na novel importance-based mixed-precision technique. These guidelines lead to\nsignificant performance improvements on all the tested state-of-the-art GPTQ\nmethods and networks (e.g. +6.819 points on ViT for 4-bit quantization), paving\nthe way for the design of scalable, yet effective quantization methods.\n","authors":["Edouard Yvinec","Arnaud Dapogny","Kevin Bailly"],"pdf_url":"https://arxiv.org/pdf/2308.07662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12398v4","updated":"2023-08-15T09:05:54Z","published":"2023-03-22T09:06:07Z","title":"Multiscale Attention via Wavelet Neural Operators for Vision\n Transformers","summary":" Transformers have achieved widespread success in computer vision. At their\nheart, there is a Self-Attention (SA) mechanism, an inductive bias that\nassociates each token in the input with every other token through a weighted\nbasis. The standard SA mechanism has quadratic complexity with the sequence\nlength, which impedes its utility for the long sequences appearing in\nhigh-resolution vision. 
Recently, inspired by operator learning for PDEs, Adaptive\nFourier Neural Operators (AFNO) were introduced for high-resolution attention\nbased on global convolution that is efficiently implemented via FFT. However,\nAFNO's global filtering cannot adequately represent the small- and\nmoderate-scale structures that commonly appear in natural images. To leverage\nthese coarse-to-fine scale structures, we introduce Multiscale Wavelet Attention\n(MWA) based on wavelet neural operators, which incurs linear complexity in the\nsequence size. We replace the attention in ViT with MWA, and our experiments on\nCIFAR and Tiny-ImageNet classification demonstrate significant improvement over\nalternative Fourier-based attentions such as AFNO and the Global Filter Network\n(GFN).\n","authors":["Anahita Nekoozadeh","Mohammad Reza Ahmadzadeh","Zahra Mardani"],"pdf_url":"https://arxiv.org/pdf/2303.12398v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07652v1","updated":"2023-08-15T09:00:21Z","published":"2023-08-15T09:00:21Z","title":"Geometry of the Visual Cortex with Applications to Image Inpainting and\n Enhancement","summary":" Equipping the rototranslation group $SE(2)$ with a sub-Riemannian structure\ninspired by the visual cortex V1, we propose algorithms for image inpainting\nand enhancement based on hypoelliptic diffusion. We innovate on previous\nimplementations of the methods by Citti, Sarti and Boscain et al., by proposing\nan alternative that prevents fading and is capable of producing sharper results\nin a procedure that we call WaxOn-WaxOff. We also exploit the sub-Riemannian\nstructure to define a completely new unsharp filter using $SE(2)$, analogous to\nthe classical unsharp filter for 2D image processing, with applications to\nimage enhancement. We demonstrate our method on blood vessel enhancement in\nretinal scans.\n","authors":["Francesco Ballerin","Erlend Grong"],"pdf_url":"https://arxiv.org/pdf/2308.07652v1.pdf","comment":"Associated python package available at\n https://github.com/ballerin/v1diffusion"},{"id":"http://arxiv.org/abs/2308.07650v1","updated":"2023-08-15T08:57:03Z","published":"2023-08-15T08:57:03Z","title":"EQ-Net: Elastic Quantization Neural Networks","summary":" Current model quantization methods have shown their promising capability in\nreducing storage space and computation complexity. However, due to the\ndiversity of quantization forms supported by different hardware, one limitation\nof existing solutions is that they usually require repeated optimization for\ndifferent scenarios. How to construct a model with flexible quantization forms\nhas been less studied. In this paper, we explore a one-shot network\nquantization regime, named Elastic Quantization Neural Networks (EQ-Net), which\naims to train a robust weight-sharing quantization supernet. First of all, we\npropose an elastic quantization space (including elastic bit-width,\ngranularity, and symmetry) to adapt to various mainstream quantization forms.\nSecondly, we propose the Weight Distribution Regularization Loss (WDR-Loss) and\nGroup Progressive Guidance Loss (GPG-Loss) to bridge the inconsistency between\nthe distributions of weights and output logits across the elastic quantization\nspace. Lastly, we incorporate genetic algorithms and the proposed Conditional\nQuantization-Aware Accuracy Predictor (CQAP) as an estimator to quickly search\nmixed-precision quantized neural networks in the supernet. 
Extensive experiments\ndemonstrate that our EQ-Net is close to or even better than its static\ncounterparts as well as state-of-the-art robust bit-width methods. Code can be\navailable at\n\\href{https://github.com/xuke225/EQ-Net.git}{https://github.com/xuke225/EQ-Net}.\n","authors":["Ke Xu","Lei Han","Ye Tian","Shangshang Yang","Xingyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07648v1","updated":"2023-08-15T08:54:25Z","published":"2023-08-15T08:54:25Z","title":"Prompt Switch: Efficient CLIP Adaptation for Text-Video Retrieval","summary":" In text-video retrieval, recent works have benefited from the powerful\nlearning capabilities of pre-trained text-image foundation models (e.g., CLIP)\nby adapting them to the video domain. A critical problem for them is how to\neffectively capture the rich semantics inside the video using the image encoder\nof CLIP. To tackle this, state-of-the-art methods adopt complex cross-modal\nmodeling techniques to fuse the text information into video frame\nrepresentations, which, however, incurs severe efficiency issues in large-scale\nretrieval systems as the video representations must be recomputed online for\nevery text query. In this paper, we discard this problematic cross-modal fusion\nprocess and aim to learn semantically-enhanced representations purely from the\nvideo, so that the video representations can be computed offline and reused for\ndifferent texts. Concretely, we first introduce a spatial-temporal \"Prompt\nCube\" into the CLIP image encoder and iteratively switch it within the encoder\nlayers to efficiently incorporate the global video semantics into frame\nrepresentations. We then propose to apply an auxiliary video captioning\nobjective to train the frame representations, which facilitates the learning of\ndetailed video semantics by providing fine-grained guidance in the semantic\nspace. With a naive temporal fusion strategy (i.e., mean-pooling) on the\nenhanced frame representations, we obtain state-of-the-art performances on\nthree benchmark datasets, i.e., MSR-VTT, MSVD, and LSMDC.\n","authors":["Chaorui Deng","Qi Chen","Pengda Qin","Da Chen","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.07648v1.pdf","comment":"to be appeared in ICCV2023"},{"id":"http://arxiv.org/abs/2209.14770v2","updated":"2023-08-15T08:49:10Z","published":"2022-09-29T13:28:34Z","title":"R2C-GAN: Restore-to-Classify GANs for Blind X-Ray Restoration and\n COVID-19 Classification","summary":" Restoration of poor quality images with a blended set of artifacts plays a\nvital role for a reliable diagnosis. Existing studies have focused on specific\nrestoration problems such as image deblurring, denoising, and exposure\ncorrection where there is usually a strong assumption on the artifact type and\nseverity. As a pioneer study in blind X-ray restoration, we propose a joint\nmodel for generic image restoration and classification: Restore-to-Classify\nGenerative Adversarial Networks (R2C-GANs). Such a jointly optimized model\nkeeps any disease intact after the restoration. Therefore, this will naturally\nlead to a higher diagnosis performance thanks to the improved X-ray image\nquality. To accomplish this crucial objective, we define the restoration task\nas an Image-to-Image translation problem from poor quality having noisy,\nblurry, or over/under-exposed images to high quality image domain. 
The proposed\nR2C-GAN model is able to learn forward and inverse transforms between the two\ndomains using unpaired training samples. Simultaneously, the joint\nclassification preserves the disease label during restoration. Moreover, the\nR2C-GANs are equipped with operational layers/neurons reducing the network\ndepth and further boosting both restoration and classification performances.\nThe proposed joint model is extensively evaluated over the QaTa-COV19 dataset\nfor Coronavirus Disease 2019 (COVID-19) classification. The proposed\nrestoration approach achieves over 90% F1-Score which is significantly higher\nthan the performance of any deep model. Moreover, in the qualitative analysis,\nthe restoration performance of R2C-GANs is approved by a group of medical\ndoctors. We share the software implementation at\nhttps://github.com/meteahishali/R2C-GAN.\n","authors":["Mete Ahishali","Aysen Degerli","Serkan Kiranyaz","Tahir Hamid","Rashid Mazhar","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2209.14770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15199v2","updated":"2023-08-15T08:30:45Z","published":"2023-07-27T21:14:46Z","title":"PromptStyler: Prompt-driven Style Generation for Source-free Domain\n Generalization","summary":" In a joint vision-language space, a text feature (e.g., from \"a photo of a\ndog\") could effectively represent its relevant image features (e.g., from dog\nphotos). Also, a recent study has demonstrated the cross-modal transferability\nphenomenon of this joint space. From these observations, we propose\nPromptStyler which simulates various distribution shifts in the joint space by\nsynthesizing diverse styles via prompts without using any images to deal with\nsource-free domain generalization. The proposed method learns to generate a\nvariety of style features (from \"a S* style of a\") via learnable style word\nvectors for pseudo-words S*. To ensure that learned styles do not distort\ncontent information, we force style-content features (from \"a S* style of a\n[class]\") to be located nearby their corresponding content features (from\n\"[class]\") in the joint vision-language space. After learning style word\nvectors, we train a linear classifier using synthesized style-content features.\nPromptStyler achieves the state of the art on PACS, VLCS, OfficeHome and\nDomainNet, even though it does not require any images for training.\n","authors":["Junhyeong Cho","Gilhyun Nam","Sungyeon Kim","Hunmin Yang","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2307.15199v2.pdf","comment":"Accepted to ICCV 2023, Project Page: https://promptstyler.github.io/"},{"id":"http://arxiv.org/abs/2303.05073v2","updated":"2023-08-15T08:27:26Z","published":"2023-03-09T07:11:30Z","title":"Learn More for Food Recognition via Progressive Self-Distillation","summary":" Food recognition has a wide range of applications, such as health-aware\nrecommendation and self-service restaurants. Most previous methods of food\nrecognition firstly locate informative regions in some weakly-supervised\nmanners and then aggregate their features. However, location errors of\ninformative regions limit the effectiveness of these methods to some extent.\nInstead of locating multiple regions, we propose a Progressive\nSelf-Distillation (PSD) method, which progressively enhances the ability of\nnetwork to mine more details for food recognition. The training of PSD\nsimultaneously contains multiple self-distillations, in which a teacher network\nand a student network share the same embedding network. 
Since the student\nnetwork receives a modified image from its teacher network by masking some\ninformative regions, the teacher network outputs stronger semantic\nrepresentations than the student network. Guided by such a teacher network with\nstronger semantics, the student network is encouraged to mine more useful\nregions from the modified image by enhancing its own ability. The ability of\nthe teacher network is also enhanced with the shared embedding network. By\nusing progressive training, the teacher network incrementally improves its\nability to mine more discriminative regions. In the inference phase, only the\nteacher network is used without the help of the student network. Extensive\nexperiments on three datasets demonstrate the effectiveness of our proposed\nmethod and state-of-the-art performance.\n","authors":["Yaohui Zhu","Linhu Liu","Jiang Tian"],"pdf_url":"https://arxiv.org/pdf/2303.05073v2.pdf","comment":"Accepted by AAAI 2023"},{"id":"http://arxiv.org/abs/2307.07710v2","updated":"2023-08-15T08:23:21Z","published":"2023-07-15T04:48:35Z","title":"ExposureDiffusion: Learning to Expose for Low-light Image Enhancement","summary":" Previous raw image-based low-light image enhancement methods predominantly\nrelied on feed-forward neural networks to learn deterministic mappings from\nlow-light to normally-exposed images. However, they failed to capture critical\ndistribution information, leading to visually undesirable results. This work\naddresses the issue by seamlessly integrating a diffusion model with a\nphysics-based exposure model. Different from a vanilla diffusion model that has\nto perform Gaussian denoising, with the injected physics-based exposure model,\nour restoration process can directly start from a noisy image instead of pure\nnoise. As such, our method obtains significantly improved performance and\nreduced inference time compared with vanilla diffusion models. To make full use\nof the advantages of different intermediate steps, we further propose an\nadaptive residual layer that effectively screens out the side-effects of the\niterative refinement when the intermediate results have already been\nwell-exposed. Note that the proposed framework is compatible with real-paired\ndatasets, SOTA real/synthetic noise models, and different backbone networks. We\nevaluate the proposed method on various public benchmarks, achieving promising\nresults with consistent improvements using different exposure models and\nbackbones. Besides, the proposed method achieves better generalization capacity\nfor unseen amplifying ratios and better performance than a larger feedforward\nneural model when few parameters are adopted.\n","authors":["Yufei Wang","Yi Yu","Wenhan Yang","Lanqing Guo","Lap-Pui Chau","Alex C. Kot","Bihan Wen"],"pdf_url":"https://arxiv.org/pdf/2307.07710v2.pdf","comment":"accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.07625v1","updated":"2023-08-15T08:21:20Z","published":"2023-08-15T08:21:20Z","title":"Backpropagation Path Search On Adversarial Transferability","summary":" Deep neural networks are vulnerable to adversarial examples, dictating the\nimperativeness to test the model's robustness before deployment. Transfer-based\nattackers craft adversarial examples against surrogate models and transfer them\nto victim models deployed in the black-box situation. 
To enhance the\nadversarial transferability, structure-based attackers adjust the\nbackpropagation path to avoid the attack from overfitting the surrogate model.\nHowever, existing structure-based attackers fail to explore the convolution\nmodule in CNNs and modify the backpropagation graph heuristically, leading to\nlimited effectiveness. In this paper, we propose backPropagation pAth Search\n(PAS), solving the aforementioned two problems. We first propose SkipConv to\nadjust the backpropagation path of convolution by structural\nreparameterization. To overcome the drawback of heuristically designed\nbackpropagation paths, we further construct a DAG-based search space, utilize\none-step approximation for path evaluation and employ Bayesian Optimization to\nsearch for the optimal path. We conduct comprehensive experiments in a wide\nrange of transfer settings, showing that PAS improves the attack success rate\nby a huge margin for both normally trained and defense models.\n","authors":["Zhuoer Xu","Zhangxuan Gu","Jianping Zhang","Shiwen Cui","Changhua Meng","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07625v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.07624v1","updated":"2023-08-15T08:20:07Z","published":"2023-08-15T08:20:07Z","title":"Self-Prompting Large Vision Models for Few-Shot Medical Image\n Segmentation","summary":" Recent advancements in large foundation models have shown promising potential\nin the medical industry due to their flexible prompting capability. One such\nmodel, the Segment Anything Model (SAM), a prompt-driven segmentation model,\nhas shown remarkable performance improvements, surpassing state-of-the-art\napproaches in medical image segmentation. However, existing methods primarily\nrely on tuning strategies that require extensive data or prior prompts tailored\nto the specific task, making it particularly challenging when only a limited\nnumber of data samples are available. In this paper, we propose a novel\nperspective on self-prompting in medical vision applications. Specifically, we\nharness the embedding space of SAM to prompt itself through a simple yet\neffective linear pixel-wise classifier. By preserving the encoding capabilities\nof the large model, the contextual information from its decoder, and leveraging\nits interactive promptability, we achieve competitive results on multiple\ndatasets (i.e. improvement of more than 15% compared to fine-tuning the mask\ndecoder using a few images).\n","authors":["Qi Wu","Yuyao Zhang","Marawan Elbatel"],"pdf_url":"https://arxiv.org/pdf/2308.07624v1.pdf","comment":"8.5 pages + 2 pages of supplementary materials + 2 pages of\n references, 3 figures, submitted to 5th MICCAI Workshop on Domain Adaptation\n and Representation Transfer (DART)"},{"id":"http://arxiv.org/abs/2304.10465v2","updated":"2023-08-15T08:04:00Z","published":"2023-04-20T17:11:01Z","title":"Implicit Temporal Modeling with Learnable Alignment for Video\n Recognition","summary":" Contrastive language-image pretraining (CLIP) has demonstrated remarkable\nsuccess in various image tasks. However, how to extend CLIP with effective\ntemporal modeling is still an open and crucial problem. Existing factorized or\njoint spatial-temporal modeling trades off between the efficiency and\nperformance. While modeling temporal information within straight through tube\nis widely adopted in literature, we find that simple frame alignment already\nprovides enough essence without temporal attention. 
To this end, in this paper,\nwe proposed a novel Implicit Learnable Alignment (ILA) method, which minimizes\nthe temporal modeling effort while achieving incredibly high performance.\nSpecifically, for a frame pair, an interactive point is predicted in each\nframe, serving as a mutual information rich region. By enhancing the features\naround the interactive point, two frames are implicitly aligned. The aligned\nfeatures are then pooled into a single token, which is leveraged in the\nsubsequent spatial self-attention. Our method allows eliminating the costly or\ninsufficient temporal self-attention in video. Extensive experiments on\nbenchmarks demonstrate the superiority and generality of our module.\nParticularly, the proposed ILA achieves a top-1 accuracy of 88.7% on\nKinetics-400 with much fewer FLOPs compared with Swin-L and ViViT-H. Code is\nreleased at https://github.com/Francis-Rings/ILA .\n","authors":["Shuyuan Tu","Qi Dai","Zuxuan Wu","Zhi-Qi Cheng","Han Hu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2304.10465v2.pdf","comment":"ICCV 2023 oral. 14 pages, 7 figures. Code released at\n https://github.com/Francis-Rings/ILA"},{"id":"http://arxiv.org/abs/2212.02073v3","updated":"2023-08-15T07:55:10Z","published":"2022-12-05T07:37:32Z","title":"Minimum Latency Deep Online Video Stabilization","summary":" We present a novel camera path optimization framework for the task of online\nvideo stabilization. Typically, a stabilization pipeline consists of three\nsteps: motion estimating, path smoothing, and novel view rendering. Most\nprevious methods concentrate on motion estimation, proposing various global or\nlocal motion models. In contrast, path optimization receives relatively less\nattention, especially in the important online setting, where no future frames\nare available. In this work, we adopt recent off-the-shelf high-quality deep\nmotion models for motion estimation to recover the camera trajectory and focus\non the latter two steps. Our network takes a short 2D camera path in a sliding\nwindow as input and outputs the stabilizing warp field of the last frame in the\nwindow, which warps the coming frame to its stabilized position. A hybrid loss\nis well-defined to constrain the spatial and temporal consistency. In addition,\nwe build a motion dataset that contains stable and unstable motion pairs for\nthe training. Extensive experiments demonstrate that our approach significantly\noutperforms state-of-the-art online methods both qualitatively and\nquantitatively and achieves comparable performance to offline methods. Our code\nand dataset are available at https://github.com/liuzhen03/NNDVS\n","authors":["Zhuofan Zhang","Zhen Liu","Ping Tan","Bing Zeng","Shuaicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2212.02073v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07615v1","updated":"2023-08-15T07:51:53Z","published":"2023-08-15T07:51:53Z","title":"Self-supervised Hypergraphs for Learning Multiple World Interpretations","summary":" We present a method for learning multiple scene representations given a small\nlabeled set, by exploiting the relationships between such representations in\nthe form of a multi-task hypergraph. We also show how we can use the hypergraph\nto improve a powerful pretrained VisTransformer model without any additional\nlabeled data. In our hypergraph, each node is an interpretation layer (e.g.,\ndepth or segmentation) of the scene. Within each hyperedge, one or several\ninput nodes predict the layer at the output node. 
Thus, each node could be an\ninput node in some hyperedges and an output node in others. In this way,\nmultiple paths can reach the same node, to form ensembles from which we obtain\nrobust pseudolabels, which allow self-supervised learning in the hypergraph. We\ntest different ensemble models and different types of hyperedges and show\nsuperior performance to other multi-task graph models in the field. We also\nintroduce Dronescapes, a large video dataset captured with UAVs in different\ncomplex real-world scenes, with multiple representations, suitable for\nmulti-task learning.\n","authors":["Alina Marcu","Mihai Pirvu","Dragos Costea","Emanuela Haller","Emil Slusanschi","Ahmed Nabil Belbachir","Rahul Sukthankar","Marius Leordeanu"],"pdf_url":"https://arxiv.org/pdf/2308.07615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12872v4","updated":"2023-08-15T07:48:36Z","published":"2022-11-23T11:26:24Z","title":"μSplit: efficient image decomposition for microscopy data","summary":" We present {\\mu}Split, a dedicated approach for trained image decomposition\nin the context of fluorescence microscopy images. We find that best results\nusing regular deep architectures are achieved when large image patches are used\nduring training, making memory consumption the limiting factor to further\nimproving performance. We therefore introduce lateral contextualization (LC), a\nmemory efficient way to train powerful networks and show that LC leads to\nconsistent and significant improvements on the task at hand. We integrate LC\nwith U-Nets, Hierarchical AEs, and Hierarchical VAEs, for which we formulate a\nmodified ELBO loss. Additionally, LC enables training deeper hierarchical\nmodels than otherwise possible and, interestingly, helps to reduce tiling\nartefacts that are inherently impossible to avoid when using tiled VAE\npredictions. We apply {\\mu}Split to five decomposition tasks, one on a\nsynthetic dataset, four others derived from real microscopy data. LC achieves\nSOTA results (average improvements to the best baseline of 2.36 dB PSNR), while\nsimultaneously requiring considerably less GPU memory.\n","authors":[" Ashesh","Alexander Krull","Moises Di Sante","Francesco Silvio Pasqualini","Florian Jug"],"pdf_url":"https://arxiv.org/pdf/2211.12872v4.pdf","comment":"Published at ICCV 2023. 10 pages, 7 figures, 9 pages supplement, 8\n supplementary figures"},{"id":"http://arxiv.org/abs/2206.08242v2","updated":"2023-08-15T07:43:44Z","published":"2022-06-16T15:22:39Z","title":"Catastrophic overfitting can be induced with discriminative non-robust\n features","summary":" Adversarial training (AT) is the de facto method for building robust neural\nnetworks, but it can be computationally expensive. To mitigate this, fast\nsingle-step attacks can be used, but this may lead to catastrophic overfitting\n(CO). This phenomenon appears when networks gain non-trivial robustness during\nthe first stages of AT, but then reach a breaking point where they become\nvulnerable in just a few iterations. The mechanisms that lead to this failure\nmode are still poorly understood. In this work, we study the onset of CO in\nsingle-step AT methods through controlled modifications of typical datasets of\nnatural images. In particular, we show that CO can be induced at much smaller\n$\\epsilon$ values than it was observed before just by injecting images with\nseemingly innocuous features. These features aid non-robust classification but\nare not enough to achieve robustness on their own. 
Through extensive\nexperiments we analyze this novel phenomenon and discover that the presence of\nthese easy features induces a learning shortcut that leads to CO. Our findings\nprovide new insights into the mechanisms of CO and improve our understanding of\nthe dynamics of AT. The code to reproduce our experiments can be found at\nhttps://github.com/gortizji/co_features.\n","authors":["Guillermo Ortiz-Jiménez","Pau de Jorge","Amartya Sanyal","Adel Bibi","Puneet K. Dokania","Pascal Frossard","Gregory Rogéz","Philip H. S. Torr"],"pdf_url":"https://arxiv.org/pdf/2206.08242v2.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2308.07611v1","updated":"2023-08-15T07:43:00Z","published":"2023-08-15T07:43:00Z","title":"GAMER-MRIL identifies Disability-Related Brain Changes in Multiple\n Sclerosis","summary":" Objective: Identifying disability-related brain changes is important for\nmultiple sclerosis (MS) patients. Currently, there is no clear understanding\nabout which pathological features drive disability in single MS patients. In\nthis work, we propose a novel comprehensive approach, GAMER-MRIL, leveraging\nwhole-brain quantitative MRI (qMRI), convolutional neural network (CNN), and an\ninterpretability method from classifying MS patients with severe disability to\ninvestigating relevant pathological brain changes. Methods:\nOne-hundred-sixty-six MS patients underwent 3T MRI acquisitions. qMRI\ninformative of microstructural brain properties was reconstructed, including\nquantitative T1 (qT1), myelin water fraction (MWF), and neurite density index\n(NDI). To fully utilize the qMRI, GAMER-MRIL extended a gated-attention-based\nCNN (GAMER-MRI), which was developed to select patch-based qMRI important for a\ngiven task/question, to the whole-brain image. To find out disability-related\nbrain regions, GAMER-MRIL modified a structure-aware interpretability method,\nLayer-wise Relevance Propagation (LRP), to incorporate qMRI. Results: The test\nperformance was AUC=0.885. qT1 was the most sensitive measure related to\ndisability, followed by NDI. The proposed LRP approach obtained more\nspecifically relevant regions than other interpretability methods, including\nthe saliency map, the integrated gradients, and the original LRP. The relevant\nregions included the corticospinal tract, where average qT1 and NDI\nsignificantly correlated with patients' disability scores ($\\rho$=-0.37 and\n0.44). Conclusion: These results demonstrated that GAMER-MRIL can classify\npatients with severe disability using qMRI and subsequently identify brain\nregions potentially important to the integrity of the mobile function.\nSignificance: GAMER-MRIL holds promise for developing biomarkers and increasing\nclinicians' trust in NN.\n","authors":["Po-Jui Lu","Benjamin Odry","Muhamed Barakovic","Matthias Weigel","Robin Sandkühler","Reza Rahmanzadeh","Xinjie Chen","Mario Ocampo-Pineda","Jens Kuhle","Ludwig Kappos","Philippe Cattin","Cristina Granziera"],"pdf_url":"https://arxiv.org/pdf/2308.07611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05234v2","updated":"2023-08-15T07:32:29Z","published":"2023-03-09T13:17:13Z","title":"GPGait: Generalized Pose-based Gait Recognition","summary":" Recent works on pose-based gait recognition have demonstrated the potential\nof using such simple information to achieve results comparable to\nsilhouette-based methods. 
However, the generalization ability of pose-based\nmethods on different datasets is undesirably inferior to that of\nsilhouette-based ones, which has received little attention but hinders the\napplication of these methods in real-world scenarios. To improve the\ngeneralization ability of pose-based methods across datasets, we propose a\n\\textbf{G}eneralized \\textbf{P}ose-based \\textbf{Gait} recognition\n(\\textbf{GPGait}) framework. First, a Human-Oriented Transformation (HOT) and a\nseries of Human-Oriented Descriptors (HOD) are proposed to obtain a unified\npose representation with discriminative multi-features. Then, given the slight\nvariations in the unified representation after HOT and HOD, it becomes crucial\nfor the network to extract local-global relationships between the keypoints. To\nthis end, a Part-Aware Graph Convolutional Network (PAGCN) is proposed to\nenable efficient graph partition and local-global spatial feature extraction.\nExperiments on four public gait recognition datasets, CASIA-B, OUMVLP-Pose,\nGait3D and GREW, show that our model demonstrates better and more stable\ncross-domain capabilities compared to existing skeleton-based methods,\nachieving comparable recognition results to silhouette-based ones. Code is\navailable at https://github.com/BNU-IVC/FastPoseGait.\n","authors":["Yang Fu","Shibei Meng","Saihui Hou","Xuecai Hu","Yongzhen Huang"],"pdf_url":"https://arxiv.org/pdf/2303.05234v2.pdf","comment":"ICCV Camera Ready"},{"id":"http://arxiv.org/abs/2307.16586v2","updated":"2023-08-15T07:30:08Z","published":"2023-07-31T11:40:53Z","title":"SAMFlow: Eliminating Any Fragmentation in Optical Flow with Segment\n Anything Model","summary":" Optical Flow Estimation aims to find the 2D dense motion field between two\nframes. Due to the limitation of model structures and training datasets,\nexisting methods often rely too much on local clues and ignore the integrity of\nobjects, resulting in fragmented motion estimation. Through theoretical\nanalysis, we find the pre-trained large vision models are helpful in optical\nflow estimation, and we notice that the recently famous Segment Anything Model\n(SAM) demonstrates a strong ability to segment complete objects, which is\nsuitable for solving the fragmentation problem. We thus propose a solution to\nembed the frozen SAM image encoder into FlowFormer to enhance object\nperception. To address the challenge of in-depth utilizing SAM in\nnon-segmentation tasks like optical flow estimation, we propose an Optical Flow\nTask-Specific Adaption scheme, including a Context Fusion Module to fuse the\nSAM encoder with the optical flow context encoder, and a Context Adaption\nModule to adapt the SAM features for optical flow task with Learned\nTask-Specific Embedding. Our proposed SAMFlow model reaches 0.86/2.10\nclean/final EPE and 3.55/12.32 EPE/F1-all on Sintel and KITTI-15 training set,\nsurpassing Flowformer by 8.5%/9.9% and 13.2%/16.3%. 
Furthermore, our model\nachieves state-of-the-art performance on the Sintel and KITTI-15 benchmarks,\nranking #1 among all two-frame methods on Sintel clean pass.\n","authors":["Shili Zhou","Ruian He","Weimin Tan","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2307.16586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.08231v2","updated":"2023-08-15T07:24:55Z","published":"2022-09-17T03:25:46Z","title":"Learning Distinct and Representative Styles for Image Captioning","summary":" Over the years, state-of-the-art (SoTA) image captioning methods have\nachieved promising results on some evaluation metrics (e.g., CIDEr). However,\nrecent findings show that the captions generated by these methods tend to be\nbiased toward the \"average\" caption that only captures the most general mode\n(a.k.a, language pattern) in the training corpus, i.e., the so-called mode\ncollapse problem. Affected by it, the generated captions are limited in\ndiversity and usually less informative than natural image descriptions made by\nhumans. In this paper, we seek to avoid this problem by proposing a Discrete\nMode Learning (DML) paradigm for image captioning. Our innovative idea is to\nexplore the rich modes in the training caption corpus to learn a set of \"mode\nembeddings\", and further use them to control the mode of the generated captions\nfor existing image captioning models. Specifically, the proposed DML optimizes\na dual architecture that consists of an image-conditioned discrete variational\nautoencoder (CdVAE) branch and a mode-conditioned image captioning (MIC)\nbranch. The CdVAE branch maps each image caption to one of the mode embeddings\nstored in a learned codebook, and is trained with a pure non-autoregressive\ngeneration objective to make the modes distinct and representative. The MIC\nbranch can be simply modified from an existing image captioning model, where\nthe mode embedding is added to the original word embeddings as the control\nsignal. In the experiments, we apply the proposed DML to two widely used image\ncaptioning models, Transformer and AoANet. The results show that the learned\nmode embedding successfully facilitates these models to generate high-quality\nimage captions with different modes, further leading to better performance for\nboth diversity and quality on the MSCOCO dataset.\n","authors":["Qi Chen","Chaorui Deng","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2209.08231v2.pdf","comment":"NeurIPS 2022"},{"id":"http://arxiv.org/abs/2308.07605v1","updated":"2023-08-15T07:20:22Z","published":"2023-08-15T07:20:22Z","title":"SGDiff: A Style Guided Diffusion Model for Fashion Synthesis","summary":" This paper reports on the development of \\textbf{a novel style guided\ndiffusion model (SGDiff)} which overcomes certain weaknesses inherent in\nexisting models for image synthesis. The proposed SGDiff combines image\nmodality with a pretrained text-to-image diffusion model to facilitate creative\nfashion image synthesis. It addresses the limitations of text-to-image\ndiffusion models by incorporating supplementary style guidance, substantially\nreducing training costs, and overcoming the difficulties of controlling\nsynthesized styles with text-only inputs. This paper also introduces a new\ndataset -- SG-Fashion, specifically designed for fashion image synthesis\napplications, offering high-resolution images and an extensive range of garment\ncategories. 
By means of comprehensive ablation study, we examine the\napplication of classifier-free guidance to a variety of conditions and validate\nthe effectiveness of the proposed model for generating fashion images of the\ndesired categories, product attributes, and styles. The contributions of this\npaper include a novel classifier-free guidance method for multi-modal feature\nfusion, a comprehensive dataset for fashion image synthesis application, a\nthorough investigation on conditioned text-to-image synthesis, and valuable\ninsights for future research in the text-to-image synthesis domain. The code\nand dataset are available at: \\url{https://github.com/taited/SGDiff}.\n","authors":["Zhengwentai Sun","Yanghong Zhou","Honghong He","P. Y. Mok"],"pdf_url":"https://arxiv.org/pdf/2308.07605v1.pdf","comment":"Accepted by ACM MM'23"},{"id":"http://arxiv.org/abs/2303.11589v2","updated":"2023-08-15T06:55:06Z","published":"2023-03-21T04:41:02Z","title":"LayoutDiffusion: Improving Graphic Layout Generation by Discrete\n Diffusion Probabilistic Models","summary":" Creating graphic layouts is a fundamental step in graphic designs. In this\nwork, we present a novel generative model named LayoutDiffusion for automatic\nlayout generation. As layout is typically represented as a sequence of discrete\ntokens, LayoutDiffusion models layout generation as a discrete denoising\ndiffusion process. It learns to reverse a mild forward process, in which\nlayouts become increasingly chaotic with the growth of forward steps and\nlayouts in the neighboring steps do not differ too much. Designing such a mild\nforward process is however very challenging as layout has both categorical\nattributes and ordinal attributes. To tackle the challenge, we summarize three\ncritical factors for achieving a mild forward process for the layout, i.e.,\nlegality, coordinate proximity and type disruption. Based on the factors, we\npropose a block-wise transition matrix coupled with a piece-wise linear noise\nschedule. Experiments on RICO and PubLayNet datasets show that LayoutDiffusion\noutperforms state-of-the-art approaches significantly. Moreover, it enables two\nconditional layout generation tasks in a plug-and-play manner without\nre-training and achieves better performance than existing methods.\n","authors":["Junyi Zhang","Jiaqi Guo","Shizhao Sun","Jian-Guang Lou","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.11589v2.pdf","comment":"Accepted by ICCV2023, project page: https://layoutdiffusion.github.io"},{"id":"http://arxiv.org/abs/2308.07593v1","updated":"2023-08-15T06:38:38Z","published":"2023-08-15T06:38:38Z","title":"AKVSR: Audio Knowledge Empowered Visual Speech Recognition by\n Compressing Audio Knowledge of a Pretrained Model","summary":" Visual Speech Recognition (VSR) is the task of predicting spoken words from\nsilent lip movements. VSR is regarded as a challenging task because of the\ninsufficient information on lip movements. 
In this paper, we propose an Audio\nKnowledge empowered Visual Speech Recognition framework (AKVSR) to complement\nthe insufficient speech information of visual modality by using audio modality.\nDifferent from the previous methods, the proposed AKVSR 1) utilizes rich audio\nknowledge encoded by a large-scale pretrained audio model, 2) saves the\nlinguistic information of audio knowledge in compact audio memory by discarding\nthe non-linguistic information from the audio through quantization, and 3)\nincludes Audio Bridging Module which can find the best-matched audio features\nfrom the compact audio memory, which makes our training possible without audio\ninputs, once after the compact audio memory is composed. We validate the\neffectiveness of the proposed method through extensive experiments, and achieve\nnew state-of-the-art performances on the widely-used datasets, LRS2 and LRS3.\n","authors":["Jeong Hun Yeo","Minsu Kim","Jeongsoo Choi","Dae Hoe Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2308.07593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07592v1","updated":"2023-08-15T06:30:19Z","published":"2023-08-15T06:30:19Z","title":"Graph-Segmenter: Graph Transformer with Boundary-aware Attention for\n Semantic Segmentation","summary":" The transformer-based semantic segmentation approaches, which divide the\nimage into different regions by sliding windows and model the relation inside\neach window, have achieved outstanding success. However, since the relation\nmodeling between windows was not the primary emphasis of previous work, it was\nnot fully utilized. To address this issue, we propose a Graph-Segmenter,\nincluding a Graph Transformer and a Boundary-aware Attention module, which is\nan effective network for simultaneously modeling the more profound relation\nbetween windows in a global view and various pixels inside each window as a\nlocal one, and for substantial low-cost boundary adjustment. Specifically, we\ntreat every window and pixel inside the window as nodes to construct graphs for\nboth views and devise the Graph Transformer. The introduced boundary-aware\nattention module optimizes the edge information of the target objects by\nmodeling the relationship between the pixel on the object's edge. Extensive\nexperiments on three widely used semantic segmentation datasets (Cityscapes,\nADE-20k and PASCAL Context) demonstrate that our proposed network, a Graph\nTransformer with Boundary-aware Attention, can achieve state-of-the-art\nsegmentation performance.\n","authors":["Zizhang Wu","Yuanzhu Gan","Tianhao Xu","Fan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08093v2","updated":"2023-08-15T06:29:24Z","published":"2023-07-16T16:29:40Z","title":"Cross-Ray Neural Radiance Fields for Novel-view Synthesis from\n Unconstrained Image Collections","summary":" Neural Radiance Fields (NeRF) is a revolutionary approach for rendering\nscenes by sampling a single ray per pixel and it has demonstrated impressive\ncapabilities in novel-view synthesis from static scene images. However, in\npractice, we usually need to recover NeRF from unconstrained image collections,\nwhich poses two challenges: 1) the images often have dynamic changes in\nappearance because of different capturing time and camera settings; 2) the\nimages may contain transient objects such as humans and cars, leading to\nocclusion and ghosting artifacts. 
Conventional approaches seek to address these\nchallenges by locally utilizing a single ray to synthesize a color of a pixel.\nIn contrast, humans typically perceive appearance and objects by globally\nutilizing information across multiple pixels. To mimic the perception process\nof humans, in this paper, we propose Cross-Ray NeRF (CR-NeRF) that leverages\ninteractive information across multiple rays to synthesize occlusion-free novel\nviews with the same appearances as the images. Specifically, to model varying\nappearances, we first propose to represent multiple rays with a novel cross-ray\nfeature and then recover the appearance by fusing global statistics, i.e.,\nfeature covariance of the rays and the image appearance. Moreover, to avoid\nocclusion introduced by transient objects, we propose a transient objects\nhandler and introduce a grid sampling strategy for masking out the transient\nobjects. We theoretically find that leveraging correlation across multiple rays\npromotes capturing more global information. Moreover, extensive experimental\nresults on large real-world datasets verify the effectiveness of CR-NeRF.\n","authors":["Yifan Yang","Shuhai Zhang","Zixiong Huang","Yubing Zhang","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2307.08093v2.pdf","comment":"ICCV 2023 Oral"},{"id":"http://arxiv.org/abs/2307.12101v2","updated":"2023-08-15T06:24:14Z","published":"2023-07-22T15:20:25Z","title":"Spatial Self-Distillation for Object Detection with Inaccurate Bounding\n Boxes","summary":" Object detection via inaccurate bounding boxes supervision has boosted a\nbroad interest due to the expensive high-quality annotation data or the\noccasional inevitability of low annotation quality (\\eg tiny objects). The\nprevious works usually utilize multiple instance learning (MIL), which highly\ndepends on category information, to select and refine a low-quality box. Those\nmethods suffer from object drift, group prediction and part domination problems\nwithout exploring spatial information. In this paper, we heuristically propose\na \\textbf{Spatial Self-Distillation based Object Detector (SSD-Det)} to mine\nspatial information to refine the inaccurate box in a self-distillation\nfashion. SSD-Det utilizes a Spatial Position Self-Distillation \\textbf{(SPSD)}\nmodule to exploit spatial information and an interactive structure to combine\nspatial information and category information, thus constructing a high-quality\nproposal bag. To further improve the selection procedure, a Spatial Identity\nSelf-Distillation \\textbf{(SISD)} module is introduced in SSD-Det to obtain\nspatial confidence to help select the best proposals. Experiments on MS-COCO\nand VOC datasets with noisy box annotation verify our method's effectiveness\nand achieve state-of-the-art performance. The code is available at\nhttps://github.com/ucas-vg/PointTinyBenchmark/tree/SSD-Det.\n","authors":["Di Wu","Pengfei Chen","Xuehui Yu","Guorong Li","Zhenjun Han","Jianbin Jiao"],"pdf_url":"https://arxiv.org/pdf/2307.12101v2.pdf","comment":"accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07590v1","updated":"2023-08-15T06:21:56Z","published":"2023-08-15T06:21:56Z","title":"ADD: An Automatic Desensitization Fisheye Dataset for Autonomous Driving","summary":" Autonomous driving systems require many images for analyzing the surrounding\nenvironment. However, there is fewer data protection for private information\namong these captured images, such as pedestrian faces or vehicle license\nplates, which has become a significant issue. 
In this paper, in response to the\ncall for data security laws and regulations and based on the advantages of\nlarge Field of View(FoV) of the fisheye camera, we build the first Autopilot\nDesensitization Dataset, called ADD, and formulate the first\ndeep-learning-based image desensitization framework, to promote the study of\nimage desensitization in autonomous driving scenarios. The compiled dataset\nconsists of 650K images, including different face and vehicle license plate\ninformation captured by the surround-view fisheye camera. It covers various\nautonomous driving scenarios, including diverse facial characteristics and\nlicense plate colors. Then, we propose an efficient multitask desensitization\nnetwork called DesCenterNet as a benchmark on the ADD dataset, which can\nperform face and vehicle license plate detection and desensitization tasks.\nBased on ADD, we further provide an evaluation criterion for desensitization\nperformance, and extensive comparison experiments have verified the\neffectiveness and superiority of our method on image desensitization.\n","authors":["Zizhang Wu","Chenxin Yuan","Hongyang Wei","Fan Song","Tianhao Xu"],"pdf_url":"https://arxiv.org/pdf/2308.07590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07580v1","updated":"2023-08-15T05:51:25Z","published":"2023-08-15T05:51:25Z","title":"AutoLTS: Automating Cycling Stress Assessment via Contrastive Learning\n and Spatial Post-processing","summary":" Cycling stress assessment, which quantifies cyclists' perceived stress\nimposed by the built environment and motor traffics, increasingly informs\ncycling infrastructure planning and cycling route recommendation. However,\ncurrently calculating cycling stress is slow and data-intensive, which hinders\nits broader application. In this paper, We propose a deep learning framework to\nsupport accurate, fast, and large-scale cycling stress assessments for urban\nroad networks based on street-view images. Our framework features i) a\ncontrastive learning approach that leverages the ordinal relationship among\ncycling stress labels, and ii) a post-processing technique that enforces\nspatial smoothness into our predictions. On a dataset of 39,153 road segments\ncollected in Toronto, Canada, our results demonstrate the effectiveness of our\ndeep learning framework and the value of using image data for cycling stress\nassessment in the absence of high-quality road geometry and motor traffic data.\n","authors":["Bo Lin","Shoshanna Saxe","Timothy C. Y. Chan"],"pdf_url":"https://arxiv.org/pdf/2308.07580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07575v1","updated":"2023-08-15T05:08:12Z","published":"2023-08-15T05:08:12Z","title":"Story Visualization by Online Text Augmentation with Context Memory","summary":" Story visualization (SV) is a challenging text-to-image generation task for\nthe difficulty of not only rendering visual details from the text descriptions\nbut also encoding a long-term context across multiple sentences. While prior\nefforts mostly focus on generating a semantically relevant image for each\nsentence, encoding a context spread across the given paragraph to generate\ncontextually convincing images (e.g., with a correct character or with a proper\nbackground of the scene) remains a challenge. 
To this end, we propose a novel\nmemory architecture for the Bi-directional Transformers with an online text\naugmentation that generates multiple pseudo-descriptions as supplementary\nsupervision during training, for better generalization to the language\nvariation at inference. In extensive experiments on the two popular SV\nbenchmarks, i.e., the Pororo-SV and Flintstones-SV, the proposed method\nsignificantly outperforms the state of the arts in various evaluation metrics\nincluding FID, character F1, frame accuracy, BLEU-2/3, and R-precision with\nsimilar or less computational complexity.\n","authors":["Daechul Ahn","Daneul Kim","Gwangmo Song","Seung Hwan Kim","Honglak Lee","Dongyeop Kang","Jonghyun Choi"],"pdf_url":"https://arxiv.org/pdf/2308.07575v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.15353v2","updated":"2023-08-15T05:04:26Z","published":"2023-07-28T07:03:18Z","title":"Supervised Homography Learning with Realistic Dataset Generation","summary":" In this paper, we propose an iterative framework, which consists of two\nphases: a generation phase and a training phase, to generate realistic training\ndata and yield a supervised homography network. In the generation phase, given\nan unlabeled image pair, we utilize the pre-estimated dominant plane masks and\nhomography of the pair, along with another sampled homography that serves as\nground truth to generate a new labeled training pair with realistic motion. In\nthe training phase, the generated data is used to train the supervised\nhomography network, in which the training data is refined via a content\nconsistency module and a quality assessment module. Once an iteration is\nfinished, the trained network is used in the next data generation phase to\nupdate the pre-estimated homography. Through such an iterative strategy, the\nquality of the dataset and the performance of the network can be gradually and\nsimultaneously improved. Experimental results show that our method achieves\nstate-of-the-art performance and existing supervised methods can be also\nimproved based on the generated dataset. Code and dataset are available at\nhttps://github.com/JianghaiSCU/RealSH.\n","authors":["Hai Jiang","Haipeng Li","Songchen Han","Haoqiang Fan","Bing Zeng","Shuaicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2307.15353v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07573v1","updated":"2023-08-15T05:04:04Z","published":"2023-08-15T05:04:04Z","title":"Synthetic data generation method for hybrid image-tabular data using two\n generative adversarial networks","summary":" The generation of synthetic medical records using generative adversarial\nnetworks (GANs) has become increasingly important for addressing privacy\nconcerns and promoting data sharing in the medical field. In this paper, we\npropose a novel method for generating synthetic hybrid medical records\nconsisting of chest X-ray images (CXRs) and structured tabular data (including\nanthropometric data and laboratory tests) using an auto-encoding GAN\n({\\alpha}GAN) and a conditional tabular GAN (CTGAN). Our approach involves\ntraining a {\\alpha}GAN model on a large public database (pDB) to reduce the\ndimensionality of CXRs. We then applied the trained encoder of the GAN model to\nthe images in original database (oDB) to obtain the latent vectors. These\nlatent vectors were combined with tabular data in oDB, and these joint data\nwere used to train the CTGAN model. 
We successfully generated diverse synthetic\nrecords of hybrid CXR and tabular data, maintaining correspondence between\nthem. We evaluated this synthetic database (sDB) through visual assessment,\ndistribution of interrecord distances, and classification tasks. Our evaluation\nresults showed that the sDB captured the features of the oDB while maintaining\nthe correspondence between the images and tabular data. Although our approach\nrelies on the availability of a large-scale pDB containing a substantial number\nof images with the same modality and imaging region as those in the oDB, this\nmethod has the potential for the public release of synthetic datasets without\ncompromising the secondary use of data.\n","authors":["Tomohiro Kikuchi","Shouhei Hanaoka","Takahiro Nakao","Tomomi Takenaga","Yukihiro Nomura","Harushi Mori","Takeharu Yoshikawa"],"pdf_url":"https://arxiv.org/pdf/2308.07573v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2302.00988v2","updated":"2023-08-15T04:51:27Z","published":"2023-02-02T10:13:04Z","title":"HaMuCo: Hand Pose Estimation via Multiview Collaborative Self-Supervised\n Learning","summary":" Recent advancements in 3D hand pose estimation have shown promising results,\nbut its effectiveness has primarily relied on the availability of large-scale\nannotated datasets, the creation of which is a laborious and costly process. To\nalleviate the label-hungry limitation, we propose a self-supervised learning\nframework, HaMuCo, that learns a single-view hand pose estimator from\nmulti-view pseudo 2D labels. However, one of the main challenges of\nself-supervised learning is the presence of noisy labels and the ``groupthink''\neffect from multiple views. To overcome these issues, we introduce a cross-view\ninteraction network that distills the single-view estimator by utilizing the\ncross-view correlated features and enforcing multi-view consistency to achieve\ncollaborative learning. Both the single-view estimator and the cross-view\ninteraction network are trained jointly in an end-to-end manner. Extensive\nexperiments show that our method can achieve state-of-the-art performance on\nmulti-view self-supervised hand pose estimation. Furthermore, the proposed\ncross-view interaction network can also be applied to hand pose estimation from\nmulti-view input and outperforms previous methods under the same settings.\n","authors":["Xiaozheng Zheng","Chao Wen","Zhou Xue","Pengfei Ren","Jingyu Wang"],"pdf_url":"https://arxiv.org/pdf/2302.00988v2.pdf","comment":"Accepted to ICCV 2023. Won first place in the HANDS22 Challenge Task\n 2. Project page: https://zxz267.github.io/HaMuCo"},{"id":"http://arxiv.org/abs/2308.07571v1","updated":"2023-08-15T04:49:11Z","published":"2023-08-15T04:49:11Z","title":"Ske2Grid: Skeleton-to-Grid Representation Learning for Action\n Recognition","summary":" This paper presents Ske2Grid, a new representation learning framework for\nimproved skeleton-based action recognition. In Ske2Grid, we define a regular\nconvolution operation upon a novel grid representation of human skeleton, which\nis a compact image-like grid patch constructed and learned through three novel\ndesigns. Specifically, we propose a graph-node index transform (GIT) to\nconstruct a regular grid patch through assigning the nodes in the skeleton\ngraph one by one to the desired grid cells. 
To ensure that GIT is a bijection\nand enrich the expressiveness of the grid representation, an up-sampling\ntransform (UPT) is learned to interpolate the skeleton graph nodes for filling\nthe grid patch to the full. To resolve the problem when the one-step UPT is\naggressive and further exploit the representation capability of the grid patch\nwith increasing spatial size, a progressive learning strategy (PLS) is proposed\nwhich decouples the UPT into multiple steps and aligns them to multiple paired\nGITs through a compact cascaded design learned progressively. We construct\nnetworks upon prevailing graph convolution networks and conduct experiments on\nsix mainstream skeleton-based action recognition datasets. Experiments show\nthat our Ske2Grid significantly outperforms existing GCN-based solutions under\ndifferent benchmark settings, without bells and whistles. Code and models are\navailable at https://github.com/OSVAI/Ske2Grid\n","authors":["Dongqi Cai","Yangyuxuan Kang","Anbang Yao","Yurong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07571v1.pdf","comment":"The paper of Ske2Grid is published at ICML 2023. Code and models are\n available at https://github.com/OSVAI/Ske2Grid"},{"id":"http://arxiv.org/abs/2308.02804v2","updated":"2023-08-15T04:29:40Z","published":"2023-08-05T06:29:46Z","title":"MiAMix: Enhancing Image Classification through a Multi-stage Augmented\n Mixed Sample Data Augmentation Method","summary":" Despite substantial progress in the field of deep learning, overfitting\npersists as a critical challenge, and data augmentation has emerged as a\nparticularly promising approach due to its capacity to enhance model\ngeneralization in various computer vision tasks. While various strategies have\nbeen proposed, Mixed Sample Data Augmentation (MSDA) has shown great potential\nfor enhancing model performance and generalization. We introduce a novel mixup\nmethod called MiAMix, which stands for Multi-stage Augmented Mixup. MiAMix\nintegrates image augmentation into the mixup framework, utilizes multiple\ndiversified mixing methods concurrently, and improves the mixing method by\nrandomly selecting mixing mask augmentation methods. Recent methods utilize\nsaliency information and the MiAMix is designed for computational efficiency as\nwell, reducing additional overhead and offering easy integration into existing\ntraining pipelines. We comprehensively evaluate MiaMix using four image\nbenchmarks and pitting it against current state-of-the-art mixed sample data\naugmentation techniques to demonstrate that MIAMix improves performance without\nheavy computational overhead.\n","authors":["Wen Liang","Youzhi Liang","Jianguo Jia"],"pdf_url":"https://arxiv.org/pdf/2308.02804v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.08510v4","updated":"2023-08-15T04:07:36Z","published":"2022-02-17T08:33:52Z","title":"Multi-Scale Hybrid Vision Transformer for Learning Gastric Histology:\n AI-Based Decision Support System for Gastric Cancer Treatment","summary":" Gastric endoscopic screening is an effective way to decide appropriate\ngastric cancer (GC) treatment at an early stage, reducing GC-associated\nmortality rate. Although artificial intelligence (AI) has brought a great\npromise to assist pathologist to screen digitalized whole slide images,\nexisting AI systems are limited in fine-grained cancer subclassifications and\nhave little usability in planning cancer treatment. 
We propose a practical AI\nsystem that enables five subclassifications of GC pathology, which can be\ndirectly matched to general GC treatment guidance. The AI system is designed to\nefficiently differentiate multi-classes of GC through multi-scale\nself-attention mechanism using 2-stage hybrid Vision Transformer (ViT)\nnetworks, by mimicking the way how human pathologists understand histology. The\nAI system demonstrates reliable diagnostic performance by achieving\nclass-average sensitivity of above 0.85 on a total of 1,212 slides from\nmulticentric cohort. Furthermore, AI-assisted pathologists show significantly\nimproved diagnostic sensitivity by 12% in addition to 18% reduced screening\ntime compared to human pathologists. Our results demonstrate that AI-assisted\ngastric endoscopic screening has a great potential for providing presumptive\npathologic opinion and appropriate cancer treatment of gastric cancer in\npractical clinical settings.\n","authors":["Yujin Oh","Go Eun Bae","Kyung-Hee Kim","Min-Kyung Yeo","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2202.08510v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14685v2","updated":"2023-08-15T03:57:38Z","published":"2023-06-26T13:30:38Z","title":"DiffSketcher: Text Guided Vector Sketch Synthesis through Latent\n Diffusion Models","summary":" Even though trained mainly on images, we discover that pretrained diffusion\nmodels show impressive power in guiding sketch synthesis. In this paper, we\npresent DiffSketcher, an innovative algorithm that creates vectorized free-hand\nsketches using natural language input. DiffSketcher is developed based on a\npre-trained text-to-image diffusion model. It performs the task by directly\noptimizing a set of Bezier curves with an extended version of the score\ndistillation sampling (SDS) loss, which allows us to use a raster-level\ndiffusion model as a prior for optimizing a parametric vectorized sketch\ngenerator. Furthermore, we explore attention maps embedded in the diffusion\nmodel for effective stroke initialization to speed up the generation process.\nThe generated sketches demonstrate multiple levels of abstraction while\nmaintaining recognizability, underlying structure, and essential visual details\nof the subject drawn. Our experiments show that DiffSketcher achieves greater\nquality than prior work.\n","authors":["Ximing Xing","Chuang Wang","Haitao Zhou","Jing Zhang","Qian Yu","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2306.14685v2.pdf","comment":"14 pages, 8 figures. update: improved experiment analysis, fixed\n typos, and fixed image errors"},{"id":"http://arxiv.org/abs/2308.07558v1","updated":"2023-08-15T03:56:46Z","published":"2023-08-15T03:56:46Z","title":"Action Class Relation Detection and Classification Across Multiple Video\n Datasets","summary":" The Meta Video Dataset (MetaVD) provides annotated relations between action\nclasses in major datasets for human action recognition in videos. Although\nthese annotated relations enable dataset augmentation, it is only applicable to\nthose covered by MetaVD. For an external dataset to enjoy the same benefit, the\nrelations between its action classes and those in MetaVD need to be determined.\nTo address this issue, we consider two new machine learning tasks: action class\nrelation detection and classification. We propose a unified model to predict\nrelations between action classes, using language and visual information\nassociated with classes. 
Experimental results show that (i) pre-trained recent\nneural network models for texts and videos contribute to high predictive\nperformance, (ii) the relation prediction based on action label texts is more\naccurate than based on videos, and (iii) a blending approach that combines\npredictions by both modalities can further improve the predictive performance\nin some cases.\n","authors":["Yuya Yoshikawa","Yutaro Shigeto","Masashi Shimbo","Akikazu Takeuchi"],"pdf_url":"https://arxiv.org/pdf/2308.07558v1.pdf","comment":"Accepted to Pattern Recognition Letters. 12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2303.01043v2","updated":"2023-08-15T03:53:36Z","published":"2023-03-02T07:56:04Z","title":"I2P-Rec: Recognizing Images on Large-scale Point Cloud Maps through\n Bird's Eye View Projections","summary":" Place recognition is an important technique for autonomous cars to achieve\nfull autonomy since it can provide an initial guess to online localization\nalgorithms. Although current methods based on images or point clouds have\nachieved satisfactory performance, localizing the images on a large-scale point\ncloud map remains a fairly unexplored problem. This cross-modal matching task\nis challenging due to the difficulty in extracting consistent descriptors from\nimages and point clouds. In this paper, we propose the I2P-Rec method to solve\nthe problem by transforming the cross-modal data into the same modality.\nSpecifically, we leverage on the recent success of depth estimation networks to\nrecover point clouds from images. We then project the point clouds into Bird's\nEye View (BEV) images. Using the BEV image as an intermediate representation,\nwe extract global features with a Convolutional Neural Network followed by a\nNetVLAD layer to perform matching. The experimental results evaluated on the\nKITTI dataset show that, with only a small set of training data, I2P-Rec\nachieves recall rates at Top-1\\% over 80\\% and 90\\%, when localizing monocular\nand stereo images on point cloud maps, respectively. We further evaluate\nI2P-Rec on a 1 km trajectory dataset collected by an autonomous logistics car\nand show that I2P-Rec can generalize well to previously unseen environments.\n","authors":["Shuhang Zheng","Yixuan Li","Zhu Yu","Beinan Yu","Si-Yuan Cao","Minhang Wang","Jintao Xu","Rui Ai","Weihao Gu","Lun Luo","Hui-Liang Shen"],"pdf_url":"https://arxiv.org/pdf/2303.01043v2.pdf","comment":"Accepted by IROS 2023"},{"id":"http://arxiv.org/abs/2308.07555v1","updated":"2023-08-15T03:49:05Z","published":"2023-08-15T03:49:05Z","title":"SST: A Simplified Swin Transformer-based Model for Taxi Destination\n Prediction based on Existing Trajectory","summary":" Accurately predicting the destination of taxi trajectories can have various\nbenefits for intelligent location-based services. One potential method to\naccomplish this prediction is by converting the taxi trajectory into a\ntwo-dimensional grid and using computer vision techniques. While the Swin\nTransformer is an innovative computer vision architecture with demonstrated\nsuccess in vision downstream tasks, it is not commonly used to solve real-world\ntrajectory problems. In this paper, we propose a simplified Swin Transformer\n(SST) structure that does not use the shifted window idea in the traditional\nSwin Transformer, as trajectory data is consecutive in nature. 
Our\ncomprehensive experiments, based on real trajectory data, demonstrate that SST\ncan achieve higher accuracy compared to state-of-the-art methods.\n","authors":["Zepu Wang","Yifei Sun","Zhiyu Lei","Xincheng Zhu","Peng Sun"],"pdf_url":"https://arxiv.org/pdf/2308.07555v1.pdf","comment":"Accepted by IEEE ITSC"},{"id":"http://arxiv.org/abs/2308.02158v2","updated":"2023-08-15T03:45:50Z","published":"2023-08-04T06:37:28Z","title":"CTP-Net: Character Texture Perception Network for Document Image Forgery\n Localization","summary":" Due to the progression of information technology in recent years, document\nimages have been widely disseminated on social networks. With the help of\npowerful image editing tools, document images are easily forged without leaving\nvisible manipulation traces, which leads to severe issues if significant\ninformation is falsified for malicious use. Therefore, the research of document\nimage forensics is worth further exploring. In this paper, we propose a\nCharacter Texture Perception Network (CTP-Net) to localize the forged regions\nin document images. Specifically, considering the characters with semantics in\na document image are highly vulnerable, capturing the forgery traces is the key\nto localize the forged regions. We design a Character Texture Stream (CTS)\nbased on optical character recognition to capture features of text areas that\nare essential components of a document image. Meanwhile, texture features of\nthe whole document image are exploited by an Image Texture Stream (ITS).\nCombining the features extracted from the CTS and the ITS, the CTP-Net can\nreveal more subtle forgery traces from document images. Moreover, to overcome\nthe challenge caused by the lack of fake document images, we design a data\ngeneration strategy that is utilized to construct a Fake Chinese Trademark\ndataset (FCTM). Experimental results on different datasets demonstrate that the\nproposed CTP-Net is able to localize multi-scale forged areas in document\nimages, and outperform the state-of-the-art forgery localization methods, even\nthough post-processing operations are applied.\n","authors":["Xin Liao","Siliang Chen","Jiaxin Chen","Tianyi Wang","Xiehua Li"],"pdf_url":"https://arxiv.org/pdf/2308.02158v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14325v3","updated":"2023-08-15T03:44:00Z","published":"2023-02-28T05:37:45Z","title":"BEVPlace: Learning LiDAR-based Place Recognition using Bird's Eye View\n Images","summary":" Place recognition is a key module for long-term SLAM systems. Current\nLiDAR-based place recognition methods usually use representations of point\nclouds such as unordered points or range images. These methods achieve high\nrecall rates of retrieval, but their performance may degrade in the case of\nview variation or scene changes. In this work, we explore the potential of a\ndifferent representation in place recognition, i.e. bird's eye view (BEV)\nimages. We observe that the structural contents of BEV images are less\ninfluenced by rotations and translations of point clouds. We validate that,\nwithout any delicate design, a simple VGGNet trained on BEV images achieves\ncomparable performance with the state-of-the-art place recognition methods in\nscenes of slight viewpoint changes. For more robust place recognition, we\ndesign a rotation-invariant network called BEVPlace. We use group convolution\nto extract rotation-equivariant local features from the images and NetVLAD for\nglobal feature aggregation. 
In addition, we observe that the distance between\nBEV features is correlated with the geometry distance of point clouds. Based on\nthe observation, we develop a method to estimate the position of the query\ncloud, extending the usage of place recognition. The experiments conducted on\nlarge-scale public datasets show that our method 1) achieves state-of-the-art\nperformance in terms of recall rates, 2) is robust to view changes, 3) shows\nstrong generalization ability, and 4) can estimate the positions of query point\nclouds. Source codes are publicly available at\nhttps://github.com/zjuluolun/BEVPlace.\n","authors":["Lun Luo","Shuhang Zheng","Yixuan Li","Yongzhi Fan","Beinan Yu","Siyuan Cao","Huiliang Shen"],"pdf_url":"https://arxiv.org/pdf/2302.14325v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07551v1","updated":"2023-08-15T03:42:19Z","published":"2023-08-15T03:42:19Z","title":"Multi-view 3D Face Reconstruction Based on Flame","summary":" At present, face 3D reconstruction has broad application prospects in various\nfields, but the research on it is still in the development stage. In this\npaper, we hope to achieve better face 3D reconstruction quality by combining\nmulti-view training framework with face parametric model Flame, propose a\nmulti-view training and testing model MFNet (Multi-view Flame Network). We\nbuild a self-supervised training framework and implement constraints such as\nmulti-view optical flow loss function and face landmark loss, and finally\nobtain a complete MFNet. We propose innovative implementations of multi-view\noptical flow loss and the covisible mask. We test our model on AFLW and\nfacescape datasets and also take pictures of our faces to reconstruct 3D faces\nwhile simulating actual scenarios as much as possible, which achieves good\nresults. Our work mainly addresses the problem of combining parametric models\nof faces with multi-view face 3D reconstruction and explores the implementation\nof a Flame based multi-view training and testing framework for contributing to\nthe field of face 3D reconstruction.\n","authors":["Wenzhuo Zheng","Junhao Zhao","Xiaohong Liu","Yongyang Pan","Zhenghao Gan","Haozhe Han","Ning Liu"],"pdf_url":"https://arxiv.org/pdf/2308.07551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07546v1","updated":"2023-08-15T03:29:31Z","published":"2023-08-15T03:29:31Z","title":"3DHacker: Spectrum-based Decision Boundary Generation for Hard-label 3D\n Point Cloud Attack","summary":" With the maturity of depth sensors, the vulnerability of 3D point cloud\nmodels has received increasing attention in various applications such as\nautonomous driving and robot navigation. Previous 3D adversarial attackers\neither follow the white-box setting to iteratively update the coordinate\nperturbations based on gradients, or utilize the output model logits to\nestimate noisy gradients in the black-box setting. However, these attack\nmethods are hard to be deployed in real-world scenarios since realistic 3D\napplications will not share any model details to users. Therefore, we explore a\nmore challenging yet practical 3D attack setting, \\textit{i.e.}, attacking\npoint clouds with black-box hard labels, in which the attacker can only have\naccess to the prediction label of the input. 
To tackle this setting, we propose\na novel 3D attack method, termed \\textbf{3D} \\textbf{H}ard-label\natt\\textbf{acker} (\\textbf{3DHacker}), based on the developed decision boundary\nalgorithm to generate adversarial samples solely with the knowledge of class\nlabels. Specifically, to construct the class-aware model decision boundary,\n3DHacker first randomly fuses two point clouds of different classes in the\nspectral domain to craft their intermediate sample with high imperceptibility,\nthen projects it onto the decision boundary via binary search. To restrict the\nfinal perturbation size, 3DHacker further introduces an iterative optimization\nstrategy to move the intermediate sample along the decision boundary for\ngenerating adversarial point clouds with smallest trivial perturbations.\nExtensive evaluations show that, even in the challenging hard-label setting,\n3DHacker still competitively outperforms existing 3D attacks regarding the\nattack performance as well as adversary quality.\n","authors":["Yunbo Tao","Daizong Liu","Pan Zhou","Yulai Xie","Wei Du","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2308.07546v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2305.06794v2","updated":"2023-08-15T03:24:57Z","published":"2023-05-11T13:34:02Z","title":"MMF-Track: Multi-modal Multi-level Fusion for 3D Single Object Tracking","summary":" 3D single object tracking plays a crucial role in computer vision. Mainstream\nmethods mainly rely on point clouds to achieve geometry matching between target\ntemplate and search area. However, textureless and incomplete point clouds make\nit difficult for single-modal trackers to distinguish objects with similar\nstructures. To overcome the limitations of geometry matching, we propose a\nMulti-modal Multi-level Fusion Tracker (MMF-Track), which exploits the image\ntexture and geometry characteristic of point clouds to track 3D target.\nSpecifically, we first propose a Space Alignment Module (SAM) to align RGB\nimages with point clouds in 3D space, which is the prerequisite for\nconstructing inter-modal associations. Then, in feature interaction level, we\ndesign a Feature Interaction Module (FIM) based on dual-stream structure, which\nenhances intra-modal features in parallel and constructs inter-modal semantic\nassociations. Meanwhile, in order to refine each modal feature, we introduce a\nCoarse-to-Fine Interaction Module (CFIM) to realize the hierarchical feature\ninteraction at different scales. Finally, in similarity fusion level, we\npropose a Similarity Fusion Module (SFM) to aggregate geometry and texture\nclues from the target. Experiments show that our method achieves\nstate-of-the-art performance on KITTI (39% Success and 42% Precision gains\nagainst previous multi-modal method) and is also competitive on NuScenes.\n","authors":["Zhiheng Li","Yubo Cui","Yu Lin","Zheng Fang"],"pdf_url":"https://arxiv.org/pdf/2305.06794v2.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.07545v1","updated":"2023-08-15T03:22:40Z","published":"2023-08-15T03:22:40Z","title":"Multimodal Dataset Distillation for Image-Text Retrieval","summary":" Dataset distillation methods offer the promise of reducing a large-scale\ndataset down to a significantly smaller set of (potentially synthetic) training\nexamples, which preserve sufficient information for training a new model from\nscratch. So far dataset distillation methods have been developed for image\nclassification. 
However, with the rise in capabilities of vision-language\nmodels, and especially given the scale of datasets necessary to train these\nmodels, the time is ripe to expand dataset distillation methods beyond image\nclassification. In this work, we take the first steps towards this goal by\nexpanding on the idea of trajectory matching to create a distillation method\nfor vision-language datasets. The key challenge is that vision-language\ndatasets do not have a set of discrete classes. To overcome this, our proposed\nmultimodal dataset distillation method jointly distill the images and their\ncorresponding language descriptions in a contrastive formulation. Since there\nare no existing baselines, we compare our approach to three coreset selection\nmethods (strategic subsampling of the training dataset), which we adapt to the\nvision-language setting. We demonstrate significant improvements on the\nchallenging Flickr30K and COCO retrieval benchmark: the best coreset selection\nmethod which selects 1000 image-text pairs for training is able to achieve only\n5.6% image-to-text retrieval accuracy (recall@1); in contrast, our dataset\ndistillation approach almost doubles that with just 100 (an order of magnitude\nfewer) training pairs.\n","authors":["Xindi Wu","Zhiwei Deng","Olga Russakovsky"],"pdf_url":"https://arxiv.org/pdf/2308.07545v1.pdf","comment":"28 pages, 11 figures"},{"id":"http://arxiv.org/abs/2304.01489v2","updated":"2023-08-15T03:14:47Z","published":"2023-04-04T03:08:02Z","title":"Improved Visual Fine-tuning with Natural Language Supervision","summary":" Fine-tuning a visual pre-trained model can leverage the semantic information\nfrom large-scale pre-training data and mitigate the over-fitting problem on\ndownstream vision tasks with limited training examples. While the problem of\ncatastrophic forgetting in pre-trained backbone has been extensively studied\nfor fine-tuning, its potential bias from the corresponding pre-training task\nand data, attracts less attention. In this work, we investigate this problem by\ndemonstrating that the obtained classifier after fine-tuning will be close to\nthat induced by the pre-trained model. To reduce the bias in the classifier\neffectively, we introduce a reference distribution obtained from a fixed text\nclassifier, which can help regularize the learned vision classifier. The\nproposed method, Text Supervised fine-tuning (TeS), is evaluated with diverse\npre-trained vision models including ResNet and ViT, and text encoders including\nBERT and CLIP, on 11 downstream tasks. The consistent improvement with a clear\nmargin over distinct scenarios confirms the effectiveness of our proposal. 
Code\nis available at \\url{https://github.com/idstcv/TeS}.\n","authors":["Junyang Wang","Yuanhong Xu","Juhua Hu","Ming Yan","Jitao Sang","Qi Qian"],"pdf_url":"https://arxiv.org/pdf/2304.01489v2.pdf","comment":"accepted by ICCV'23"},{"id":"http://arxiv.org/abs/2308.07207v2","updated":"2023-08-15T02:59:04Z","published":"2023-08-14T15:24:44Z","title":"FOLT: Fast Multiple Object Tracking from UAV-captured Videos Based on\n Optical Flow","summary":" Multiple object tracking (MOT) has been successfully investigated in computer\nvision.\n However, MOT for the videos captured by unmanned aerial vehicles (UAV) is\nstill challenging due to small object size, blurred object appearance, and very\nlarge and/or irregular motion in both ground objects and UAV platforms.\n In this paper, we propose FOLT to mitigate these problems and reach fast and\naccurate MOT in UAV view.\n Aiming at speed-accuracy trade-off, FOLT adopts a modern detector and\nlight-weight optical flow extractor to extract object detection features and\nmotion features at a minimum cost.\n Given the extracted flow, the flow-guided feature augmentation is designed to\naugment the object detection feature based on its optical flow, which improves\nthe detection of small objects.\n Then the flow-guided motion prediction is also proposed to predict the\nobject's position in the next frame, which improves the tracking performance of\nobjects with very large displacements between adjacent frames.\n Finally, the tracker matches the detected objects and predicted objects using\na spatially matching scheme to generate tracks for every object.\n Experiments on Visdrone and UAVDT datasets show that our proposed model can\nsuccessfully track small objects with large and irregular motion and outperform\nexisting state-of-the-art methods in UAV-MOT tasks.\n","authors":["Mufeng Yao","Jiaqi Wang","Jinlong Peng","Mingmin Chi","Chao Liu"],"pdf_url":"https://arxiv.org/pdf/2308.07207v2.pdf","comment":"Accepted by ACM Multi-Media 2023"},{"id":"http://arxiv.org/abs/2308.07539v1","updated":"2023-08-15T02:46:49Z","published":"2023-08-15T02:46:49Z","title":"Visual and Textual Prior Guided Mask Assemble for Few-Shot Segmentation\n and Beyond","summary":" Few-shot segmentation (FSS) aims to segment the novel classes with a few\nannotated images. Due to CLIP's advantages of aligning visual and textual\ninformation, the integration of CLIP can enhance the generalization ability of\nFSS model. However, even with the CLIP model, the existing CLIP-based FSS\nmethods are still subject to the biased prediction towards base classes, which\nis caused by the class-specific feature level interactions. To solve this\nissue, we propose a visual and textual Prior Guided Mask Assemble Network\n(PGMA-Net). It employs a class-agnostic mask assembly process to alleviate the\nbias, and formulates diverse tasks into a unified manner by assembling the\nprior through affinity. Specifically, the class-relevant textual and visual\nfeatures are first transformed to class-agnostic prior in the form of\nprobability map. Then, a Prior-Guided Mask Assemble Module (PGMAM) including\nmultiple General Assemble Units (GAUs) is introduced. It considers diverse and\nplug-and-play interactions, such as visual-textual, inter- and intra-image,\ntraining-free, and high-order ones. 
Lastly, to ensure the class-agnostic\nability, a Hierarchical Decoder with Channel-Drop Mechanism (HDCDM) is proposed\nto flexibly exploit the assembled masks and low-level features, without relying\non any class-specific information. It achieves new state-of-the-art results in\nthe FSS task, with mIoU of $77.6$ on $\\text{PASCAL-}5^i$ and $59.4$ on\n$\\text{COCO-}20^i$ in 1-shot scenario. Beyond this, we show that without extra\nre-training, the proposed PGMA-Net can solve bbox-level and cross-domain FSS,\nco-segmentation, zero-shot segmentation (ZSS) tasks, leading an any-shot\nsegmentation framework.\n","authors":["Chen Shuai","Meng Fanman","Zhang Runtong","Qiu Heqian","Li Hongliang","Wu Qingbo","Xu Linfeng"],"pdf_url":"https://arxiv.org/pdf/2308.07539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07537v1","updated":"2023-08-15T02:39:39Z","published":"2023-08-15T02:39:39Z","title":"AttMOT: Improving Multiple-Object Tracking by Introducing Auxiliary\n Pedestrian Attributes","summary":" Multi-object tracking (MOT) is a fundamental problem in computer vision with\nnumerous applications, such as intelligent surveillance and automated driving.\nDespite the significant progress made in MOT, pedestrian attributes, such as\ngender, hairstyle, body shape, and clothing features, which contain rich and\nhigh-level information, have been less explored. To address this gap, we\npropose a simple, effective, and generic method to predict pedestrian\nattributes to support general Re-ID embedding. We first introduce AttMOT, a\nlarge, highly enriched synthetic dataset for pedestrian tracking, containing\nover 80k frames and 6 million pedestrian IDs with different time, weather\nconditions, and scenarios. To the best of our knowledge, AttMOT is the first\nMOT dataset with semantic attributes. Subsequently, we explore different\napproaches to fuse Re-ID embedding and pedestrian attributes, including\nattention mechanisms, which we hope will stimulate the development of\nattribute-assisted MOT. The proposed method AAM demonstrates its effectiveness\nand generality on several representative pedestrian multi-object tracking\nbenchmarks, including MOT17 and MOT20, through experiments on the AttMOT\ndataset. When applied to state-of-the-art trackers, AAM achieves consistent\nimprovements in MOTA, HOTA, AssA, IDs, and IDF1 scores. For instance, on MOT17,\nthe proposed method yields a +1.1 MOTA, +1.7 HOTA, and +1.8 IDF1 improvement\nwhen used with FairMOT. To encourage further research on attribute-assisted\nMOT, we will release the AttMOT dataset.\n","authors":["Yunhao Li","Zhen Xiao","Lin Yang","Dan Meng","Xin Zhou","Heng Fan","Libo Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07535v1","updated":"2023-08-15T02:35:59Z","published":"2023-08-15T02:35:59Z","title":"Improved Region Proposal Network for Enhanced Few-Shot Object Detection","summary":" Despite significant success of deep learning in object detection tasks, the\nstandard training of deep neural networks requires access to a substantial\nquantity of annotated images across all classes. Data annotation is an arduous\nand time-consuming endeavor, particularly when dealing with infrequent objects.\nFew-shot object detection (FSOD) methods have emerged as a solution to the\nlimitations of classic object detection approaches based on deep learning. FSOD\nmethods demonstrate remarkable performance by achieving robust object detection\nusing a significantly smaller amount of training data. 
A challenge for FSOD is\nthat instances from novel classes that do not belong to the fixed set of\ntraining classes appear in the background and the base model may pick them up\nas potential objects. These objects behave similarly to label noise because\nthey are classified as one of the training dataset classes, leading to FSOD\nperformance degradation. We develop a semi-supervised algorithm to detect and\nthen utilize these unlabeled novel objects as positive samples during the FSOD\ntraining stage to improve FSOD performance. Specifically, we develop a\nhierarchical ternary classification region proposal network (HTRPN) to localize\nthe potential unlabeled novel objects and assign them new objectness labels to\ndistinguish these objects from the base training dataset classes. Our improved\nhierarchical sampling strategy for the region proposal network (RPN) also\nboosts the perception ability of the object detection model for large objects.\nWe test our approach and COCO and PASCAL VOC baselines that are commonly used\nin FSOD literature. Our experimental results indicate that our method is\neffective and outperforms the existing state-of-the-art (SOTA) FSOD methods.\nOur implementation is provided as a supplement to support reproducibility of\nthe results.\n","authors":["Zeyu Shangguan","Mohammad Rostami"],"pdf_url":"https://arxiv.org/pdf/2308.07535v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.10422"},{"id":"http://arxiv.org/abs/2308.06452v2","updated":"2023-08-15T02:31:51Z","published":"2023-08-12T03:13:38Z","title":"Improved YOLOv8 Detection Algorithm in Security Inspection Image","summary":" Security inspection is the first line of defense to ensure the safety of\npeople's lives and property, and intelligent security inspection is an\ninevitable trend in the future development of the security inspection industry.\nAiming at the problems of overlapping detection objects, false detection of\ncontraband, and missed detection in the process of X-ray image detection, an\nimproved X-ray contraband detection algorithm CSS-YOLO based on YOLOv8s is\nproposed.\n","authors":["Liyao Lu"],"pdf_url":"https://arxiv.org/pdf/2308.06452v2.pdf","comment":"23 pages,23 figures"},{"id":"http://arxiv.org/abs/2307.07893v2","updated":"2023-08-15T02:21:20Z","published":"2023-07-15T22:13:36Z","title":"Anomaly Detection in Automated Fibre Placement: Learning with Data\n Limitations","summary":" Conventional defect detection systems in Automated Fibre Placement (AFP)\ntypically rely on end-to-end supervised learning, necessitating a substantial\nnumber of labelled defective samples for effective training. However, the\nscarcity of such labelled data poses a challenge. To overcome this limitation,\nwe present a comprehensive framework for defect detection and localization in\nAutomated Fibre Placement. Our approach combines unsupervised deep learning and\nclassical computer vision algorithms, eliminating the need for labelled data or\nmanufacturing defect samples. It efficiently detects various surface issues\nwhile requiring fewer images of composite parts for training. Our framework\nemploys an innovative sample extraction method leveraging AFP's inherent\nsymmetry to expand the dataset. By inputting a depth map of the fibre layup\nsurface, we extract local samples aligned with each composite strip (tow).\nThese samples are processed through an autoencoder, trained on normal samples\nfor precise reconstructions, highlighting anomalies through reconstruction\nerrors. 
Aggregated values form an anomaly map for insightful visualization. The\nframework employs blob detection on this map to locate manufacturing defects.\nThe experimental findings reveal that despite training the autoencoder with a\nlimited number of images, our proposed method exhibits satisfactory detection\naccuracy and accurately identifies defect locations. Our framework demonstrates\ncomparable performance to existing methods, while also offering the advantage\nof detecting all types of anomalies without relying on an extensive labelled\ndataset of defects.\n","authors":["Assef Ghamisi","Todd Charter","Li Ji","Maxime Rivard","Gil Lund","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2307.07893v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07528v1","updated":"2023-08-15T01:54:59Z","published":"2023-08-15T01:54:59Z","title":"Confidence Contours: Uncertainty-Aware Annotation for Medical Semantic\n Segmentation","summary":" Medical image segmentation modeling is a high-stakes task where understanding\nof uncertainty is crucial for addressing visual ambiguity. Prior work has\ndeveloped segmentation models utilizing probabilistic or generative mechanisms\nto infer uncertainty from labels where annotators draw a singular boundary.\nHowever, as these annotations cannot represent an individual annotator's\nuncertainty, models trained on them produce uncertainty maps that are difficult\nto interpret. We propose a novel segmentation representation, Confidence\nContours, which uses high- and low-confidence ``contours'' to capture\nuncertainty directly, and develop a novel annotation system for collecting\ncontours. We conduct an evaluation on the Lung Image Dataset Consortium (LIDC)\nand a synthetic dataset. From an annotation study with 30 participants, results\nshow that Confidence Contours provide high representative capacity without\nconsiderably higher annotator effort. We also find that general-purpose\nsegmentation models can learn Confidence Contours at the same performance level\nas standard singular annotations. Finally, from interviews with 5 medical\nexperts, we find that Confidence Contour maps are more interpretable than\nBayesian maps due to representation of structural uncertainty.\n","authors":["Andre Ye","Quan Ze Chen","Amy Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07528v1.pdf","comment":"10 pages content, 12 pages total. Accepted to HCOMP '23"},{"id":"http://arxiv.org/abs/2301.09637v2","updated":"2023-08-15T01:05:21Z","published":"2023-01-23T18:59:59Z","title":"InfiniCity: Infinite-Scale City Synthesis","summary":" Toward infinite-scale 3D city synthesis, we propose a novel framework,\nInfiniCity, which constructs and renders an unconstrainedly large and\n3D-grounded environment from random noises. InfiniCity decomposes the seemingly\nimpractical task into three feasible modules, taking advantage of both 2D and\n3D data. First, an infinite-pixel image synthesis module generates\narbitrary-scale 2D maps from the bird's-eye view. Next, an octree-based voxel\ncompletion module lifts the generated 2D map to 3D octrees. Finally, a\nvoxel-based neural rendering module texturizes the voxels and renders 2D\nimages. InfiniCity can thus synthesize arbitrary-scale and traversable 3D city\nenvironments, and allow flexible and interactive editing from users. We\nquantitatively and qualitatively demonstrate the efficacy of the proposed\nframework. 
Project page: https://hubert0527.github.io/infinicity/\n","authors":["Chieh Hubert Lin","Hsin-Ying Lee","Willi Menapace","Menglei Chai","Aliaksandr Siarohin","Ming-Hsuan Yang","Sergey Tulyakov"],"pdf_url":"https://arxiv.org/pdf/2301.09637v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07509v1","updated":"2023-08-15T00:27:18Z","published":"2023-08-15T00:27:18Z","title":"Boosting Semi-Supervised Learning by bridging high and low-confidence\n predictions","summary":" Pseudo-labeling is a crucial technique in semi-supervised learning (SSL),\nwhere artificial labels are generated for unlabeled data by a trained model,\nallowing for the simultaneous training of labeled and unlabeled data in a\nsupervised setting. However, several studies have identified three main issues\nwith pseudo-labeling-based approaches. Firstly, these methods heavily rely on\npredictions from the trained model, which may not always be accurate, leading\nto a confirmation bias problem. Secondly, the trained model may be overfitted\nto easy-to-learn examples, ignoring hard-to-learn ones, resulting in the\n\\textit{\"Matthew effect\"} where the already strong become stronger and the weak\nweaker. Thirdly, most of the low-confidence predictions of unlabeled data are\ndiscarded due to the use of a high threshold, leading to an underutilization of\nunlabeled data during training. To address these issues, we propose a new\nmethod called ReFixMatch, which aims to utilize all of the unlabeled data\nduring training, thus improving the generalizability of the model and\nperformance on SSL benchmarks. Notably, ReFixMatch achieves 41.05\\% top-1\naccuracy with 100k labeled examples on ImageNet, outperforming the baseline\nFixMatch and current state-of-the-art methods.\n","authors":["Khanh-Binh Nguyen","Joon-Sung Yang"],"pdf_url":"https://arxiv.org/pdf/2308.07509v1.pdf","comment":"Accepted to ICCVW2023 (Workshop on representation learning with very\n limited images: the potential of self-, synthetic- and formula-supervision)"},{"id":"http://arxiv.org/abs/2308.07506v1","updated":"2023-08-15T00:09:33Z","published":"2023-08-15T00:09:33Z","title":"Benchmarking Scalable Epistemic Uncertainty Quantification in Organ\n Segmentation","summary":" Deep learning based methods for automatic organ segmentation have shown\npromise in aiding diagnosis and treatment planning. However, quantifying and\nunderstanding the uncertainty associated with model predictions is crucial in\ncritical clinical applications. While many techniques have been proposed for\nepistemic or model-based uncertainty estimation, it is unclear which method is\npreferred in the medical image analysis setting. This paper presents a\ncomprehensive benchmarking study that evaluates epistemic uncertainty\nquantification methods in organ segmentation in terms of accuracy, uncertainty\ncalibration, and scalability. We provide a comprehensive discussion of the\nstrengths, weaknesses, and out-of-distribution detection capabilities of each\nmethod as well as recommendations for future improvements. These findings\ncontribute to the development of reliable and robust models that yield accurate\nsegmentations while effectively quantifying epistemic uncertainty.\n","authors":["Jadie Adams","Shireen Y. 
Elhabian"],"pdf_url":"https://arxiv.org/pdf/2308.07506v1.pdf","comment":"Accepted to the UNSURE Workshop held in conjunction with MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.07504v1","updated":"2023-08-15T00:02:10Z","published":"2023-08-15T00:02:10Z","title":"ICAFusion: Iterative Cross-Attention Guided Feature Fusion for\n Multispectral Object Detection","summary":" Effective feature fusion of multispectral images plays a crucial role in\nmulti-spectral object detection. Previous studies have demonstrated the\neffectiveness of feature fusion using convolutional neural networks, but these\nmethods are sensitive to image misalignment due to the inherent deffciency in\nlocal-range feature interaction resulting in the performance degradation. To\naddress this issue, a novel feature fusion framework of dual cross-attention\ntransformers is proposed to model global feature interaction and capture\ncomplementary information across modalities simultaneously. This framework\nenhances the discriminability of object features through the query-guided\ncross-attention mechanism, leading to improved performance. However, stacking\nmultiple transformer blocks for feature enhancement incurs a large number of\nparameters and high spatial complexity. To handle this, inspired by the human\nprocess of reviewing knowledge, an iterative interaction mechanism is proposed\nto share parameters among block-wise multimodal transformers, reducing model\ncomplexity and computation cost. The proposed method is general and effective\nto be integrated into different detection frameworks and used with different\nbackbones. Experimental results on KAIST, FLIR, and VEDAI datasets show that\nthe proposed method achieves superior performance and faster inference, making\nit suitable for various practical scenarios. Code will be available at\nhttps://github.com/chanchanchan97/ICAFusion.\n","authors":["Jifeng Shen","Yifei Chen","Yue Liu","Xin Zuo","Heng Fan","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2308.07504v1.pdf","comment":"submitted to Pattern Recognition Journal, minor revision"},{"id":"http://arxiv.org/abs/2211.02982v2","updated":"2023-08-15T21:19:13Z","published":"2022-11-05T22:06:50Z","title":"Event and Entity Extraction from Generated Video Captions","summary":" Annotation of multimedia data by humans is time-consuming and costly, while\nreliable automatic generation of semantic metadata is a major challenge. We\npropose a framework to extract semantic metadata from automatically generated\nvideo captions. As metadata, we consider entities, the entities' properties,\nrelations between entities, and the video category. We employ two\nstate-of-the-art dense video captioning models with masked transformer (MT) and\nparallel decoding (PVDC) to generate captions for videos of the ActivityNet\nCaptions dataset. Our experiments show that it is possible to extract entities,\ntheir properties, relations between entities, and the video category from the\ngenerated captions. 
We observe that the quality of the extracted information is\nmainly influenced by the quality of the event localization in the video as well\nas the performance of the event caption generation.\n","authors":["Johannes Scherer","Ansgar Scherp","Deepayan Bhowmik"],"pdf_url":"https://arxiv.org/pdf/2211.02982v2.pdf","comment":"Paper accepted at CD-MAKE 2023"},{"id":"http://arxiv.org/abs/2308.08038v1","updated":"2023-08-15T20:58:42Z","published":"2023-08-15T20:58:42Z","title":"Deep Learning Framework for Spleen Volume Estimation from 2D\n Cross-sectional Views","summary":" Abnormal spleen enlargement (splenomegaly) is regarded as a clinical\nindicator for a range of conditions, including liver disease, cancer and blood\ndiseases. While spleen length measured from ultrasound images is a commonly\nused surrogate for spleen size, spleen volume remains the gold standard metric\nfor assessing splenomegaly and the severity of related clinical conditions.\nComputed tomography is the main imaging modality for measuring spleen volume,\nbut it is less accessible in areas where there is a high prevalence of\nsplenomegaly (e.g., the Global South). Our objective was to enable automated\nspleen volume measurement from 2D cross-sectional segmentations, which can be\nobtained from ultrasound imaging. In this study, we describe a variational\nautoencoder-based framework to measure spleen volume from single- or dual-view\n2D spleen segmentations. We propose and evaluate three volume estimation\nmethods within this framework. We also demonstrate how 95\\% confidence\nintervals of volume estimates can be produced to make our method more\nclinically useful. Our best model achieved mean relative volume accuracies of\n86.62\\% and 92.58\\% for single- and dual-view segmentations, respectively,\nsurpassing the performance of the clinical standard approach of linear\nregression using manual measurements and a comparative deep learning-based\n2D-3D reconstruction-based approach. The proposed spleen volume estimation\nframework can be integrated into standard clinical workflows which currently\nuse 2D ultrasound images to measure spleen length. To the best of our\nknowledge, this is the first work to achieve direct 3D spleen volume estimation\nfrom 2D spleen segmentations.\n","authors":["Zhen Yuan","Esther Puyol-Anton","Haran Jogeesvaran","Baba Inusa","Andrew P. King"],"pdf_url":"https://arxiv.org/pdf/2308.08038v1.pdf","comment":"22 pages, 7 figures"},{"id":"http://arxiv.org/abs/2304.01973v2","updated":"2023-08-15T20:47:51Z","published":"2023-04-04T17:31:15Z","title":"ERM++: An Improved Baseline for Domain Generalization","summary":" Multi-source Domain Generalization (DG) measures a classifier's ability to\ngeneralize to new distributions of data it was not trained on, given several\ntraining domains. While several multi-source DG methods have been proposed,\nthey incur additional complexity during training by using domain labels. Recent\nwork has shown that a well-tuned Empirical Risk Minimization (ERM) training\nprocedure, that is simply minimizing the empirical risk on the source domains,\ncan outperform most existing DG methods. We identify several key candidate\ntechniques to further improve ERM performance, such as better utilization of\ntraining data, model parameter selection, and weight-space regularization. 
We\ncall the resulting method ERM++, and show it significantly improves the\nperformance of DG on five multi-source datasets by over 5% compared to standard\nERM, and beats state-of-the-art despite being less computationally expensive.\nAdditionally, we demonstrate the efficacy of ERM++ on the WILDS-FMOW dataset, a\nchallenging DG benchmark. We hope that ERM++ becomes a strong baseline for\nfuture DG research. Code is released at\nhttps://github.com/piotr-teterwak/erm_plusplus.\n","authors":["Piotr Teterwak","Kuniaki Saito","Theodoros Tsiligkaridis","Kate Saenko","Bryan A. Plummer"],"pdf_url":"https://arxiv.org/pdf/2304.01973v2.pdf","comment":"An improved baseline for Domain Generalization"},{"id":"http://arxiv.org/abs/2308.08011v1","updated":"2023-08-15T19:50:38Z","published":"2023-08-15T19:50:38Z","title":"Shortcut-V2V: Compression Framework for Video-to-Video Translation based\n on Temporal Redundancy Reduction","summary":" Video-to-video translation aims to generate video frames of a target domain\nfrom an input video. Despite its usefulness, the existing networks require\nenormous computations, necessitating their model compression for wide use.\nWhile there exist compression methods that improve computational efficiency in\nvarious image/video tasks, a generally-applicable compression method for\nvideo-to-video translation has not been studied much. In response, we present\nShortcut-V2V, a general-purpose compression framework for video-to-video\ntranslation. Shortcut-V2V avoids full inference for every neighboring video\nframe by approximating the intermediate features of a current frame from those\nof the previous frame. Moreover, in our framework, a newly-proposed block\ncalled AdaBD adaptively blends and deforms features of neighboring frames,\nwhich makes more accurate predictions of the intermediate features possible. We\nconduct quantitative and qualitative evaluations using well-known\nvideo-to-video translation models on various tasks to demonstrate the general\napplicability of our framework. The results show that Shortcut-V2V achieves\ncomparable performance compared to the original video-to-video translation\nmodel while saving 3.2-5.7x computational cost and 7.8-44x memory at test time.\n","authors":["Chaeyeon Chung","Yeojeong Park","Seunghwan Choi","Munkhsoyol Ganbat","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2308.08011v1.pdf","comment":"to be updated"},{"id":"http://arxiv.org/abs/2308.07298v2","updated":"2023-08-15T19:34:12Z","published":"2023-08-14T17:36:39Z","title":"Accurate Eye Tracking from Dense 3D Surface Reconstructions using\n Single-Shot Deflectometry","summary":" Eye-tracking plays a crucial role in the development of virtual reality\ndevices, neuroscience research, and psychology. Despite its significance in\nnumerous applications, achieving an accurate, robust, and fast eye-tracking\nsolution remains a considerable challenge for current state-of-the-art methods.\nWhile existing reflection-based techniques (e.g., \"glint tracking\") are\nconsidered the most accurate, their performance is limited by their reliance on\nsparse 3D surface data acquired solely from the cornea surface. 
In this paper,\nwe rethink the way how specular reflections can be used for eye tracking: We\npropose a novel method for accurate and fast evaluation of the gaze direction\nthat exploits teachings from single-shot phase-measuring-deflectometry (PMD).\nIn contrast to state-of-the-art reflection-based methods, our method acquires\ndense 3D surface information of both cornea and sclera within only one single\ncamera frame (single-shot). Improvements in acquired reflection surface\npoints(\"glints\") of factors $>3300 \\times$ are easily achievable. We show the\nfeasibility of our approach with experimentally evaluated gaze errors of only\n$\\leq 0.25^\\circ$ demonstrating a significant improvement over the current\nstate-of-the-art.\n","authors":["Jiazhang Wang","Tianfu Wang","Bingjie Xu","Oliver Cossairt","Florian Willomitzer"],"pdf_url":"https://arxiv.org/pdf/2308.07298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07997v1","updated":"2023-08-15T19:01:19Z","published":"2023-08-15T19:01:19Z","title":"$A^2$Nav: Action-Aware Zero-Shot Robot Navigation by Exploiting\n Vision-and-Language Ability of Foundation Models","summary":" We study the task of zero-shot vision-and-language navigation (ZS-VLN), a\npractical yet challenging problem in which an agent learns to navigate\nfollowing a path described by language instructions without requiring any\npath-instruction annotation data. Normally, the instructions have complex\ngrammatical structures and often contain various action descriptions (e.g.,\n\"proceed beyond\", \"depart from\"). How to correctly understand and execute these\naction demands is a critical problem, and the absence of annotated data makes\nit even more challenging. Note that a well-educated human being can easily\nunderstand path instructions without the need for any special training. In this\npaper, we propose an action-aware zero-shot VLN method ($A^2$Nav) by exploiting\nthe vision-and-language ability of foundation models. Specifically, the\nproposed method consists of an instruction parser and an action-aware\nnavigation policy. The instruction parser utilizes the advanced reasoning\nability of large language models (e.g., GPT-3) to decompose complex navigation\ninstructions into a sequence of action-specific object navigation sub-tasks.\nEach sub-task requires the agent to localize the object and navigate to a\nspecific goal position according to the associated action demand. To accomplish\nthese sub-tasks, an action-aware navigation policy is learned from freely\ncollected action-specific datasets that reveal distinct characteristics of each\naction demand. We use the learned navigation policy for executing sub-tasks\nsequentially to follow the navigation instruction. Extensive experiments show\n$A^2$Nav achieves promising ZS-VLN performance and even surpasses the\nsupervised learning methods on R2R-Habitat and RxR-Habitat datasets.\n","authors":["Peihao Chen","Xinyu Sun","Hongyan Zhi","Runhao Zeng","Thomas H. Li","Gaowen Liu","Mingkui Tan","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2308.07997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07977v1","updated":"2023-08-15T18:27:03Z","published":"2023-08-15T18:27:03Z","title":"YODA: You Only Diffuse Areas. An Area-Masked Diffusion Approach For\n Image Super-Resolution","summary":" This work introduces \"You Only Diffuse Areas\" (YODA), a novel method for\npartial diffusion in Single-Image Super-Resolution (SISR). 
The core idea is to\nutilize diffusion selectively on spatial regions based on attention maps\nderived from the low-resolution image and the current time step in the\ndiffusion process. This time-dependent targeting enables a more effective\nconversion to high-resolution outputs by focusing on areas that benefit the\nmost from the iterative refinement process, i.e., detail-rich objects. We\nempirically validate YODA by extending leading diffusion-based SISR methods SR3\nand SRDiff. Our experiments demonstrate new state-of-the-art performance gains\nin face and general SR across PSNR, SSIM, and LPIPS metrics. A notable finding\nis YODA's stabilization effect on training by reducing color shifts, especially\nwhen induced by small batch sizes, potentially contributing to\nresource-constrained scenarios. The proposed spatial and temporal adaptive\ndiffusion mechanism opens promising research directions, including developing\nenhanced attention map extraction techniques and optimizing inference latency\nbased on sparser diffusion.\n","authors":["Brian B. Moser","Stanislav Frolov","Federico Raue","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2308.07977v1.pdf","comment":"Brian B. Moser and Stanislav Frolov contributed equally"},{"id":"http://arxiv.org/abs/2308.07967v1","updated":"2023-08-15T18:05:19Z","published":"2023-08-15T18:05:19Z","title":"Boosting Cross-Quality Face Verification using Blind Face Restoration","summary":" In recent years, various Blind Face Restoration (BFR) techniques were\ndeveloped. These techniques transform low quality faces suffering from multiple\ndegradations to more realistic and natural face images with high perceptual\nquality. However, it is crucial for the task of face verification to not only\nenhance the perceptual quality of the low quality images but also to improve\nthe biometric-utility face quality metrics. Furthermore, preserving the\nvaluable identity information is of great importance. In this paper, we\ninvestigate the impact of applying three state-of-the-art blind face\nrestoration techniques namely, GFP-GAN, GPEN and SGPN on the performance of\nface verification system under very challenging environment characterized by\nvery low quality images. Extensive experimental results on the recently\nproposed cross-quality LFW database using three state-of-the-art deep face\nrecognition models demonstrate the effectiveness of GFP-GAN in boosting\nsignificantly the face verification accuracy.\n","authors":["Messaoud Bengherabi","Douaa Laib","Fella Souhila Lasnami","Ryma Boussaha"],"pdf_url":"https://arxiv.org/pdf/2308.07967v1.pdf","comment":"paper accepted at BIOSIG 2023 conference"},{"id":"http://arxiv.org/abs/2303.10310v2","updated":"2023-08-15T18:03:07Z","published":"2023-03-18T02:42:18Z","title":"Pseudo Supervised Metrics: Evaluating Unsupervised Image to Image\n Translation Models In Unsupervised Cross-Domain Classification Frameworks","summary":" The ability to classify images accurately and efficiently is dependent on\nhaving access to large labeled datasets and testing on data from the same\ndomain that the model is trained on. Classification becomes more challenging\nwhen dealing with new data from a different domain, where collecting a large\nlabeled dataset and training a new classifier from scratch is time-consuming,\nexpensive, and sometimes infeasible or impossible. 
Cross-domain classification\nframeworks were developed to handle this data domain shift problem by utilizing\nunsupervised image-to-image (UI2I) translation models to translate an input\nimage from the unlabeled domain to the labeled domain. The problem with these\nunsupervised models lies in their unsupervised nature. For lack of annotations,\nit is not possible to use the traditional supervised metrics to evaluate these\ntranslation models to pick the best-saved checkpoint model. In this paper, we\nintroduce a new method called Pseudo Supervised Metrics that was designed\nspecifically to support cross-domain classification applications contrary to\nother typically used metrics such as the FID which was designed to evaluate the\nmodel in terms of the quality of the generated image from a human-eye\nperspective. We show that our metric not only outperforms unsupervised metrics\nsuch as the FID, but is also highly correlated with the true supervised\nmetrics, robust, and explainable. Furthermore, we demonstrate that it can be\nused as a standard metric for future research in this field by applying it to a\ncritical real-world problem (the boiling crisis problem).\n","authors":["Firas Al-Hindawi","Md Mahfuzur Rahman Siddiquee","Teresa Wu","Han Hu","Ying Sun"],"pdf_url":"https://arxiv.org/pdf/2303.10310v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2212.09107"},{"id":"http://arxiv.org/abs/2308.07948v1","updated":"2023-08-15T16:16:29Z","published":"2023-08-15T16:16:29Z","title":"Leveraging Symmetries in Pick and Place","summary":" Robotic pick and place tasks are symmetric under translations and rotations\nof both the object to be picked and the desired place pose. For example, if the\npick object is rotated or translated, then the optimal pick action should also\nrotate or translate. The same is true for the place pose; if the desired place\npose changes, then the place action should also transform accordingly. A\nrecently proposed pick and place framework known as Transporter Net captures\nsome of these symmetries, but not all. This paper analytically studies the\nsymmetries present in planar robotic pick and place and proposes a method of\nincorporating equivariant neural models into Transporter Net in a way that\ncaptures all symmetries. The new model, which we call Equivariant Transporter\nNet, is equivariant to both pick and place symmetries and can immediately\ngeneralize pick and place knowledge to different pick and place poses. We\nevaluate the new model empirically and show that it is much more sample\nefficient than the non-symmetric version, resulting in a system that can\nimitate demonstrated pick and place behavior using very few human\ndemonstrations on a variety of imitation learning tasks.\n","authors":["Haojie Huang","Dian Wang","Arsh Tangri","Robin Walters","Robert Platt"],"pdf_url":"https://arxiv.org/pdf/2308.07948v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2202.09400"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.07924v1","updated":"2023-08-15T17:59:46Z","published":"2023-08-15T17:59:46Z","title":"Investigation Toward The Economic Feasibility of Personalized Medicine\n For Healthcare Service Providers: The Case of Bladder Cancer","summary":" In today's complex healthcare landscape, the pursuit of delivering optimal\npatient care while navigating intricate economic dynamics poses a significant\nchallenge for healthcare service providers (HSPs). 
In this already complex\ndynamics, the emergence of clinically promising personalized medicine based\ntreatment aims to revolutionize medicine. While personalized medicine holds\ntremendous potential for enhancing therapeutic outcomes, its integration within\nresource-constrained HSPs presents formidable challenges. In this study, we\ninvestigate the economic feasibility of implementing personalized medicine. The\ncentral objective is to strike a balance between catering to individual patient\nneeds and making economically viable decisions. Unlike conventional binary\napproaches to personalized treatment, we propose a more nuanced perspective by\ntreating personalization as a spectrum. This approach allows for greater\nflexibility in decision-making and resource allocation. To this end, we propose\na mathematical framework to investigate our proposal, focusing on Bladder\nCancer (BC) as a case study. Our results show that while it is feasible to\nintroduce personalized medicine, a highly efficient but highly expensive one\nwould be short-lived relative to its less effective but cheaper alternative as\nthe latter can be provided to a larger cohort of patients, optimizing the HSP's\nobjective better.\n","authors":["Elizaveta Savchenko","Svetlana Bunimovich-Mendrazitsky"],"pdf_url":"https://arxiv.org/pdf/2308.07924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13050v2","updated":"2023-08-15T17:39:46Z","published":"2023-06-22T17:17:45Z","title":"Data augmentation and refinement for recommender system: A\n semi-supervised approach using maximum margin matrix factorization","summary":" Collaborative filtering (CF) has become a popular method for developing\nrecommender systems (RSs) where ratings of a user for new items are predicted\nbased on her past preferences and available preference information of other\nusers. Despite the popularity of CF-based methods, their performance is often\ngreatly limited by the sparsity of observed entries. In this study, we explore\nthe data augmentation and refinement aspects of Maximum Margin Matrix\nFactorization (MMMF), a widely accepted CF technique for rating predictions,\nwhich has not been investigated before. We exploit the inherent characteristics\nof CF algorithms to assess the confidence level of individual ratings and\npropose a semi-supervised approach for rating augmentation based on\nself-training. We hypothesize that any CF algorithm's predictions with low\nconfidence are due to some deficiency in the training data and hence, the\nperformance of the algorithm can be improved by adopting a systematic data\naugmentation strategy. We iteratively use some of the ratings predicted with\nhigh confidence to augment the training data and remove low-confidence entries\nthrough a refinement process. By repeating this process, the system learns to\nimprove prediction accuracy. Our method is experimentally evaluated on several\nstate-of-the-art CF algorithms and leads to informative rating augmentation,\nimproving the performance of the baseline approaches.\n","authors":["Shamal Shaikh","Venkateswara Rao Kagita","Vikas Kumar","Arun K Pujari"],"pdf_url":"https://arxiv.org/pdf/2306.13050v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.07876v1","updated":"2023-08-15T16:41:53Z","published":"2023-08-15T16:41:53Z","title":"Synthesizing Political Zero-Shot Relation Classification via Codebook\n Knowledge, NLI, and ChatGPT","summary":" Recent supervised models for event coding vastly outperform pattern-matching\nmethods. 
However, their reliance solely on new annotations disregards the vast\nknowledge within expert databases, hindering their applicability to\nfine-grained classification. To address these limitations, we explore zero-shot\napproaches for political event ontology relation classification, by leveraging\nknowledge from established annotation codebooks. Our study encompasses both\nChatGPT and a novel natural language inference (NLI) based approach named ZSP.\nZSP adopts a tree-query framework that deconstructs the task into context,\nmodality, and class disambiguation levels. This framework improves\ninterpretability, efficiency, and adaptability to schema changes. By conducting\nextensive experiments on our newly curated datasets, we pinpoint the\ninstability issues within ChatGPT and highlight the superior performance of\nZSP. ZSP achieves an impressive 40% improvement in F1 score for fine-grained\nRootcode classification. ZSP demonstrates competitive performance compared to\nsupervised BERT models, positioning it as a valuable tool for event record\nvalidation and ontology development. Our work underscores the potential of\nleveraging transfer learning and existing expertise to enhance the efficiency\nand scalability of research in the field.\n","authors":["Yibo Hu","Erick Skorupa Parolin","Latifur Khan","Patrick T. Brandt","Javier Osorio","Vito J. D'Orazio"],"pdf_url":"https://arxiv.org/pdf/2308.07876v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.07857v1","updated":"2023-08-15T16:16:02Z","published":"2023-08-15T16:16:02Z","title":"Impression-Aware Recommender Systems","summary":" Novel data sources bring new opportunities to improve the quality of\nrecommender systems. Impressions are a novel data source containing past\nrecommendations (shown items) and traditional interactions. Researchers may use\nimpressions to refine user preferences and overcome the current limitations in\nrecommender systems research. The relevance and interest of impressions have\nincreased over the years; hence, the need for a review of relevant work on this\ntype of recommenders. We present a systematic literature review on recommender\nsystems using impressions, focusing on three fundamental angles in research:\nrecommenders, datasets, and evaluation methodologies. We provide three\ncategorizations of papers describing recommenders using impressions, present\neach reviewed paper in detail, describe datasets with impressions, and analyze\nthe existing evaluation methodologies. Lastly, we present open questions and\nfuture directions of interest, highlighting aspects missing in the literature\nthat can be addressed in future works.\n","authors":["Fernando B. Pérez Maurera","Maurizio Ferrari Dacrema","Pablo Castells","Paolo Cremonesi"],"pdf_url":"https://arxiv.org/pdf/2308.07857v1.pdf","comment":"34 pages, 103 references, 6 tables, 2 figures, ACM UNDER REVIEW"},{"id":"http://arxiv.org/abs/2308.07760v1","updated":"2023-08-15T13:27:18Z","published":"2023-08-15T13:27:18Z","title":"Dynamic Embedding Size Search with Minimum Regret for Streaming\n Recommender System","summary":" With the continuous increase of users and items, conventional recommender\nsystems trained on static datasets can hardly adapt to changing environments.\nThe high-throughput data requires the model to be updated in a timely manner\nfor capturing the user interest dynamics, which leads to the emergence of\nstreaming recommender systems. 
Due to the prevalence of deep learning-based\nrecommender systems, the embedding layer is widely adopted to represent the\ncharacteristics of users, items, and other features in low-dimensional vectors.\nHowever, it has been proved that setting an identical and static embedding size\nis sub-optimal in terms of recommendation performance and memory cost,\nespecially for streaming recommendations. To tackle this problem, we first\nrethink the streaming model update process and model the dynamic embedding size\nsearch as a bandit problem. Then, we analyze and quantify the factors that\ninfluence the optimal embedding sizes from the statistics perspective. Based on\nthis, we propose the \\textbf{D}ynamic \\textbf{E}mbedding \\textbf{S}ize\n\\textbf{S}earch (\\textbf{DESS}) method to minimize the embedding size selection\nregret on both user and item sides in a non-stationary manner. Theoretically,\nwe obtain a sublinear regret upper bound superior to previous methods.\nEmpirical results across two recommendation tasks on four public datasets also\ndemonstrate that our approach can achieve better streaming recommendation\nperformance with lower memory cost and higher time efficiency.\n","authors":["Bowei He","Xu He","Renrui Zhang","Yingxue Zhang","Ruiming Tang","Chen Ma"],"pdf_url":"https://arxiv.org/pdf/2308.07760v1.pdf","comment":"Accepted for publication on CIKM2023"},{"id":"http://arxiv.org/abs/2308.07752v1","updated":"2023-08-15T13:12:19Z","published":"2023-08-15T13:12:19Z","title":"Self-Supervised Dynamic Hypergraph Recommendation based on\n Hyper-Relational Knowledge Graph","summary":" Knowledge graphs (KGs) are commonly used as side information to enhance\ncollaborative signals and improve recommendation quality. In the context of\nknowledge-aware recommendation (KGR), graph neural networks (GNNs) have emerged\nas promising solutions for modeling factual and semantic information in KGs.\nHowever, the long-tail distribution of entities leads to sparsity in\nsupervision signals, which weakens the quality of item representation when\nutilizing KG enhancement. Additionally, the binary relation representation of\nKGs simplifies hyper-relational facts, making it challenging to model complex\nreal-world information. Furthermore, the over-smoothing phenomenon results in\nindistinguishable representations and information loss. To address these\nchallenges, we propose the SDK (Self-Supervised Dynamic Hypergraph\nRecommendation based on Hyper-Relational Knowledge Graph) framework. This\nframework establishes a cross-view hypergraph self-supervised learning\nmechanism for KG enhancement. Specifically, we model hyper-relational facts in\nKGs to capture interdependencies between entities under complete semantic\nconditions. With the refined representation, a hypergraph is dynamically\nconstructed to preserve features in the deep vector space, thereby alleviating\nthe over-smoothing problem. Furthermore, we mine external supervision signals\nfrom both the global perspective of the hypergraph and the local perspective of\ncollaborative filtering (CF) to guide the model prediction process. Extensive\nexperiments conducted on different datasets demonstrate the superiority of the\nSDK framework over state-of-the-art models. 
The results showcase its ability to\nalleviate the effects of over-smoothing and supervision signal sparsity.\n","authors":["Yi Liu","Hongrui Xuan","Bohan Li","Meng Wang","Tong Chen","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2308.07752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07107v2","updated":"2023-08-15T12:09:20Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. Additionally, we explore promising directions within\nthis expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07711v1","updated":"2023-08-15T11:45:34Z","published":"2023-08-15T11:45:34Z","title":"SPM: Structured Pretraining and Matching Architectures for Relevance\n Modeling in Meituan Search","summary":" In e-commerce search, relevance between query and documents is an essential\nrequirement for satisfying user experience. Different from traditional\ne-commerce platforms that offer products, users search on life service\nplatforms such as Meituan mainly for product providers, which usually have\nabundant structured information, e.g. name, address, category, thousands of\nproducts. Modeling search relevance with these rich structured contents is\nchallenging due to the following issues: (1) there is language distribution\ndiscrepancy among different fields of structured document, making it difficult\nto directly adopt off-the-shelf pretrained language model based methods like\nBERT. (2) different fields usually have different importance and their length\nvary greatly, making it difficult to extract document information helpful for\nrelevance matching.\n To tackle these issues, in this paper we propose a novel two-stage\npretraining and matching architecture for relevance matching with rich\nstructured documents. 
At pretraining stage, we propose an effective pretraining\nmethod that employs both query and multiple fields of document as inputs,\nincluding an effective information compression method for lengthy fields. At\nrelevance matching stage, a novel matching method is proposed by leveraging\ndomain knowledge in search query to generate more effective document\nrepresentations for relevance scoring. Extensive offline experiments and online\nA/B tests on millions of users verify that the proposed architectures\neffectively improve the performance of relevance modeling. The model has\nalready been deployed online, serving the search traffic of Meituan for over a\nyear.\n","authors":["Wen Zan","Yaopeng Han","Xiaotian Jiang","Yao Xiao","Yang Yang","Dayao Chen","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07711v1.pdf","comment":"Accepted by CIKM '23"},{"id":"http://arxiv.org/abs/2308.07629v1","updated":"2023-08-15T08:27:12Z","published":"2023-08-15T08:27:12Z","title":"Learning from All Sides: Diversified Positive Augmentation via\n Self-distillation in Recommendation","summary":" Personalized recommendation relies on user historical behaviors to provide\nuser-interested items, and thus seriously struggles with the data sparsity\nissue. A powerful positive item augmentation is beneficial to address the\nsparsity issue, while few works could jointly consider both the accuracy and\ndiversity of these augmented training labels. In this work, we propose a novel\nmodel-agnostic Diversified self-distillation guided positive augmentation\n(DivSPA) for accurate and diverse positive item augmentations. Specifically,\nDivSPA first conducts three types of retrieval strategies to collect\nhigh-quality and diverse positive item candidates according to users' overall\ninterests, short-term intentions, and similar users. Next, a self-distillation\nmodule is conducted to double-check and rerank these candidates as the final\npositive augmentations. Extensive offline and online evaluations verify the\neffectiveness of our proposed DivSPA on both accuracy and diversity. DivSPA is\nsimple and effective, which could be conveniently adapted to other base models\nand systems. Currently, DivSPA has been deployed on multiple widely-used\nreal-world recommender systems.\n","authors":["Chong Liu","Xiaoyang Liu","Ruobing Xie","Lixin Zhang","Feng Xia","Leyu Lin"],"pdf_url":"https://arxiv.org/pdf/2308.07629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06016v3","updated":"2023-08-15T06:24:55Z","published":"2023-03-09T12:13:46Z","title":"Probe: Learning Users' Personalized Projection Bias in Intertemporal\n Bundle Choices","summary":" Intertemporal choices involve making decisions that require weighing the\ncosts in the present against the benefits in the future. One specific type of\nintertemporal choice is the decision between purchasing an individual item or\nopting for a bundle that includes that item. Previous research assumes that\nindividuals have accurate expectations of the factors involved in these\nchoices. However, in reality, users' perceptions of these factors are often\nbiased, leading to irrational and suboptimal decision-making. In this work, we\nspecifically focus on two commonly observed biases: projection bias and the\nreference-point effect. To address these biases, we propose a novel\nbias-embedded preference model called Probe. 
The Probe incorporates a weight\nfunction to capture users' projection bias and a value function to account for\nthe reference-point effect, and introduce prospect theory from behavioral\neconomics to combine the weight and value functions. This allows us to\ndetermine the probability of users selecting the bundle or a single item. We\nprovide a thorough theoretical analysis to demonstrate the impact of projection\nbias on the design of bundle sales strategies. Through experimental results, we\nshow that the proposed Probe model outperforms existing methods and contributes\nto a better understanding of users' irrational behaviors in bundle purchases.\nThis investigation can facilitate a deeper comprehension of users'\ndecision-making mechanisms, enable the provision of personalized services, and\nassist users in making more rational and optimal decisions.\n","authors":["Qingming Li","H. Vicky Zhao"],"pdf_url":"https://arxiv.org/pdf/2303.06016v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07525v1","updated":"2023-08-15T01:39:10Z","published":"2023-08-15T01:39:10Z","title":"Delphic Costs and Benefits in Web Search: A utilitarian and historical\n analysis","summary":" We present a new framework to conceptualize and operationalize the total user\nexperience of search, by studying the entirety of a search journey from an\nutilitarian point of view.\n Web search engines are widely perceived as \"free\". But search requires time\nand effort: in reality there are many intermingled non-monetary costs (e.g.\ntime costs, cognitive costs, interactivity costs) and the benefits may be\nmarred by various impairments, such as misunderstanding and misinformation.\nThis characterization of costs and benefits appears to be inherent to the human\nsearch for information within the pursuit of some larger task: most of the\ncosts and impairments can be identified in interactions with any web search\nengine, interactions with public libraries, and even in interactions with\nancient oracles. To emphasize this innate connection, we call these costs and\nbenefits Delphic, in contrast to explicitly financial costs and benefits.\n Our main thesis is that the users' satisfaction with a search engine mostly\ndepends on their experience of Delphic cost and benefits, in other words on\ntheir utility. The consumer utility is correlated with classic measures of\nsearch engine quality, such as ranking, precision, recall, etc., but is not\ncompletely determined by them. To argue our thesis, we catalog the Delphic\ncosts and benefits and show how the development of search engines over the last\nquarter century, from classic Information Retrieval roots to the integration of\nLarge Language Models, was driven to a great extent by the quest of decreasing\nDelphic costs and increasing Delphic benefits.\n We hope that the Delphic costs framework will engender new ideas and new\nresearch for evaluating and improving the web experience for everyone.\n","authors":["Andrei Z. Broder","Preston McAfee"],"pdf_url":"https://arxiv.org/pdf/2308.07525v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.08072v1","updated":"2023-08-15T23:56:44Z","published":"2023-08-15T23:56:44Z","title":"Decentralized Graph Neural Network for Privacy-Preserving Recommendation","summary":" Building a graph neural network (GNN)-based recommender system without\nviolating user privacy proves challenging. Existing methods can be divided into\nfederated GNNs and decentralized GNNs. 
But both methods have undesirable\neffects, i.e., low communication efficiency and privacy leakage. This paper\nproposes DGREC, a novel decentralized GNN for privacy-preserving\nrecommendations, where users can choose to publicize their interactions. It\nincludes three stages, i.e., graph construction, local gradient calculation,\nand global gradient passing. The first stage builds a local inner-item\nhypergraph for each user and a global inter-user graph. The second stage models\nuser preference and calculates gradients on each local device. The third stage\ndesigns a local differential privacy mechanism named secure gradient-sharing,\nwhich proves strong privacy-preserving of users' private data. We conduct\nextensive experiments on three public datasets to validate the consistent\nsuperiority of our framework.\n","authors":["Xiaolin Zheng","Zhongyu Wang","Chaochao Chen","Jiashu Qian","Yao Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08487v1","updated":"2023-08-15T05:48:44Z","published":"2023-08-15T05:48:44Z","title":"Temporal Interest Network for Click-Through Rate Prediction","summary":" The history of user behaviors constitutes one of the most significant\ncharacteristics in predicting the click-through rate (CTR), owing to their\nstrong semantic and temporal correlation with the target item. While the\nliterature has individually examined each of these correlations, research has\nyet to analyze them in combination, that is, the quadruple correlation of\n(behavior semantics, target semantics, behavior temporal, and target temporal).\nThe effect of this correlation on performance and the extent to which existing\nmethods learn it remain unknown. To address this gap, we empirically measure\nthe quadruple correlation and observe intuitive yet robust quadruple patterns.\nWe measure the learned correlation of several representative user behavior\nmethods, but to our surprise, none of them learn such a pattern, especially the\ntemporal one.\n In this paper, we propose the Temporal Interest Network (TIN) to capture the\nquadruple semantic and temporal correlation between behaviors and the target.\nWe achieve this by incorporating target-aware temporal encoding, in addition to\nsemantic embedding, to represent behaviors and the target. Furthermore, we\ndeploy target-aware attention, along with target-aware representation, to\nexplicitly conduct the 4-way interaction. We performed comprehensive\nevaluations on the Amazon and Alibaba datasets. Our proposed TIN outperforms\nthe best-performing baselines by 0.43\\% and 0.29\\% on two datasets,\nrespectively. Comprehensive analysis and visualization show that TIN is indeed\ncapable of learning the quadruple correlation effectively, while all existing\nmethods fail to do so. We provide our implementation of TIN in Tensorflow.\n","authors":["Haolin Zhou","Junwei Pan","Xinyi Zhou","Xihua Chen","Jie Jiang","Xiaofeng Gao","Guihai Chen"],"pdf_url":"https://arxiv.org/pdf/2308.08487v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.07922v1","updated":"2023-08-15T17:59:18Z","published":"2023-08-15T17:59:18Z","title":"RAVEN: In-Context Learning with Retrieval Augmented Encoder-Decoder\n Language Models","summary":" In this paper, we investigate the in-context learning ability of\nretrieval-augmented encoder-decoder language models. 
We first conduct a\ncomprehensive analysis of the state-of-the-art ATLAS model and identify its\nlimitations in in-context learning, primarily due to a mismatch between\npretraining and testing, as well as a restricted context length. To address\nthese issues, we propose RAVEN, a model that combines retrieval-augmented\nmasked language modeling and prefix language modeling. We further introduce\nFusion-in-Context Learning to enhance the few-shot performance by enabling the\nmodel to leverage more in-context examples without requiring additional\ntraining or model modifications. Through extensive experiments, we demonstrate\nthat RAVEN significantly outperforms ATLAS and achieves results comparable to\nthe most advanced language models in certain scenarios, despite having\nsubstantially fewer parameters. Our work underscores the potential of\nretrieval-augmented encoder-decoder language models for in-context learning and\nencourages further research in this direction.\n","authors":["Jie Huang","Wei Ping","Peng Xu","Mohammad Shoeybi","Kevin Chen-Chuan Chang","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2308.07922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07899v1","updated":"2023-08-15T17:40:10Z","published":"2023-08-15T17:40:10Z","title":"The Regular Expression Inference Challenge","summary":" We propose \\emph{regular expression inference (REI)} as a challenge for\ncode/language modelling, and the wider machine learning community. REI is a\nsupervised machine learning (ML) and program synthesis task, and poses the\nproblem of finding minimal regular expressions from examples: Given two finite\nsets of strings $P$ and $N$ and a cost function $\\text{cost}(\\cdot)$, the task\nis to generate an expression $r$ that accepts all strings in $P$ and rejects\nall strings in $N$, while no other such expression $r'$ exists with\n$\\text{cost}(r')<\\text{cost}(r)$.\n REI has advantages as a challenge problem: (i) regular expressions are\nwell-known, widely used, and a natural idealisation of code; (ii) REI's\nasymptotic worst-case complexity is well understood; (iii) REI has a small\nnumber of easy to understand parameters (e.g.~$P$ or $N$ cardinality, string\nlengths of examples, or the cost function); this lets us easily finetune\nREI-hardness; (iv) REI is an unsolved problem for deep learning based ML.\n Recently, an REI solver was implemented on GPUs, using program synthesis\ntechniques. This enabled, for the first time, fast generation of minimal\nexpressions for complex REI instances. Building on this advance, we generate\nand publish the first large-scale datasets for REI, and devise and evaluate\nseveral initial heuristic and machine learning baselines.\n We invite the community to participate and explore ML methods that learn to\nsolve REI problems. 
We believe that progress in REI directly translates to\ncode/language modelling.\n","authors":["Mojtaba Valizadeh","Philip John Gorinski","Ignacio Iacobacci","Martin Berger"],"pdf_url":"https://arxiv.org/pdf/2308.07899v1.pdf","comment":"7 pages, 3 pages appendix, 6 tables"},{"id":"http://arxiv.org/abs/2306.13050v2","updated":"2023-08-15T17:39:46Z","published":"2023-06-22T17:17:45Z","title":"Data augmentation and refinement for recommender system: A\n semi-supervised approach using maximum margin matrix factorization","summary":" Collaborative filtering (CF) has become a popular method for developing\nrecommender systems (RSs) where ratings of a user for new items are predicted\nbased on her past preferences and available preference information of other\nusers. Despite the popularity of CF-based methods, their performance is often\ngreatly limited by the sparsity of observed entries. In this study, we explore\nthe data augmentation and refinement aspects of Maximum Margin Matrix\nFactorization (MMMF), a widely accepted CF technique for rating predictions,\nwhich has not been investigated before. We exploit the inherent characteristics\nof CF algorithms to assess the confidence level of individual ratings and\npropose a semi-supervised approach for rating augmentation based on\nself-training. We hypothesize that any CF algorithm's predictions with low\nconfidence are due to some deficiency in the training data and hence, the\nperformance of the algorithm can be improved by adopting a systematic data\naugmentation strategy. We iteratively use some of the ratings predicted with\nhigh confidence to augment the training data and remove low-confidence entries\nthrough a refinement process. By repeating this process, the system learns to\nimprove prediction accuracy. Our method is experimentally evaluated on several\nstate-of-the-art CF algorithms and leads to informative rating augmentation,\nimproving the performance of the baseline approaches.\n","authors":["Shamal Shaikh","Venkateswara Rao Kagita","Vikas Kumar","Arun K Pujari"],"pdf_url":"https://arxiv.org/pdf/2306.13050v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.01246v2","updated":"2023-08-15T17:39:05Z","published":"2023-08-02T16:00:39Z","title":"Tirtha -- An Automated Platform to Crowdsource Images and Create 3D\n Models of Heritage Sites","summary":" Digital preservation of Cultural Heritage (CH) sites is crucial to protect\nthem against damage from natural disasters or human activities. Creating 3D\nmodels of CH sites has become a popular method of digital preservation thanks\nto advancements in computer vision and photogrammetry. However, the process is\ntime-consuming, expensive, and typically requires specialized equipment and\nexpertise, posing challenges in resource-limited developing countries.\nAdditionally, the lack of an open repository for 3D models hinders research and\npublic engagement with their heritage. To address these issues, we propose\nTirtha, a web platform for crowdsourcing images of CH sites and creating their\n3D models. Tirtha utilizes state-of-the-art Structure from Motion (SfM) and\nMulti-View Stereo (MVS) techniques. It is modular, extensible and\ncost-effective, allowing for the incorporation of new techniques as\nphotogrammetry advances. Tirtha is accessible through a web interface at\nhttps://tirtha.niser.ac.in and can be deployed on-premise or in a cloud\nenvironment. 
In our case studies, we demonstrate the pipeline's effectiveness\nby creating 3D models of temples in Odisha, India, using crowdsourced images.\nThese models are available for viewing, interaction, and download on the Tirtha\nwebsite. Our work aims to provide a dataset of crowdsourced images and 3D\nreconstructions for research in computer vision, heritage conservation, and\nrelated domains. Overall, Tirtha is a step towards democratizing digital\npreservation, primarily in resource-limited developing countries.\n","authors":["Jyotirmaya Shivottam","Subhankar Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.01246v2.pdf","comment":"Accepted at The 28th International ACM Conference on 3D Web\n Technology (Web3D 2023)"},{"id":"http://arxiv.org/abs/2303.01262v2","updated":"2023-08-15T17:38:08Z","published":"2023-03-01T18:49:10Z","title":"Subset-Based Instance Optimality in Private Estimation","summary":" We propose a new definition of instance optimality for differentially private\nestimation algorithms. Our definition requires an optimal algorithm to compete,\nsimultaneously for every dataset $D$, with the best private benchmark algorithm\nthat (a) knows $D$ in advance and (b) is evaluated by its worst-case\nperformance on large subsets of $D$. That is, the benchmark algorithm need not\nperform well when potentially extreme points are added to $D$; it only has to\nhandle the removal of a small number of real data points that already exist.\nThis makes our benchmark significantly stronger than those proposed in prior\nwork. We nevertheless show, for real-valued datasets, how to construct private\nalgorithms that achieve our notion of instance optimality when estimating a\nbroad class of dataset properties, including means, quantiles, and\n$\\ell_p$-norm minimizers. For means in particular, we provide a detailed\nanalysis and show that our algorithm simultaneously matches or exceeds the\nasymptotic performance of existing algorithms under a range of distributional\nassumptions.\n","authors":["Travis Dick","Alex Kulesza","Ziteng Sun","Ananda Theertha Suresh"],"pdf_url":"https://arxiv.org/pdf/2303.01262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07896v1","updated":"2023-08-15T17:37:44Z","published":"2023-08-15T17:37:44Z","title":"SciRE-Solver: Efficient Sampling of Diffusion Probabilistic Models by\n Score-integrand Solver with Recursive Derivative Estimation","summary":" Diffusion probabilistic models (DPMs) are a powerful class of generative\nmodels known for their ability to generate high-fidelity image samples. A major\nchallenge in the implementation of DPMs is the slow sampling process. In this\nwork, we bring a high-efficiency sampler for DPMs. Specifically, we propose a\nscore-based exact solution paradigm for the diffusion ODEs corresponding to the\nsampling process of DPMs, which introduces a new perspective on developing\nnumerical algorithms for solving diffusion ODEs. To achieve an efficient\nsampler, we propose a recursive derivative estimation (RDE) method to reduce\nthe estimation error. With our proposed solution paradigm and RDE method, we\npropose the score-integrand solver with the convergence order guarantee as\nefficient solver (SciRE-Solver) for solving diffusion ODEs. The SciRE-Solver\nattains state-of-the-art (SOTA) sampling performance with a limited number of\nscore function evaluations (NFE) on both discrete-time and continuous-time DPMs\nin comparison to existing training-free sampling algorithms. 
Such as, we\nachieve $3.48$ FID with $12$ NFE and $2.42$ FID with $20$ NFE for\ncontinuous-time DPMs on CIFAR10, respectively. Different from other samplers,\nSciRE-Solver has the promising potential to surpass the FIDs achieved in the\noriginal papers of some pre-trained models with just fewer NFEs. For example,\nwe reach SOTA value of $2.40$ FID with $100$ NFE for continuous-time DPM and of\n$3.15$ FID with $84$ NFE for discrete-time DPM on CIFAR-10, as well as of\n$2.17$ ($2.02$) FID with $18$ ($50$) NFE for discrete-time DPM on CelebA\n64$\\times$64.\n","authors":["Shigui Li","Wei Chen","Delu Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.07896v1.pdf","comment":"42 pages,23 figures. arXiv admin note: text overlap with\n arXiv:2206.00927 by other authors"},{"id":"http://arxiv.org/abs/2211.15377v4","updated":"2023-08-15T17:33:53Z","published":"2022-11-23T09:57:17Z","title":"Whose Emotion Matters? Speaking Activity Localisation without Prior\n Knowledge","summary":" The task of emotion recognition in conversations (ERC) benefits from the\navailability of multiple modalities, as provided, for example, in the\nvideo-based Multimodal EmotionLines Dataset (MELD). However, only a few\nresearch approaches use both acoustic and visual information from the MELD\nvideos. There are two reasons for this: First, label-to-video alignments in\nMELD are noisy, making those videos an unreliable source of emotional speech\ndata. Second, conversations can involve several people in the same scene, which\nrequires the localisation of the utterance source. In this paper, we introduce\nMELD with Fixed Audiovisual Information via Realignment (MELD-FAIR) by using\nrecent active speaker detection and automatic speech recognition models, we are\nable to realign the videos of MELD and capture the facial expressions from\nspeakers in 96.92% of the utterances provided in MELD. Experiments with a\nself-supervised voice recognition model indicate that the realigned MELD-FAIR\nvideos more closely match the transcribed utterances given in the MELD dataset.\nFinally, we devise a model for emotion recognition in conversations trained on\nthe realigned MELD-FAIR videos, which outperforms state-of-the-art models for\nERC based on vision alone. This indicates that localising the source of\nspeaking activities is indeed effective for extracting facial expressions from\nthe uttering speakers and that faces provide more informative visual cues than\nthe visual features state-of-the-art models have been using so far. The\nMELD-FAIR realignment data, and the code of the realignment procedure and of\nthe emotional recognition, are available at\nhttps://github.com/knowledgetechnologyuhh/MELD-FAIR.\n","authors":["Hugo Carneiro","Cornelius Weber","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2211.15377v4.pdf","comment":"17 pages, 8 figures, 7 tables, Published in Neurocomputing"},{"id":"http://arxiv.org/abs/2308.07887v1","updated":"2023-08-15T17:27:16Z","published":"2023-08-15T17:27:16Z","title":"On regularized Radon-Nikodym differentiation","summary":" We discuss the problem of estimating Radon-Nikodym derivatives. This problem\nappears in various applications, such as covariate shift adaptation,\nlikelihood-ratio testing, mutual information estimation, and conditional\nprobability estimation. To address the above problem, we employ the general\nregularization scheme in reproducing kernel Hilbert spaces. 
The convergence\nrate of the corresponding regularized algorithm is established by taking into\naccount both the smoothness of the derivative and the capacity of the space in\nwhich it is estimated. This is done in terms of general source conditions and\nthe regularized Christoffel functions. We also find that the reconstruction of\nRadon-Nikodym derivatives at any particular point can be done with high order\nof accuracy. Our theoretical results are illustrated by numerical simulations.\n","authors":["Duc Hoan Nguyen","Werner Zellinger","Sergei V. Pereverzyev"],"pdf_url":"https://arxiv.org/pdf/2308.07887v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2307.11503"},{"id":"http://arxiv.org/abs/2308.07886v1","updated":"2023-08-15T17:23:18Z","published":"2023-08-15T17:23:18Z","title":"Back to Basics: A Sanity Check on Modern Time Series Classification\n Algorithms","summary":" The state-of-the-art in time series classification has come a long way, from\nthe 1NN-DTW algorithm to the ROCKET family of classifiers. However, in the\ncurrent fast-paced development of new classifiers, taking a step back and\nperforming simple baseline checks is essential. These checks are often\noverlooked, as researchers are focused on establishing new state-of-the-art\nresults, developing scalable algorithms, and making models explainable.\nNevertheless, there are many datasets that look like time series at first\nglance, but classic algorithms such as tabular methods with no time ordering\nmay perform better on such problems. For example, for spectroscopy datasets,\ntabular methods tend to significantly outperform recent time series methods. In\nthis study, we compare the performance of tabular models using classic machine\nlearning approaches (e.g., Ridge, LDA, RandomForest) with the ROCKET family of\nclassifiers (e.g., Rocket, MiniRocket, MultiRocket). Tabular models are simple\nand very efficient, while the ROCKET family of classifiers are more complex and\nhave state-of-the-art accuracy and efficiency among recent time series\nclassifiers. We find that tabular models outperform the ROCKET family of\nclassifiers on approximately 19% of univariate and 28% of multivariate datasets\nin the UCR/UEA benchmark and achieve accuracy within 10 percentage points on\nabout 50% of datasets. Our results suggest that it is important to consider\nsimple tabular models as baselines when developing time series classifiers.\nThese models are very fast, can be as effective as more complex methods and may\nbe easier to understand and deploy.\n","authors":["Bhaskar Dhariyal","Thach Le Nguyen","Georgiana Ifrim"],"pdf_url":"https://arxiv.org/pdf/2308.07886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07885v1","updated":"2023-08-15T17:22:42Z","published":"2023-08-15T17:22:42Z","title":"The Challenge of Fetal Cardiac MRI Reconstruction Using Deep Learning","summary":" Dynamic free-breathing fetal cardiac MRI is one of the most challenging\nmodalities, which requires high temporal and spatial resolution to depict rapid\nchanges in a small fetal heart. The ability of deep learning methods to recover\nundersampled data could help to optimise the kt-SENSE acquisition strategy and\nimprove non-gated kt-SENSE reconstruction quality. In this work, we explore\nsupervised deep learning networks for reconstruction of kt-SENSE style acquired\ndata using an extensive in vivo dataset. 
Having access to fully-sampled\nlow-resolution multi-coil fetal cardiac MRI, we study the performance of the\nnetworks to recover fully-sampled data from undersampled data. We consider\nmodel architectures together with training strategies taking into account their\napplication in the real clinical setup used to collect the dataset to enable\nnetworks to recover prospectively undersampled data. We explore a set of\nmodifications to form a baseline performance evaluation for dynamic fetal\ncardiac MRI on real data. We systematically evaluate the models on\ncoil-combined data to reveal the effect of the suggested changes to the\narchitecture in the context of fetal heart properties. We show that the\nbest-performers recover a detailed depiction of the maternal anatomy on a large\nscale, but the dynamic properties of the fetal heart are under-represented.\nTraining directly on multi-coil data improves the performance of the models,\nallows their prospective application to undersampled data and makes them\noutperform CTFNet introduced for adult cardiac cine MRI. However, these models\ndeliver similar qualitative performances recovering the maternal body very well\nbut underestimating the dynamic properties of fetal heart. This dynamic feature\nof fast change of fetal heart that is highly localised suggests both more\ntargeted training and evaluation methods might be needed for fetal heart\napplication.\n","authors":["Denis Prokopenko","Kerstin Hammernik","Thomas Roberts","David F A Lloyd","Daniel Rueckert","Joseph V Hajnal"],"pdf_url":"https://arxiv.org/pdf/2308.07885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07883v1","updated":"2023-08-15T17:13:16Z","published":"2023-08-15T17:13:16Z","title":"Towards Temporal Edge Regression: A Case Study on Agriculture Trade\n Between Nations","summary":" Recently, Graph Neural Networks (GNNs) have shown promising performance in\ntasks on dynamic graphs such as node classification, link prediction and graph\nregression. However, few work has studied the temporal edge regression task\nwhich has important real-world applications. In this paper, we explore the\napplication of GNNs to edge regression tasks in both static and dynamic\nsettings, focusing on predicting food and agriculture trade values between\nnations. We introduce three simple yet strong baselines and comprehensively\nevaluate one static and three dynamic GNN models using the UN Trade dataset.\nOur experimental results reveal that the baselines exhibit remarkably strong\nperformance across various settings, highlighting the inadequacy of existing\nGNNs. We also find that TGN outperforms other GNN models, suggesting TGN is a\nmore appropriate choice for edge regression tasks. Moreover, we note that the\nproportion of negative edges in the training samples significantly affects the\ntest performance. The companion source code can be found at:\nhttps://github.com/scylj1/GNN_Edge_Regression.\n","authors":["Lekang Jiang","Caiqi Zhang","Farimah Poursafaei","Shenyang Huang"],"pdf_url":"https://arxiv.org/pdf/2308.07883v1.pdf","comment":"12 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2302.03224v3","updated":"2023-08-15T16:44:02Z","published":"2023-02-07T03:14:00Z","title":"Undersampling and Cumulative Class Re-decision Methods to Improve\n Detection of Agitation in People with Dementia","summary":" Agitation is one of the most prevalent symptoms in people with dementia (PwD)\nthat can place themselves and the caregiver's safety at risk. 
Developing\nobjective agitation detection approaches is important to support health and\nsafety of PwD living in a residential setting. In a previous study, we\ncollected multimodal wearable sensor data from 17 participants for 600 days and\ndeveloped machine learning models for detecting agitation in one-minute\nwindows. However, there are significant limitations in the dataset, such as\nthe imbalance problem and potentially imprecise labels, as the occurrence of agitation\nis much rarer in comparison to the normal behaviours. In this paper, we first\nimplemented different undersampling methods to eliminate the imbalance problem,\nand came to the conclusion that only 20% of normal behaviour data were adequate\nto train a competitive agitation detection model. Then, we designed a weighted\nundersampling method to evaluate the manual labeling mechanism given the\nambiguous time interval assumption. After that, the postprocessing method of\ncumulative class re-decision (CCR) was proposed based on the historical\nsequential information and continuity characteristic of agitation, improving\nthe decision-making performance for the potential application of an agitation\ndetection system. The results showed that a combination of undersampling and\nCCR improved F1-score and other metrics to varying degrees with less training\ntime and data.\n","authors":["Zhidong Meng","Andrea Iaboni","Bing Ye","Kristine Newman","Alex Mihailidis","Zhihong Deng","Shehroz S. Khan"],"pdf_url":"https://arxiv.org/pdf/2302.03224v3.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.07876v1","updated":"2023-08-15T16:41:53Z","published":"2023-08-15T16:41:53Z","title":"Synthesizing Political Zero-Shot Relation Classification via Codebook\n Knowledge, NLI, and ChatGPT","summary":" Recent supervised models for event coding vastly outperform pattern-matching\nmethods. However, their reliance solely on new annotations disregards the vast\nknowledge within expert databases, hindering their applicability to\nfine-grained classification. To address these limitations, we explore zero-shot\napproaches for political event ontology relation classification, by leveraging\nknowledge from established annotation codebooks. Our study encompasses both\nChatGPT and a novel natural language inference (NLI) based approach named ZSP.\nZSP adopts a tree-query framework that deconstructs the task into context,\nmodality, and class disambiguation levels. This framework improves\ninterpretability, efficiency, and adaptability to schema changes. By conducting\nextensive experiments on our newly curated datasets, we pinpoint the\ninstability issues within ChatGPT and highlight the superior performance of\nZSP. ZSP achieves an impressive 40% improvement in F1 score for fine-grained\nRootcode classification. ZSP demonstrates competitive performance compared to\nsupervised BERT models, positioning it as a valuable tool for event record\nvalidation and ontology development. Our work underscores the potential of\nleveraging transfer learning and existing expertise to enhance the efficiency\nand scalability of research in the field.\n","authors":["Yibo Hu","Erick Skorupa Parolin","Latifur Khan","Patrick T. Brandt","Javier Osorio","Vito J. 
D'Orazio"],"pdf_url":"https://arxiv.org/pdf/2308.07876v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2307.05695v3","updated":"2023-08-15T16:41:13Z","published":"2023-07-11T18:02:09Z","title":"Stack More Layers Differently: High-Rank Training Through Low-Rank\n Updates","summary":" Despite the dominance and effectiveness of scaling, resulting in large\nnetworks with hundreds of billions of parameters, the necessity to train\noverparametrized models remains poorly understood, and alternative approaches\ndo not necessarily make it cheaper to train high-performance models. In this\npaper, we explore low-rank training techniques as an alternative approach to\ntraining large neural networks. We introduce a novel method called ReLoRA,\nwhich utilizes low-rank updates to train high-rank networks. We apply ReLoRA to\npre-training transformer language models with up to 350M parameters and\ndemonstrate comparable performance to regular neural network training.\nFurthermore, we observe that the efficiency of ReLoRA increases with model\nsize, making it a promising approach for training multi-billion-parameter\nnetworks efficiently. Our findings shed light on the potential of low-rank\ntraining techniques and their implications for scaling laws.\n","authors":["Vladislav Lialin","Namrata Shivagunde","Sherin Muckatira","Anna Rumshisky"],"pdf_url":"https://arxiv.org/pdf/2307.05695v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07871v1","updated":"2023-08-15T16:39:10Z","published":"2023-08-15T16:39:10Z","title":"Emotion Embeddings $\\unicode{x2014}$ Learning Stable and Homogeneous\n Abstractions from Heterogeneous Affective Datasets","summary":" Human emotion is expressed in many communication modalities and media formats\nand so their computational study is equally diversified into natural language\nprocessing, audio signal analysis, computer vision, etc. Similarly, the large\nvariety of representation formats used in previous research to describe\nemotions (polarity scales, basic emotion categories, dimensional approaches,\nappraisal theory, etc.) have led to an ever proliferating diversity of\ndatasets, predictive models, and software tools for emotion analysis. Because\nof these two distinct types of heterogeneity, at the expressional and\nrepresentational level, there is a dire need to unify previous work on\nincreasingly diverging data and label types. This article presents such a\nunifying computational model. We propose a training procedure that learns a\nshared latent representation for emotions, so-called emotion embeddings,\nindependent of different natural languages, communication modalities, media or\nrepresentation label formats, and even disparate model architectures.\nExperiments on a wide range of heterogeneous affective datasets indicate that\nthis approach yields the desired interoperability for the sake of reusability,\ninterpretability and flexibility, without penalizing prediction quality. Code\nand data are archived under https://doi.org/10.5281/zenodo.7405327 .\n","authors":["Sven Buechel","Udo Hahn"],"pdf_url":"https://arxiv.org/pdf/2308.07871v1.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.07870v1","updated":"2023-08-15T16:37:16Z","published":"2023-08-15T16:37:16Z","title":"Brain-Inspired Computational Intelligence via Predictive Coding","summary":" Artificial intelligence (AI) is rapidly becoming one of the key technologies\nof this century. 
The majority of results in AI thus far have been achieved\nusing deep neural networks trained with the error backpropagation learning\nalgorithm. However, the ubiquitous adoption of this approach has highlighted\nsome important limitations such as substantial computational cost, difficulty\nin quantifying uncertainty, lack of robustness, unreliability, and biological\nimplausibility. It is possible that addressing these limitations may require\nschemes that are inspired and guided by neuroscience theories. One such theory,\ncalled predictive coding (PC), has shown promising performance in machine\nintelligence tasks, exhibiting exciting properties that make it potentially\nvaluable for the machine learning community: PC can model information\nprocessing in different brain areas, can be used in cognitive control and\nrobotics, and has a solid mathematical grounding in variational inference,\noffering a powerful inversion scheme for a specific class of continuous-state\ngenerative models. With the hope of foregrounding research in this direction,\nwe survey the literature that has contributed to this perspective, highlighting\nthe many ways that PC might play a role in the future of machine learning and\ncomputational intelligence at large.\n","authors":["Tommaso Salvatori","Ankur Mali","Christopher L. Buckley","Thomas Lukasiewicz","Rajesh P. N. Rao","Karl Friston","Alexander Ororbia"],"pdf_url":"https://arxiv.org/pdf/2308.07870v1.pdf","comment":"37 Pages, 9 Figures"},{"id":"http://arxiv.org/abs/2308.07867v1","updated":"2023-08-15T16:34:37Z","published":"2023-08-15T16:34:37Z","title":"Graph-Structured Kernel Design for Power Flow Learning using Gaussian\n Processes","summary":" This paper presents a physics-inspired graph-structured kernel designed for\npower flow learning using Gaussian Process (GP). The kernel, named the\nvertex-degree kernel (VDK), relies on latent decomposition of voltage-injection\nrelationship based on the network graph or topology. Notably, VDK design avoids\nthe need to solve optimization problems for kernel search. To enhance\nefficiency, we also explore a graph-reduction approach to obtain a VDK\nrepresentation with fewer terms. Additionally, we propose a novel\nnetwork-swipe active learning scheme, which intelligently selects sequential\ntraining inputs to accelerate the learning of VDK. Leveraging the additive\nstructure of VDK, the active learning algorithm performs a block-descent type\nprocedure on GP's predictive variance, serving as a proxy for information gain.\nSimulations demonstrate that the proposed VDK-GP achieves more than two fold\nsample complexity reduction, compared to full GP on medium scale 500-Bus and\nlarge scale 1354-Bus power systems. The network-swipe algorithm outperforms\nmean performance of 500 random trials on test predictions by two fold for\nmedium-sized 500-Bus systems and best performance of 25 random trials for\nlarge-scale 1354-Bus systems by 10%. Moreover, we demonstrate the proposed\nmethod's performance for uncertainty quantification applications with\ndistributionally shifted testing data sets.\n","authors":["Parikshit Pareek","Deepjyoti Deka","Sidhant Misra"],"pdf_url":"https://arxiv.org/pdf/2308.07867v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.07857v1","updated":"2023-08-15T16:16:02Z","published":"2023-08-15T16:16:02Z","title":"Impression-Aware Recommender Systems","summary":" Novel data sources bring new opportunities to improve the quality of\nrecommender systems. 
Impressions are a novel data source containing past\nrecommendations (shown items) and traditional interactions. Researchers may use\nimpressions to refine user preferences and overcome the current limitations in\nrecommender systems research. The relevance and interest of impressions have\nincreased over the years; hence, the need for a review of relevant work on this\ntype of recommenders. We present a systematic literature review on recommender\nsystems using impressions, focusing on three fundamental angles in research:\nrecommenders, datasets, and evaluation methodologies. We provide three\ncategorizations of papers describing recommenders using impressions, present\neach reviewed paper in detail, describe datasets with impressions, and analyze\nthe existing evaluation methodologies. Lastly, we present open questions and\nfuture directions of interest, highlighting aspects missing in the literature\nthat can be addressed in future works.\n","authors":["Fernando B. Pérez Maurera","Maurizio Ferrari Dacrema","Pablo Castells","Paolo Cremonesi"],"pdf_url":"https://arxiv.org/pdf/2308.07857v1.pdf","comment":"34 pages, 103 references, 6 tables, 2 figures, ACM UNDER REVIEW"},{"id":"http://arxiv.org/abs/2306.06569v2","updated":"2023-08-15T16:14:42Z","published":"2023-06-11T03:02:10Z","title":"Policy Regularization with Dataset Constraint for Offline Reinforcement\n Learning","summary":" We consider the problem of learning the best possible policy from a fixed\ndataset, known as offline Reinforcement Learning (RL). A common taxonomy of\nexisting offline RL works is policy regularization, which typically constrains\nthe learned policy by distribution or support of the behavior policy. However,\ndistribution and support constraints are overly conservative since they both\nforce the policy to choose similar actions as the behavior policy when\nconsidering particular states. It will limit the learned policy's performance,\nespecially when the behavior policy is sub-optimal. In this paper, we find that\nregularizing the policy towards the nearest state-action pair can be more\neffective and thus propose Policy Regularization with Dataset Constraint\n(PRDC). When updating the policy in a given state, PRDC searches the entire\ndataset for the nearest state-action sample and then restricts the policy with\nthe action of this sample. Unlike previous works, PRDC can guide the policy\nwith proper behaviors from the dataset, allowing it to choose actions that do\nnot appear in the dataset along with the given state. It is a softer constraint\nbut still keeps enough conservatism from out-of-distribution actions. Empirical\nevidence and theoretical analysis show that PRDC can alleviate offline RL's\nfundamentally challenging value overestimation issue with a bounded performance\ngap. Moreover, on a set of locomotion and navigation tasks, PRDC achieves\nstate-of-the-art performance compared with existing methods. 
Code is available\nat https://github.com/LAMDA-RL/PRDC\n","authors":["Yuhang Ran","Yi-Chen Li","Fuxiang Zhang","Zongzhang Zhang","Yang Yu"],"pdf_url":"https://arxiv.org/pdf/2306.06569v2.pdf","comment":"Accepted to ICML 2023"},{"id":"http://arxiv.org/abs/2305.17401v3","updated":"2023-08-15T16:02:43Z","published":"2023-05-27T07:59:49Z","title":"A Framework For Refining Text Classification and Object Recognition from\n Academic Articles","summary":" With the widespread use of the internet, it has become increasingly crucial\nto extract specific information from vast amounts of academic articles\nefficiently. Data mining techniques are generally employed to solve this issue.\nHowever, data mining for academic articles is challenging since it requires\nautomatically extracting specific patterns in complex and unstructured layout\ndocuments. Current data mining methods for academic articles employ\nrule-based(RB) or machine learning(ML) approaches. However, using rule-based\nmethods incurs a high coding cost for complex typesetting articles. On the\nother hand, simply using machine learning methods requires annotation work for\ncomplex content types within the paper, which can be costly. Furthermore, only\nusing machine learning can lead to cases where patterns easily recognized by\nrule-based methods are mistakenly extracted. To overcome these issues, from the\nperspective of analyzing the standard layout and typesetting used in the\nspecified publication, we emphasize implementing specific methods for specific\ncharacteristics in academic articles. We have developed a novel Text Block\nRefinement Framework (TBRF), a machine learning and rule-based scheme hybrid.\nWe used the well-known ACL proceeding articles as experimental data for the\nvalidation experiment. The experiment shows that our approach achieved over 95%\nclassification accuracy and 90% detection accuracy for tables and figures.\n","authors":["Jinghong Li","Koichi Ota","Wen Gu","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2305.17401v3.pdf","comment":"This paper has been accepted at 'The International Symposium on\n Innovations in Intelligent Systems and Applications 2023 (INISTA 2023)'"},{"id":"http://arxiv.org/abs/2301.04900v2","updated":"2023-08-15T15:59:01Z","published":"2023-01-12T09:44:59Z","title":"A Recipe for Well-behaved Graph Neural Approximations of Complex\n Dynamics","summary":" Data-driven approximations of ordinary differential equations offer a\npromising alternative to classical methods in discovering a dynamical system\nmodel, particularly in complex systems lacking explicit first principles. This\npaper focuses on a complex system whose dynamics is described with a system of\nordinary differential equations, coupled via a network adjacency matrix.\nNumerous real-world systems, including financial, social, and neural systems,\nbelong to this class of dynamical models. We propose essential elements for\napproximating such dynamical systems using neural networks, including necessary\nbiases and an appropriate neural architecture. Emphasizing the differences from\nstatic supervised learning, we advocate for evaluating generalization beyond\nclassical assumptions of statistical learning theory. To estimate confidence in\nprediction during inference time, we introduce a dedicated null model. By\nstudying various complex network dynamics, we demonstrate the neural network's\nability to approximate various dynamics, generalize across complex network\nstructures, sizes, and statistical properties of inputs. 
Our comprehensive\nframework enables deep learning approximations of high-dimensional,\nnon-linearly coupled complex dynamical systems.\n","authors":["Vaiva Vasiliauskaite","Nino Antulov-Fantulin"],"pdf_url":"https://arxiv.org/pdf/2301.04900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.02982v2","updated":"2023-08-15T15:52:47Z","published":"2023-01-08T05:24:12Z","title":"Why Batch Normalization Damage Federated Learning on Non-IID Data?","summary":" As a promising distributed learning paradigm, federated learning (FL)\ninvolves training deep neural network (DNN) models at the network edge while\nprotecting the privacy of the edge clients. To train a large-scale DNN model,\nbatch normalization (BN) has been regarded as a simple and effective means to\naccelerate the training and improve the generalization capability. However,\nrecent findings indicate that BN can significantly impair the performance of FL\nin the presence of non-i.i.d. data. While several FL algorithms have been\nproposed to address this issue, their performance still falls significantly\nwhen compared to the centralized scheme. Furthermore, none of them have\nprovided a theoretical explanation of how the BN damages the FL convergence. In\nthis paper, we present the first convergence analysis to show that under the\nnon-i.i.d. data, the mismatch between the local and global statistical\nparameters in BN causes the gradient deviation between the local and global\nmodels, which, as a result, slows down and biases the FL convergence. In view\nof this, we develop a new FL algorithm that is tailored to BN, called FedTAN,\nwhich is capable of achieving robust FL performance under a variety of data\ndistributions via iterative layer-wise parameter aggregation. Comprehensive\nexperimental results demonstrate the superiority of the proposed FedTAN over\nexisting baselines for training BN-based DNN models.\n","authors":["Yanmeng Wang","Qingjiang Shi","Tsung-Hui Chang"],"pdf_url":"https://arxiv.org/pdf/2301.02982v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03202v3","updated":"2023-08-15T15:47:34Z","published":"2023-08-06T20:19:06Z","title":"Source-free Domain Adaptive Human Pose Estimation","summary":" Human Pose Estimation (HPE) is widely used in various fields, including\nmotion analysis, healthcare, and virtual reality. However, the great expenses\nof labeled real-world datasets present a significant challenge for HPE. To\novercome this, one approach is to train HPE models on synthetic datasets and\nthen perform domain adaptation (DA) on real-world data. Unfortunately, existing\nDA methods for HPE neglect data privacy and security by using both source and\ntarget data in the adaptation process. To this end, we propose a new task,\nnamed source-free domain adaptive HPE, which aims to address the challenges of\ncross-domain learning of HPE without access to source data during the\nadaptation process. We further propose a novel framework that consists of three\nmodels: source model, intermediate model, and target model, which explores the\ntask from both source-protect and target-relevant perspectives. The\nsource-protect module preserves source information more effectively while\nresisting noise, and the target-relevant module reduces the sparsity of spatial\nrepresentations by building a novel spatial probability space, and\npose-specific contrastive learning and information maximization are proposed on\nthe basis of this space. 
Comprehensive experiments on several domain adaptive\nHPE benchmarks show that the proposed method outperforms existing approaches by\na considerable margin. The codes are available at\nhttps://github.com/davidpengucf/SFDAHPE.\n","authors":["Qucheng Peng","Ce Zheng","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2308.03202v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07843v1","updated":"2023-08-15T15:43:12Z","published":"2023-08-15T15:43:12Z","title":"Dyadic Reinforcement Learning","summary":" Mobile health aims to enhance health outcomes by delivering interventions to\nindividuals as they go about their daily life. The involvement of care partners\nand social support networks often proves crucial in helping individuals\nmanaging burdensome medical conditions. This presents opportunities in mobile\nhealth to design interventions that target the dyadic relationship -- the\nrelationship between a target person and their care partner -- with the aim of\nenhancing social support. In this paper, we develop dyadic RL, an online\nreinforcement learning algorithm designed to personalize intervention delivery\nbased on contextual factors and past responses of a target person and their\ncare partner. Here, multiple sets of interventions impact the dyad across\nmultiple time intervals. The developed dyadic RL is Bayesian and hierarchical.\nWe formally introduce the problem setup, develop dyadic RL and establish a\nregret bound. We demonstrate dyadic RL's empirical performance through\nsimulation studies on both toy scenarios and on a realistic test bed\nconstructed from data collected in a mobile health study.\n","authors":["Shuangning Li","Lluis Salvat Niell","Sung Won Choi","Inbal Nahum-Shani","Guy Shani","Susan Murphy"],"pdf_url":"https://arxiv.org/pdf/2308.07843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07834v1","updated":"2023-08-15T15:23:36Z","published":"2023-08-15T15:23:36Z","title":"Simple and Efficient Partial Graph Adversarial Attack: A New Perspective","summary":" As the study of graph neural networks becomes more intensive and\ncomprehensive, their robustness and security have received great research\ninterest. The existing global attack methods treat all nodes in the graph as\ntheir attack targets. Although existing methods have achieved excellent\nresults, there is still considerable space for improvement. The key problem is\nthat the current approaches rigidly follow the definition of global attacks.\nThey ignore an important issue, i.e., different nodes have different robustness\nand are not equally resilient to attacks. From a global attacker's view, we\nshould arrange the attack budget wisely, rather than wasting them on highly\nrobust nodes. To this end, we propose a totally new method named partial graph\nattack (PGA), which selects the vulnerable nodes as attack targets. First, to\nselect the vulnerable items, we propose a hierarchical target selection policy,\nwhich allows attackers to only focus on easy-to-attack nodes. Then, we propose\na cost-effective anchor-picking policy to pick the most promising anchors for\nadding or removing edges, and a more aggressive iterative greedy-based attack\nmethod to perform more efficient attacks. 
Extensive experimental results\ndemonstrate that PGA can achieve significant improvements in both attack effect\nand attack efficiency compared to other existing graph global attack methods.\n","authors":["Guanghui Zhu","Mengyu Chen","Chunfeng Yuan","Yihua Huang"],"pdf_url":"https://arxiv.org/pdf/2308.07834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07832v1","updated":"2023-08-15T15:21:36Z","published":"2023-08-15T15:21:36Z","title":"REFORMS: Reporting Standards for Machine Learning Based Science","summary":" Machine learning (ML) methods are proliferating in scientific research.\nHowever, the adoption of these methods has been accompanied by failures of\nvalidity, reproducibility, and generalizability. These failures can hinder\nscientific progress, lead to false consensus around invalid claims, and\nundermine the credibility of ML-based science. ML methods are often applied and\nfail in similar ways across disciplines. Motivated by this observation, our\ngoal is to provide clear reporting standards for ML-based science. Drawing from\nan extensive review of past literature, we present the REFORMS checklist\n($\\textbf{Re}$porting Standards $\\textbf{For}$ $\\textbf{M}$achine Learning\nBased $\\textbf{S}$cience). It consists of 32 questions and a paired set of\nguidelines. REFORMS was developed based on a consensus of 19 researchers across\ncomputer science, data science, mathematics, social sciences, and biomedical\nsciences. REFORMS can serve as a resource for researchers when designing and\nimplementing a study, for referees when reviewing papers, and for journals when\nenforcing standards for transparency and reproducibility.\n","authors":["Sayash Kapoor","Emily Cantrell","Kenny Peng","Thanh Hien Pham","Christopher A. Bail","Odd Erik Gundersen","Jake M. Hofman","Jessica Hullman","Michael A. Lones","Momin M. Malik","Priyanka Nanayakkara","Russell A. Poldrack","Inioluwa Deborah Raji","Michael Roberts","Matthew J. Salganik","Marta Serra-Garcia","Brandon M. Stewart","Gilles Vandewiele","Arvind Narayanan"],"pdf_url":"https://arxiv.org/pdf/2308.07832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04003v3","updated":"2023-08-15T15:09:02Z","published":"2023-05-06T10:36:39Z","title":"ANTONIO: Towards a Systematic Method of Generating NLP Benchmarks for\n Verification","summary":" Verification of machine learning models used in Natural Language Processing\n(NLP) is known to be a hard problem. In particular, many known neural network\nverification methods that work for computer vision and other numeric datasets\ndo not work for NLP. Here, we study technical reasons that underlie this\nproblem. Based on this analysis, we propose practical methods and heuristics\nfor preparing NLP datasets and models in a way that renders them amenable to\nknown verification methods based on abstract interpretation. We implement these\nmethods as a Python library called ANTONIO that links to the neural network\nverifiers ERAN and Marabou. We perform evaluation of the tool using an NLP\ndataset R-U-A-Robot suggested as a benchmark for verifying legally critical NLP\napplications. We hope that, thanks to its general applicability, this work will\nopen novel possibilities for including NLP verification problems into neural\nnetwork verification competitions, and will popularise NLP problems within this\ncommunity.\n","authors":["Marco Casadio","Luca Arnaboldi","Matthew L. 
Daggitt","Omri Isac","Tanvi Dinkar","Daniel Kienitz","Verena Rieser","Ekaterina Komendantskaya"],"pdf_url":"https://arxiv.org/pdf/2305.04003v3.pdf","comment":"To appear in proceedings of 6th Workshop on Formal Methods for\n ML-Enabled Autonomous Systems (Affiliated with CAV 2023)"},{"id":"http://arxiv.org/abs/2308.07824v1","updated":"2023-08-15T15:07:32Z","published":"2023-08-15T15:07:32Z","title":"Cerberus: A Deep Learning Hybrid Model for Lithium-Ion Battery Aging\n Estimation and Prediction Based on Relaxation Voltage Curves","summary":" The degradation process of lithium-ion batteries is intricately linked to\ntheir entire lifecycle as power sources and energy storage devices,\nencompassing aspects such as performance delivery and cycling utilization.\nConsequently, the accurate and expedient estimation or prediction of the aging\nstate of lithium-ion batteries has garnered extensive attention. Nonetheless,\nprevailing research predominantly concentrates on either aging estimation or\nprediction, neglecting the dynamic fusion of both facets. This paper proposes a\nhybrid model for capacity aging estimation and prediction based on deep\nlearning, wherein salient features highly pertinent to aging are extracted from\ncharge and discharge relaxation processes. By amalgamating historical capacity\ndecay data, the model dynamically furnishes estimations of the present capacity\nand forecasts of future capacity for lithium-ion batteries. Our approach is\nvalidated against a novel dataset involving charge and discharge cycles at\nvarying rates. Specifically, under a charging condition of 0.25C, a mean\nabsolute percentage error (MAPE) of 0.29% is achieved. This outcome underscores\nthe model's adeptness in harnessing relaxation processes commonly encountered\nin the real world and synergizing with historical capacity records within\nbattery management systems (BMS), thereby affording estimations and\nprognostications of capacity decline with heightened precision.\n","authors":["Yue Xiang","Bo Jiang","Haifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2308.07824v1.pdf","comment":"3 figures, 1 table, 9 pages"},{"id":"http://arxiv.org/abs/2308.07822v1","updated":"2023-08-15T14:56:37Z","published":"2023-08-15T14:56:37Z","title":"Deep reinforcement learning for process design: Review and perspective","summary":" The transformation towards renewable energy and feedstock supply in the\nchemical industry requires new conceptual process design approaches. Recently,\nbreakthroughs in artificial intelligence offer opportunities to accelerate this\ntransition. Specifically, deep reinforcement learning, a subclass of machine\nlearning, has shown the potential to solve complex decision-making problems and\naid sustainable process design. We survey state-of-the-art research in\nreinforcement learning for process design through three major elements: (i)\ninformation representation, (ii) agent architecture, and (iii) environment and\nreward. Moreover, we discuss perspectives on underlying challenges and\npromising future works to unfold the full potential of reinforcement learning\nfor process design in chemical engineering.\n","authors":["Qinghe Gao","Artur M. 
Schweidtmann"],"pdf_url":"https://arxiv.org/pdf/2308.07822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07817v1","updated":"2023-08-15T14:50:12Z","published":"2023-08-15T14:50:12Z","title":"Quantifying the Cost of Learning in Queueing Systems","summary":" Queueing systems are widely applicable stochastic models with use cases in\ncommunication networks, healthcare, service systems, etc. Although their\noptimal control has been extensively studied, most existing approaches assume\nperfect knowledge of system parameters. Of course, this assumption rarely holds\nin practice where there is parameter uncertainty, thus motivating a recent line\nof work on bandit learning for queueing systems. This nascent stream of\nresearch focuses on the asymptotic performance of the proposed algorithms.\n In this paper, we argue that an asymptotic metric, which focuses on\nlate-stage performance, is insufficient to capture the intrinsic statistical\ncomplexity of learning in queueing systems which typically occurs in the early\nstage. Instead, we propose the Cost of Learning in Queueing (CLQ), a new metric\nthat quantifies the maximum increase in time-averaged queue length caused by\nparameter uncertainty. We characterize the CLQ of a single-queue multi-server\nsystem, and then extend these results to multi-queue multi-server systems and\nnetworks of queues. In establishing our results, we propose a unified analysis\nframework for CLQ that bridges Lyapunov and bandit analysis, which could be of\nindependent interest.\n","authors":["Daniel Freund","Thodoris Lykouris","Wentao Weng"],"pdf_url":"https://arxiv.org/pdf/2308.07817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02229v2","updated":"2023-08-15T14:46:50Z","published":"2023-04-05T04:59:59Z","title":"Mixed Regression via Approximate Message Passing","summary":" We study the problem of regression in a generalized linear model (GLM) with\nmultiple signals and latent variables. This model, which we call a matrix GLM,\ncovers many widely studied problems in statistical learning, including mixed\nlinear regression, max-affine regression, and mixture-of-experts. In mixed\nlinear regression, each observation comes from one of $L$ signal vectors\n(regressors), but we do not know which one; in max-affine regression, each\nobservation comes from the maximum of $L$ affine functions, each defined via a\ndifferent signal vector. The goal in all these problems is to estimate the\nsignals, and possibly some of the latent variables, from the observations. We\npropose a novel approximate message passing (AMP) algorithm for estimation in a\nmatrix GLM and rigorously characterize its performance in the high-dimensional\nlimit. This characterization is in terms of a state evolution recursion, which\nallows us to precisely compute performance measures such as the asymptotic\nmean-squared error. The state evolution characterization can be used to tailor\nthe AMP algorithm to take advantage of any structural information known about\nthe signals. Using state evolution, we derive an optimal choice of AMP\n`denoising' functions that minimizes the estimation error in each iteration.\n The theoretical results are validated by numerical simulations for mixed\nlinear regression, max-affine regression, and mixture-of-experts. For\nmax-affine regression, we propose an algorithm that combines AMP with\nexpectation-maximization to estimate intercepts of the model along with the\nsignals. 
The numerical results show that AMP significantly outperforms other\nestimators for mixed linear regression and max-affine regression in most\nparameter regimes.\n","authors":["Nelvin Tan","Ramji Venkataramanan"],"pdf_url":"https://arxiv.org/pdf/2304.02229v2.pdf","comment":"44 pages. To appear in the Journal of Machine Learning Research. A\n shorter version of this paper appeared in the proceedings of AISTATS 2023"},{"id":"http://arxiv.org/abs/2301.00389v2","updated":"2023-08-15T14:33:46Z","published":"2023-01-01T11:50:58Z","title":"FedICT: Federated Multi-task Distillation for Multi-access Edge\n Computing","summary":" The growing interest in intelligent services and privacy protection for\nmobile devices has given rise to the widespread application of federated\nlearning in Multi-access Edge Computing (MEC). Diverse user behaviors call for\npersonalized services with heterogeneous Machine Learning (ML) models on\ndifferent devices. Federated Multi-task Learning (FMTL) is proposed to train\nrelated but personalized ML models for different devices, whereas previous\nworks suffer from excessive communication overhead during training and neglect\nthe model heterogeneity among devices in MEC. Introducing knowledge\ndistillation into FMTL can simultaneously enable efficient communication and\nmodel heterogeneity among clients, whereas existing methods rely on a public\ndataset, which is impractical in reality. To tackle this dilemma, Federated\nMultI-task Distillation for Multi-access Edge CompuTing (FedICT) is proposed.\nFedICT keeps local and global knowledge aloof during bi-directional distillation\nprocesses between clients and the server, aiming to enable multi-task clients\nwhile alleviating client drift derived from divergent optimization directions\nof client-side local models. Specifically, FedICT includes Federated Prior\nKnowledge Distillation (FPKD) and Local Knowledge Adjustment (LKA). FPKD is\nproposed to reinforce the clients' fitting of local data by introducing prior\nknowledge of local data distributions. Moreover, LKA is proposed to correct the\ndistillation loss of the server, making the transferred local knowledge better\nmatch the generalized representation. Experiments on three datasets show that\nFedICT significantly outperforms all compared benchmarks in various data\nheterogeneity and model architecture settings, achieving improved accuracy with\nless than 1.2% training communication overhead compared with FedAvg and no more\nthan 75% of the training communication rounds compared with FedGKT.\n","authors":["Zhiyuan Wu","Sheng Sun","Yuwei Wang","Min Liu","Quyang Pan","Xuefeng Jiang","Bo Gao"],"pdf_url":"https://arxiv.org/pdf/2301.00389v2.pdf","comment":"Accepted by IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS"},{"id":"http://arxiv.org/abs/2308.07805v1","updated":"2023-08-15T14:32:16Z","published":"2023-08-15T14:32:16Z","title":"Fairness and Privacy in Federated Learning and Their Implications in\n Healthcare","summary":" Currently, many contexts exist where distributed learning is difficult or\notherwise constrained by security and communication limitations. One common\ndomain where this is a consideration is in Healthcare where data is often\ngoverned by data-use-ordinances like HIPAA. On the other hand, larger sample\nsizes and shared data models are necessary to allow models to better generalize\non account of the potential for more variability and balancing underrepresented\nclasses. 
Federated learning is a type of distributed learning model that allows\ndata to be trained in a decentralized manner. This, in turn, addresses data\nsecurity, privacy, and vulnerability considerations as data itself is not\nshared across a given learning network nodes. Three main challenges to\nfederated learning include node data is not independent and identically\ndistributed (iid), clients requiring high levels of communication overhead\nbetween peers, and there is the heterogeneity of different clients within a\nnetwork with respect to dataset bias and size. As the field has grown, the\nnotion of fairness in federated learning has also been introduced through novel\nimplementations. Fairness approaches differ from the standard form of federated\nlearning and also have distinct challenges and considerations for the\nhealthcare domain. This paper endeavors to outline the typical lifecycle of\nfair federated learning in research as well as provide an updated taxonomy to\naccount for the current state of fairness in implementations. Lastly, this\npaper provides added insight into the implications and challenges of\nimplementing and supporting fairness in federated learning in the healthcare\ndomain.\n","authors":["Navya Annapareddy","Jade Preston","Judy Fox"],"pdf_url":"https://arxiv.org/pdf/2308.07805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07797v1","updated":"2023-08-15T14:21:53Z","published":"2023-08-15T14:21:53Z","title":"Adaptive Noise Covariance Estimation under Colored Noise using Dynamic\n Expectation Maximization","summary":" The accurate estimation of the noise covariance matrix (NCM) in a dynamic\nsystem is critical for state estimation and control, as it has a major\ninfluence in their optimality. Although a large number of NCM estimation\nmethods have been developed, most of them assume the noises to be white.\nHowever, in many real-world applications, the noises are colored (e.g., they\nexhibit temporal autocorrelations), resulting in suboptimal solutions. Here, we\nintroduce a novel brain-inspired algorithm that accurately and adaptively\nestimates the NCM for dynamic systems subjected to colored noise. Particularly,\nwe extend the Dynamic Expectation Maximization algorithm to perform both online\nnoise covariance and state estimation by optimizing the free energy objective.\nWe mathematically prove that our NCM estimator converges to the global optimum\nof this free energy objective. Using randomized numerical simulations, we show\nthat our estimator outperforms nine baseline methods with minimal noise\ncovariance estimation error under colored noise conditions. Notably, we show\nthat our method outperforms the best baseline (Variational Bayes) in joint\nnoise and state estimation for high colored noise. We foresee that the accuracy\nand the adaptive nature of our estimator make it suitable for online estimation\nin real-world applications.\n","authors":["Ajith Anil Meera","Pablo Lanillos"],"pdf_url":"https://arxiv.org/pdf/2308.07797v1.pdf","comment":"62nd IEEE Conference on Decision and Control"},{"id":"http://arxiv.org/abs/2308.07791v1","updated":"2023-08-15T14:16:29Z","published":"2023-08-15T14:16:29Z","title":"Informed Named Entity Recognition Decoding for Generative Language\n Models","summary":" Ever-larger language models with ever-increasing capabilities are by now\nwell-established text processing tools. 
Alas, information extraction tasks such\nas named entity recognition are still largely unaffected by this progress as\nthey are primarily based on the previous generation of encoder-only transformer\nmodels. Here, we propose a simple yet effective approach, Informed Named Entity\nRecognition Decoding (iNERD), which treats named entity recognition as a\ngenerative process. It leverages the language understanding capabilities of\nrecent generative models in a future-proof manner and employs an informed\ndecoding scheme incorporating the restricted nature of information extraction\ninto open-ended text generation, improving performance and eliminating any risk\nof hallucinations. We coarse-tune our model on a merged named entity corpus to\nstrengthen its performance, evaluate five generative language models on eight\nnamed entity recognition datasets, and achieve remarkable results, especially\nin an environment with an unknown entity class set, demonstrating the\nadaptability of the approach.\n","authors":["Tobias Deußer","Lars Hillebrand","Christian Bauckhage","Rafet Sifa"],"pdf_url":"https://arxiv.org/pdf/2308.07791v1.pdf","comment":"12 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.07787v1","updated":"2023-08-15T14:07:41Z","published":"2023-08-15T14:07:41Z","title":"DiffV2S: Diffusion-based Video-to-Speech Synthesis with Vision-guided\n Speaker Embedding","summary":" Recent research has demonstrated impressive results in video-to-speech\nsynthesis which involves reconstructing speech solely from visual input.\nHowever, previous works have struggled to accurately synthesize speech due to a\nlack of sufficient guidance for the model to infer the correct content with the\nappropriate sound. To resolve the issue, they have adopted an extra speaker\nembedding as a speaking style guidance from a reference auditory information.\nNevertheless, it is not always possible to obtain the audio information from\nthe corresponding video input, especially during the inference time. In this\npaper, we present a novel vision-guided speaker embedding extractor using a\nself-supervised pre-trained model and prompt tuning technique. In doing so, the\nrich speaker embedding information can be produced solely from input visual\ninformation, and the extra audio information is not necessary during the\ninference time. Using the extracted vision-guided speaker embedding\nrepresentations, we further develop a diffusion-based video-to-speech synthesis\nmodel, so called DiffV2S, conditioned on those speaker embeddings and the\nvisual representation extracted from the input video. The proposed DiffV2S not\nonly maintains phoneme details contained in the input video frames, but also\ncreates a highly intelligible mel-spectrogram in which the speaker identities\nof the multiple speakers are all preserved. 
Our experimental results show that\nDiffV2S achieves the state-of-the-art performance compared to the previous\nvideo-to-speech synthesis technique.\n","authors":["Jeongsoo Choi","Joanna Hong","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2308.07787v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07775v1","updated":"2023-08-15T13:51:03Z","published":"2023-08-15T13:51:03Z","title":"Hierarchical generative modelling for autonomous robots","summary":" Humans can produce complex whole-body motions when interacting with their\nsurroundings, by planning, executing and combining individual limb movements.\nWe investigated this fundamental aspect of motor control in the setting of\nautonomous robotic operations. We approach this problem by hierarchical\ngenerative modelling equipped with multi-level planning-for autonomous task\ncompletion-that mimics the deep temporal architecture of human motor control.\nHere, temporal depth refers to the nested time scales at which successive\nlevels of a forward or generative model unfold, for example, delivering an\nobject requires a global plan to contextualise the fast coordination of\nmultiple local movements of limbs. This separation of temporal scales also\nmotivates robotics and control. Specifically, to achieve versatile sensorimotor\ncontrol, it is advantageous to hierarchically structure the planning and\nlow-level motor control of individual limbs. We use numerical and physical\nsimulation to conduct experiments and to establish the efficacy of this\nformulation. Using a hierarchical generative model, we show how a humanoid\nrobot can autonomously complete a complex task that necessitates a holistic use\nof locomotion, manipulation, and grasping. Specifically, we demonstrate the\nability of a humanoid robot that can retrieve and transport a box, open and\nwalk through a door to reach the destination, approach and kick a football,\nwhile showing robust performance in presence of body damage and ground\nirregularities. Our findings demonstrated the effectiveness of using\nhuman-inspired motor control algorithms, and our method provides a viable\nhierarchical architecture for the autonomous completion of challenging\ngoal-directed tasks.\n","authors":["Kai Yuan","Noor Sajid","Karl Friston","Zhibin Li"],"pdf_url":"https://arxiv.org/pdf/2308.07775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07774v1","updated":"2023-08-15T13:49:12Z","published":"2023-08-15T13:49:12Z","title":"A Graph Encoder-Decoder Network for Unsupervised Anomaly Detection","summary":" A key component of many graph neural networks (GNNs) is the pooling\noperation, which seeks to reduce the size of a graph while preserving important\nstructural information. However, most existing graph pooling strategies rely on\nan assignment matrix obtained by employing a GNN layer, which is characterized\nby trainable parameters, often leading to significant computational complexity\nand a lack of interpretability in the pooling process. In this paper, we\npropose an unsupervised graph encoder-decoder model to detect abnormal nodes\nfrom graphs by learning an anomaly scoring function to rank nodes based on\ntheir degree of abnormality. In the encoding stage, we design a novel pooling\nmechanism, named LCPool, which leverages locality-constrained linear coding for\nfeature encoding to find a cluster assignment matrix by solving a least-squares\noptimization problem with a locality regularization term. 
By enforcing locality\nconstraints during the coding process, LCPool is designed to be free from\nlearnable parameters, capable of efficiently handling large graphs, and can\neffectively generate a coarser graph representation while retaining the most\nsignificant structural characteristics of the graph. In the decoding stage, we\npropose an unpooling operation, called LCUnpool, to reconstruct both the\nstructure and nodal features of the original graph. We conduct empirical\nevaluations of our method on six benchmark datasets using several evaluation\nmetrics, and the results demonstrate its superiority over state-of-the-art\nanomaly detection approaches.\n","authors":["Mahsa Mesgaran","A. Ben Hamza"],"pdf_url":"https://arxiv.org/pdf/2308.07774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07772v1","updated":"2023-08-15T13:48:16Z","published":"2023-08-15T13:48:16Z","title":"MOLE: MOdular Learning FramEwork via Mutual Information Maximization","summary":" This paper is to introduce an asynchronous and local learning framework for\nneural networks, named Modular Learning Framework (MOLE). This framework\nmodularizes neural networks by layers, defines the training objective via\nmutual information for each module, and sequentially trains each module by\nmutual information maximization. MOLE makes the training become local\noptimization with gradient-isolated across modules, and this scheme is more\nbiologically plausible than BP. We run experiments on vector-, grid- and\ngraph-type data. In particular, this framework is capable of solving both\ngraph- and node-level tasks for graph-type data. Therefore, MOLE has been\nexperimentally proven to be universally applicable to different types of data.\n","authors":["Tianchao Li","Yulong Pei"],"pdf_url":"https://arxiv.org/pdf/2308.07772v1.pdf","comment":"accepted by icml llw"},{"id":"http://arxiv.org/abs/2308.07761v1","updated":"2023-08-15T13:29:14Z","published":"2023-08-15T13:29:14Z","title":"NeFL: Nested Federated Learning for Heterogeneous Clients","summary":" Federated learning (FL) is a promising approach in distributed learning\nkeeping privacy. However, during the training pipeline of FL, slow or incapable\nclients (i.e., stragglers) slow down the total training time and degrade\nperformance. System heterogeneity, including heterogeneous computing and\nnetwork bandwidth, has been addressed to mitigate the impact of stragglers.\nPrevious studies split models to tackle the issue, but with less\ndegree-of-freedom in terms of model architecture. We propose nested federated\nlearning (NeFL), a generalized framework that efficiently divides a model into\nsubmodels using both depthwise and widthwise scaling. NeFL is implemented by\ninterpreting models as solving ordinary differential equations (ODEs) with\nadaptive step sizes. To address the inconsistency that arises when training\nmultiple submodels with different architecture, we decouple a few parameters.\nNeFL enables resource-constrained clients to effectively join the FL pipeline\nand the model to be trained with a larger amount of data. Through a series of\nexperiments, we demonstrate that NeFL leads to significant gains, especially\nfor the worst-case submodel (e.g., 8.33 improvement on CIFAR-10). 
Furthermore,\nwe demonstrate NeFL aligns with recent studies in FL.\n","authors":["Honggu Kang","Seohyeon Cha","Jinwoo Shin","Jongmyeong Lee","Joonhyuk Kang"],"pdf_url":"https://arxiv.org/pdf/2308.07761v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2308.07758v1","updated":"2023-08-15T13:19:59Z","published":"2023-08-15T13:19:59Z","title":"Backward Reasoning in Large Language Models for Verification","summary":" Chain-of-Thought (CoT) prompting has shown promising performance in various\nreasoning tasks. Recently, Self-Consistency \citep{wang2023selfconsistency}\nproposes to sample a diverse set of reasoning chains which may lead to\ndifferent answers while the answer that receives the most votes is selected. In\nthis paper, we propose a novel method to use backward reasoning in verifying\ncandidate answers. We mask a token in the question by ${\bf x}$ and ask the LLM\nto predict the masked token when a candidate answer is provided by \textit{a\nsimple template}, i.e., ``\textit{\textbf{If we know the answer of the above\nquestion is \{a candidate answer\}, what is the value of unknown variable ${\bf\nx}$?}}'' Intuitively, the LLM is expected to predict the masked token\nsuccessfully if the provided candidate answer is correct. We further propose\nFOBAR to combine forward and backward reasoning for estimating the probability\nof candidate answers. We conduct extensive experiments on six data sets and\nthree LLMs. Experimental results demonstrate that FOBAR achieves\nstate-of-the-art performance on various reasoning benchmarks.\n","authors":["Weisen Jiang","Han Shi","Longhui Yu","Zhengying Liu","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2308.07758v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.07748v1","updated":"2023-08-15T12:58:06Z","published":"2023-08-15T12:58:06Z","title":"Exploiting Sparsity in Automotive Radar Object Detection Networks","summary":" Having precise perception of the environment is crucial for ensuring the\nsecure and reliable functioning of autonomous driving systems. Radar object\ndetection networks are one fundamental part of such systems. CNN-based object\ndetectors showed good performance in this context, but they require large\ncompute resources. This paper investigates sparse convolutional object\ndetection networks, which combine powerful grid-based detection with low\ncompute resources. We investigate radar-specific challenges and propose sparse\nkernel point pillars (SKPP) and dual voxel point convolutions (DVPC) as\nremedies for the grid rendering and sparse backbone architectures. We evaluate\nour SKPP-DPVCN architecture on nuScenes, which outperforms the baseline by\n5.89% and the previous state of the art by 4.19% in Car AP4.0. Moreover,\nSKPP-DPVCN reduces the average scale error (ASE) by 21.41% over the baseline.\n","authors":["Marius Lippke","Maurice Quach","Sascha Braun","Daniel Köhler","Michael Ulrich","Bastian Bischoff","Wei Yap Tan"],"pdf_url":"https://arxiv.org/pdf/2308.07748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07741v1","updated":"2023-08-15T12:40:56Z","published":"2023-08-15T12:40:56Z","title":"Real Robot Challenge 2022: Learning Dexterous Manipulation from Offline\n Data in the Real World","summary":" Experimentation on real robots is demanding in terms of time and costs. For\nthis reason, a large part of the reinforcement learning (RL) community uses\nsimulators to develop and benchmark algorithms. 
However, insights gained in\nsimulation do not necessarily translate to real robots, in particular for tasks\ninvolving complex interactions with the environment. The Real Robot Challenge\n2022 therefore served as a bridge between the RL and robotics communities by\nallowing participants to experiment remotely with a real robot - as easily as\nin simulation.\n In the last years, offline reinforcement learning has matured into a\npromising paradigm for learning from pre-collected datasets, alleviating the\nreliance on expensive online interactions. We therefore asked the participants\nto learn two dexterous manipulation tasks involving pushing, grasping, and\nin-hand orientation from provided real-robot datasets. An extensive software\ndocumentation and an initial stage based on a simulation of the real set-up\nmade the competition particularly accessible. By giving each team plenty of\naccess budget to evaluate their offline-learned policies on a cluster of seven\nidentical real TriFinger platforms, we organized an exciting competition for\nmachine learners and roboticists alike.\n In this work we state the rules of the competition, present the methods used\nby the winning teams and compare their results with a benchmark of\nstate-of-the-art offline RL algorithms on the challenge datasets.\n","authors":["Nico Gürtler","Felix Widmaier","Cansu Sancaktar","Sebastian Blaes","Pavel Kolev","Stefan Bauer","Manuel Wüthrich","Markus Wulfmeier","Martin Riedmiller","Arthur Allshire","Qiang Wang","Robert McCarthy","Hangyeol Kim","Jongchan Baek Pohang","Wookyong Kwon","Shanliang Qian","Yasunori Toshimitsu","Mike Yan Michelis","Amirhossein Kazemipour","Arman Raayatsanati","Hehui Zheng","Barnabasa Gavin Cangan","Bernhard Schölkopf","Georg Martius"],"pdf_url":"https://arxiv.org/pdf/2308.07741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.06591v2","updated":"2023-08-15T12:23:03Z","published":"2022-10-12T21:10:55Z","title":"Rigorous dynamical mean field theory for stochastic gradient descent\n methods","summary":" We prove closed-form equations for the exact high-dimensional asymptotics of\na family of first order gradient-based methods, learning an estimator (e.g.\nM-estimator, shallow neural network, ...) from observations on Gaussian data\nwith empirical risk minimization. This includes widely used algorithms such as\nstochastic gradient descent (SGD) or Nesterov acceleration. The obtained\nequations match those resulting from the discretization of dynamical mean-field\ntheory (DMFT) equations from statistical physics when applied to gradient flow.\nOur proof method allows us to give an explicit description of how memory\nkernels build up in the effective dynamics, and to include non-separable update\nfunctions, allowing datasets with non-identity covariance matrices. 
Finally, we\nprovide numerical implementations of the equations for SGD with generic\nextensive batch-size and with constant learning rates.\n","authors":["Cedric Gerbelot","Emanuele Troiani","Francesca Mignacco","Florent Krzakala","Lenka Zdeborova"],"pdf_url":"https://arxiv.org/pdf/2210.06591v2.pdf","comment":"38 pages, 4 figures"},{"id":"http://arxiv.org/abs/2304.07769v2","updated":"2023-08-15T12:14:43Z","published":"2023-04-16T13:05:39Z","title":"Spot The Odd One Out: Regularized Complete Cycle Consistent Anomaly\n Detector GAN","summary":" This study presents an adversarial method for anomaly detection in real-world\napplications, leveraging the power of generative adversarial neural networks\n(GANs) through cycle consistency in reconstruction error. Previous methods\nsuffer from the high variance between class-wise accuracy which leads to not\nbeing applicable for all types of anomalies. The proposed method named RCALAD\ntries to solve this problem by introducing a novel discriminator to the\nstructure, which results in a more efficient training process. Additionally,\nRCALAD employs a supplementary distribution in the input space to steer\nreconstructions toward the normal data distribution, effectively separating\nanomalous samples from their reconstructions and facilitating more accurate\nanomaly detection. To further enhance the performance of the model, two novel\nanomaly scores are introduced. The proposed model has been thoroughly evaluated\nthrough extensive experiments on six various datasets, yielding results that\ndemonstrate its superiority over existing state-of-the-art models. The code is\nreadily available to the research community at\nhttps://github.com/zahraDehghanian97/RCALAD.\n","authors":["Zahra Dehghanian","Saeed Saravani","Maryam Amirmazlaghani","Mohammad Rahmati"],"pdf_url":"https://arxiv.org/pdf/2304.07769v2.pdf","comment":"under revision of Applied Soft Computing Journal"},{"id":"http://arxiv.org/abs/2308.07728v1","updated":"2023-08-15T12:08:43Z","published":"2023-08-15T12:08:43Z","title":"Domain-Aware Fine-Tuning: Enhancing Neural Network Adaptability","summary":" Fine-tuning pre-trained neural network models has become a widely adopted\napproach across various domains. However, it can lead to the distortion of\npre-trained feature extractors that already possess strong generalization\ncapabilities. Mitigating feature distortion during adaptation to new target\ndomains is crucial. Recent studies have shown promising results in handling\nfeature distortion by aligning the head layer on in-distribution datasets\nbefore performing fine-tuning. Nonetheless, a significant limitation arises\nfrom the treatment of batch normalization layers during fine-tuning, leading to\nsuboptimal performance. In this paper, we propose Domain-Aware Fine-Tuning\n(DAFT), a novel approach that incorporates batch normalization conversion and\nthe integration of linear probing and fine-tuning. Our batch normalization\nconversion method effectively mitigates feature distortion by reducing\nmodifications to the neural network during fine-tuning. Additionally, we\nintroduce the integration of linear probing and fine-tuning to optimize the\nhead layer with gradual adaptation of the feature extractor. By leveraging\nbatch normalization layers and integrating linear probing and fine-tuning, our\nDAFT significantly mitigates feature distortion and achieves improved model\nperformance on both in-distribution and out-of-distribution datasets. 
Extensive\nexperiments demonstrate that our method outperforms other baseline methods,\ndemonstrating its effectiveness in not only improving performance but also\nmitigating feature distortion.\n","authors":["Seokhyeon Ha","Sunbeom Jung","Jungwoo Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13163v2","updated":"2023-08-15T11:48:49Z","published":"2023-02-25T21:17:19Z","title":"Achieving High Accuracy with PINNs via Energy Natural Gradients","summary":" We propose energy natural gradient descent, a natural gradient method with\nrespect to a Hessian-induced Riemannian metric as an optimization algorithm for\nphysics-informed neural networks (PINNs) and the deep Ritz method. As a main\nmotivation we show that the update direction in function space resulting from\nthe energy natural gradient corresponds to the Newton direction modulo an\northogonal projection onto the model's tangent space. We demonstrate\nexperimentally that energy natural gradient descent yields highly accurate\nsolutions with errors several orders of magnitude smaller than what is obtained\nwhen training PINNs with standard optimizers like gradient descent or Adam,\neven when those are allowed significantly more computation time.\n","authors":["Johannes Müller","Marius Zeinhofer"],"pdf_url":"https://arxiv.org/pdf/2302.13163v2.pdf","comment":"Published version"},{"id":"http://arxiv.org/abs/2306.01991v2","updated":"2023-08-15T11:42:05Z","published":"2023-06-03T03:36:47Z","title":"A Bio-Inspired Chaos Sensor Model Based on the Perceptron Neural\n Network: Machine Learning Concept and Application for Computational\n Neuro-Science","summary":" The study presents a bio-inspired chaos sensor model based on the perceptron\nneural network for the estimation of entropy of spike train in neurodynamic\nsystems. After training, the sensor on perceptron, having 50 neurons in the\nhidden layer and 1 neuron at the output, approximates the fuzzy entropy of a\nshort time series with high accuracy, with a determination coefficient of R2 ~\n0.9. The Hindmarsh-Rose spike model was used to generate time series of spike\nintervals, and datasets for training and testing the perceptron. The selection\nof the hyperparameters of the perceptron model and the estimation of the sensor\naccuracy were performed using the K-block cross-validation method. Even for a\nhidden layer with one neuron, the model approximates the fuzzy entropy with\ngood results and the metric R2 ~ 0.5-0.8. In a simplified model with one neuron\nand equal weights in the first layer, the principle of approximation is based\non the linear transformation of the average value of the time series into the\nentropy value. An example of using the chaos sensor on spike train of action\npotential recordings from the L5 dorsal rootlet of rat is provided. 
The\nbio-inspired chaos sensor model based on an ensemble of neurons is able to\ndynamically track the chaotic behavior of a spike signal and transmit this\ninformation to other parts of the neurodynamic model for further processing.\nThe study will be useful for specialists in the field of computational\nneuroscience, and also to create humanoid and animal robots, and bio-robots\nwith limited resources.\n","authors":["Andrei Velichko","Petr Boriskov","Maksim Belyaev","Vadim Putrolaynen"],"pdf_url":"https://arxiv.org/pdf/2306.01991v2.pdf","comment":"28 pages, 15 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.07707v1","updated":"2023-08-15T11:30:45Z","published":"2023-08-15T11:30:45Z","title":"Fast Machine Unlearning Without Retraining Through Selective Synaptic\n Dampening","summary":" Machine unlearning, the ability for a machine learning model to forget, is\nbecoming increasingly important to comply with data privacy regulations, as\nwell as to remove harmful, manipulated, or outdated information. The key\nchallenge lies in forgetting specific information while protecting model\nperformance on the remaining data. While current state-of-the-art methods\nperform well, they typically require some level of retraining over the retained\ndata, in order to protect or restore model performance. This adds computational\noverhead and mandates that the training data remain available and accessible,\nwhich may not be feasible. In contrast, other methods employ a retrain-free\nparadigm, however, these approaches are prohibitively computationally expensive\nand do not perform on par with their retrain-based counterparts. We present\nSelective Synaptic Dampening (SSD), a novel two-step, post hoc, retrain-free\napproach to machine unlearning which is fast, performant, and does not require\nlong-term storage of the training data. First, SSD uses the Fisher information\nmatrix of the training and forgetting data to select parameters that are\ndisproportionately important to the forget set. Second, SSD induces forgetting\nby dampening these parameters proportional to their relative importance to the\nforget set with respect to the wider training data. We evaluate our method\nagainst several existing unlearning methods in a range of experiments using\nResNet18 and Vision Transformer. Results show that the performance of SSD is\ncompetitive with retrain-based post hoc methods, demonstrating the viability of\nretrain-free post hoc unlearning approaches.\n","authors":["Jack Foster","Stefan Schoepf","Alexandra Brintrup"],"pdf_url":"https://arxiv.org/pdf/2308.07707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07706v1","updated":"2023-08-15T11:28:21Z","published":"2023-08-15T11:28:21Z","title":"Exploring Transfer Learning in Medical Image Segmentation using\n Vision-Language Models","summary":" Medical Image Segmentation is crucial in various clinical applications within\nthe medical domain. While state-of-the-art segmentation models have proven\neffective, integrating textual guidance to enhance visual features for this\ntask remains an area with limited progress. Existing segmentation models that\nutilize textual guidance are primarily trained on open-domain images, raising\nconcerns about their direct applicability in the medical domain without manual\nintervention or fine-tuning.\n To address these challenges, we propose using multimodal vision-language\nmodels for capturing semantic information from image descriptions and images,\nenabling the segmentation of diverse medical images. 
This study comprehensively\nevaluates existing vision language models across multiple datasets to assess\ntheir transferability from the open domain to the medical field. Furthermore,\nwe introduce variations of image descriptions for previously unseen images in\nthe dataset, revealing notable variations in model performance based on the\ngenerated prompts.\n Our findings highlight the distribution shift between the open-domain images\nand the medical domain and show that the segmentation models trained on\nopen-domain images are not directly transferrable to the medical field. But\ntheir performance can be increased by finetuning them in the medical datasets.\nWe report the zero-shot and finetuned segmentation performance of 4 Vision\nLanguage Models (VLMs) on 11 medical datasets using 9 types of prompts derived\nfrom 14 attributes.\n","authors":["Kanchan Poudel","Manish Dhakal","Prasiddha Bhandari","Rabin Adhikari","Safal Thapaliya","Bishesh Khanal"],"pdf_url":"https://arxiv.org/pdf/2308.07706v1.pdf","comment":"25 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.07705v1","updated":"2023-08-15T11:28:02Z","published":"2023-08-15T11:28:02Z","title":"Parametric entropy based Cluster Centriod Initialization for k-means\n clustering of various Image datasets","summary":" One of the most employed yet simple algorithm for cluster analysis is the\nk-means algorithm. k-means has successfully witnessed its use in artificial\nintelligence, market segmentation, fraud detection, data mining, psychology,\netc., only to name a few. The k-means algorithm, however, does not always yield\nthe best quality results. Its performance heavily depends upon the number of\nclusters supplied and the proper initialization of the cluster centroids or\nseeds. In this paper, we conduct an analysis of the performance of k-means on\nimage data by employing parametric entropies in an entropy based centroid\ninitialization method and propose the best fitting entropy measures for general\nimage datasets. We use several entropies like Taneja entropy, Kapur entropy,\nAczel Daroczy entropy, Sharma Mittal entropy. We observe that for different\ndatasets, different entropies provide better results than the conventional\nmethods. We have applied our proposed algorithm on these datasets: Satellite,\nToys, Fruits, Cars, Brain MRI, Covid X-Ray.\n","authors":["Faheem Hussayn","Shahid M Shah"],"pdf_url":"https://arxiv.org/pdf/2308.07705v1.pdf","comment":"6 Pages, 2 tables, one algorithm. Accepted for publication in IEEE\n International Conference on Signal Processing and Computer Vision (SPCV-2023)"},{"id":"http://arxiv.org/abs/2210.15081v3","updated":"2023-08-15T11:20:59Z","published":"2022-10-26T23:34:30Z","title":"Bayesian Hyperbolic Multidimensional Scaling","summary":" Multidimensional scaling (MDS) is a widely used approach to representing\nhigh-dimensional, dependent data. MDS works by assigning each observation a\nlocation on a low-dimensional geometric manifold, with distance on the manifold\nrepresenting similarity. We propose a Bayesian approach to multidimensional\nscaling when the low-dimensional manifold is hyperbolic. Using hyperbolic space\nfacilitates representing tree-like structures common in many settings (e.g.\ntext or genetic data with hierarchical structure). A Bayesian approach provides\nregularization that minimizes the impact of measurement error in the observed\ndata and assesses uncertainty. 
We also propose a case-control likelihood\napproximation that allows for efficient sampling from the posterior\ndistribution in larger data settings, reducing computational complexity from\napproximately $O(n^2)$ to $O(n)$. We evaluate the proposed method against\nstate-of-the-art alternatives using simulations, canonical reference datasets,\nIndian village network data, and human gene expression data.\n","authors":["Bolun Liu","Shane Lubold","Adrian E. Raftery","Tyler H. McCormick"],"pdf_url":"https://arxiv.org/pdf/2210.15081v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.01206v2","updated":"2023-08-15T11:13:59Z","published":"2022-06-01T20:16:32Z","title":"Positive Unlabeled Contrastive Learning","summary":" Self-supervised pretraining on unlabeled data followed by supervised\nfine-tuning on labeled data is a popular paradigm for learning from limited\nlabeled examples. We extend this paradigm to the classical positive unlabeled\n(PU) setting, where the task is to learn a binary classifier given only a few\nlabeled positive samples, and (often) a large amount of unlabeled samples\n(which could be positive or negative).\n We first propose a simple extension of standard infoNCE family of contrastive\nlosses, to the PU setting; and show that this learns superior representations,\nas compared to existing unsupervised and supervised approaches. We then develop\na simple methodology to pseudo-label the unlabeled samples using a new\nPU-specific clustering scheme; these pseudo-labels can then be used to train\nthe final (positive vs. negative) classifier. Our method handily outperforms\nstate-of-the-art PU methods over several standard PU benchmark datasets, while\nnot requiring a-priori knowledge of any class prior (which is a common\nassumption in other PU methods). We also provide a simple theoretical analysis\nthat motivates our methods.\n","authors":["Anish Acharya","Sujay Sanghavi","Li Jing","Bhargav Bhushanam","Michael Rabbat","Inderjit Dhillon"],"pdf_url":"https://arxiv.org/pdf/2206.01206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13917v2","updated":"2023-08-15T11:07:44Z","published":"2023-04-27T02:01:24Z","title":"Proportionally Representative Clustering","summary":" In recent years, there has been a surge in effort to formalize notions of\nfairness in machine learning. We focus on clustering -- one of the fundamental\ntasks in unsupervised machine learning. We propose a new axiom ``proportional\nrepresentation fairness'' (PRF) that is designed for clustering problems where\nthe selection of centroids reflects the distribution of data points and how\ntightly they are clustered together. Our fairness concept is not satisfied by\nexisting fair clustering algorithms. We design efficient algorithms to achieve\nPRF both for unconstrained and discrete clustering problems. Our algorithm for\nthe unconstrained setting is also the first known polynomial-time approximation\nalgorithm for the well-studied Proportional Fairness (PF) axiom (Chen, Fain,\nLyu, and Munagala, ICML, 2019). Our algorithm for the discrete setting also\nmatches the best known approximation factor for PF.\n","authors":["Haris Aziz","Barton E. 
Lee","Sean Morota Chu","Jeremy Vollen"],"pdf_url":"https://arxiv.org/pdf/2304.13917v2.pdf","comment":"Revised version includes a new author (Jeremy Vollen) and new\n results: Our algorithm for the unconstrained setting is also the first known\n polynomial-time approximation algorithm for the well-studied Proportional\n Fairness (PF) axiom (Chen, Fain, Lyu, and Munagala, ICML, 2019). Our\n algorithm for the discrete setting also matches the best known approximation\n factor for PF"},{"id":"http://arxiv.org/abs/2306.05694v2","updated":"2023-08-15T11:00:06Z","published":"2023-06-09T06:30:25Z","title":"Explainable Representation Learning of Small Quantum States","summary":" Unsupervised machine learning models build an internal representation of\ntheir training data without the need for explicit human guidance or feature\nengineering. This learned representation provides insights into which features\nof the data are relevant for the task at hand. In the context of quantum\nphysics, training models to describe quantum states without human intervention\noffers a promising approach to gaining insight into how machines represent\ncomplex quantum states. The ability to interpret the learned representation may\noffer a new perspective on non-trivial features of quantum systems and their\nefficient representation. We train a generative model on two-qubit density\nmatrices generated by a parameterized quantum circuit. In a series of\ncomputational experiments, we investigate the learned representation of the\nmodel and its internal understanding of the data. We observe that the model\nlearns an interpretable representation which relates the quantum states to\ntheir underlying entanglement characteristics. In particular, our results\ndemonstrate that the latent representation of the model is directly correlated\nwith the entanglement measure concurrence. The insights from this study\nrepresent proof of concept towards interpretable machine learning of quantum\nstates. Our approach offers insight into how machines learn to represent\nsmall-scale quantum systems autonomously.\n","authors":["Felix Frohnert","Evert van Nieuwenburg"],"pdf_url":"https://arxiv.org/pdf/2306.05694v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07071v2","updated":"2023-08-15T10:55:53Z","published":"2023-06-12T12:35:16Z","title":"Budgeted Multi-Armed Bandits with Asymmetric Confidence Intervals","summary":" We study the stochastic Budgeted Multi-Armed Bandit (MAB) problem, where a\nplayer chooses from $K$ arms with unknown expected rewards and costs. The goal\nis to maximize the total reward under a budget constraint. A player thus seeks\nto choose the arm with the highest reward-cost ratio as often as possible.\nCurrent state-of-the-art policies for this problem have several issues, which\nwe illustrate. To overcome them, we propose a new upper confidence bound (UCB)\nsampling policy, $\\omega$-UCB, that uses asymmetric confidence intervals. These\nintervals scale with the distance between the sample mean and the bounds of a\nrandom variable, yielding a more accurate and tight estimation of the\nreward-cost ratio compared to our competitors. 
We show that our approach has\nlogarithmic regret and consistently outperforms existing policies in synthetic\nand real settings.\n","authors":["Marco Heyden","Vadim Arzamasov","Edouard Fouché","Klemens Böhm"],"pdf_url":"https://arxiv.org/pdf/2306.07071v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13709v3","updated":"2023-08-15T10:55:23Z","published":"2023-07-24T20:56:42Z","title":"Deep Bradley-Terry Rating: Quantifying Properties from Comparisons","summary":" Many properties in the real world can't be directly observed, making them\ndifficult to learn. To deal with this challenging problem, prior works have\nprimarily focused on estimating those properties by using graded human scores\nas the target label in the training. Meanwhile, rating algorithms based on the\nBradley-Terry model are extensively studied to evaluate the competitiveness of\nplayers based on their match history. In this paper, we introduce the Deep\nBradley-Terry Rating (DBTR), a novel machine learning framework designed to\nquantify and evaluate properties of unknown items. Our method seamlessly\nintegrates the Bradley-Terry model into the neural network structure. Moreover,\nwe generalize this architecture further to asymmetric environments with\nunfairness, a condition more commonly encountered in real-world settings.\nThrough experimental analysis, we demonstrate that DBTR successfully learns to\nquantify and estimate desired properties.\n","authors":["Satoru Fujii"],"pdf_url":"https://arxiv.org/pdf/2307.13709v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.13925v2","updated":"2023-08-15T10:49:26Z","published":"2022-12-25T14:49:37Z","title":"Quality at the Tail","summary":" Benchmarking and evaluating deep learning models and systems necessitate a\nmeticulous approach to ensure comprehensive assessment. In practical\napplications, it is paramount to consider both the inference quality and the\ninference time, particularly within critical contexts, where stringent\nrequirements demand the simultaneous satisfaction of both metrics. Neglecting\neither aspect can result in severe and irreversible consequences, including\nloss of human life and property damage. Unfortunately, many studies lack a\ncomprehensive consideration of these metrics, often conducted under ideal or\npermissive conditions, thereby leading to incomplete or non-intuitive\nevaluation methodologies.\n This study reveals that deep learning inference quality exhibits\nfluctuations, which further introduces complications and challenges to the\nbenchmarking and evaluation. To better characterize the phenomenon, the concept\nof \"tail quality\" is introduced, which indicates the quality at the tail of\ndistributions. \"Tail quality\" can offer a more objective evaluation, overcoming\nthe limitations of conventional inference quality and inference time metrics in\ncapturing the quality fluctuation phenomenon. To capture the phenomenon, this\npaper also proposes a pioneering evaluation framework for comprehensive\nassessment and analysis of various factors affecting inference time and\nquality. Leveraging this framework enables the anticipation of the potential\ndistribution of inference time and inference quality, thus capturing \"tail\nquality\" before practically applying deep learning. The effectiveness of the\nevaluation framework is validated through experiments conducted on deep\nlearning models for three different tasks across four systems. 
Furthermore,\nemploying this evaluation framework, the experiments conducted a preliminary\nanalysis of several factors influencing inference quality and inference time.\n","authors":["Zhengxin Yang","Wanling Gao","Chunjie Luo","Lei Wang","Fei Tang","Xu Wen","Jianfeng Zhan"],"pdf_url":"https://arxiv.org/pdf/2212.13925v2.pdf","comment":"11 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2203.16331v2","updated":"2023-08-15T10:39:41Z","published":"2022-03-28T21:13:24Z","title":"FlexFringe: Modeling Software Behavior by Learning Probabilistic\n Automata","summary":" We present the efficient implementations of probabilistic deterministic\nfinite automaton learning methods available in FlexFringe. These implement\nwell-known strategies for state-merging including several modifications to\nimprove their performance in practice. We show experimentally that these\nalgorithms obtain competitive results and significant improvements over a\ndefault implementation. We also demonstrate how to use FlexFringe to learn\ninterpretable models from software logs and use these for anomaly detection.\nAlthough less interpretable, we show that learning smaller more convoluted\nmodels improves the performance of FlexFringe on anomaly detection,\noutperforming an existing solution based on neural nets.\n","authors":["Sicco Verwer","Christian Hammerschmidt"],"pdf_url":"https://arxiv.org/pdf/2203.16331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07688v1","updated":"2023-08-15T10:37:13Z","published":"2023-08-15T10:37:13Z","title":"Enhancing Network Initialization for Medical AI Models Using\n Large-Scale, Unlabeled Natural Images","summary":" Pre-training datasets, like ImageNet, have become the gold standard in\nmedical image analysis. However, the emergence of self-supervised learning\n(SSL), which leverages unlabeled data to learn robust features, presents an\nopportunity to bypass the intensive labeling process. In this study, we\nexplored if SSL for pre-training on non-medical images can be applied to chest\nradiographs and how it compares to supervised pre-training on non-medical\nimages and on medical images. We utilized a vision transformer and initialized\nits weights based on (i) SSL pre-training on natural images (DINOv2), (ii) SL\npre-training on natural images (ImageNet dataset), and (iii) SL pre-training on\nchest radiographs from the MIMIC-CXR database. We tested our approach on over\n800,000 chest radiographs from six large global datasets, diagnosing more than\n20 different imaging findings. Our SSL pre-training on curated images not only\noutperformed ImageNet-based pre-training (P<0.001 for all datasets) but, in\ncertain cases, also exceeded SL on the MIMIC-CXR dataset. Our findings suggest\nthat selecting the right pre-training strategy, especially with SSL, can be\npivotal for improving artificial intelligence (AI)'s diagnostic accuracy in\nmedical imaging. 
By demonstrating the promise of SSL in chest radiograph\nanalysis, we underline a transformative shift towards more efficient and\naccurate AI models in medical imaging.\n","authors":["Soroosh Tayebi Arasteh","Leo Misera","Jakob Nikolas Kather","Daniel Truhn","Sven Nebelung"],"pdf_url":"https://arxiv.org/pdf/2308.07688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07687v1","updated":"2023-08-15T10:37:04Z","published":"2023-08-15T10:37:04Z","title":"DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using\n Pre-trained Diffusion Models","summary":" Given a classifier, the inherent property of semantic Out-of-Distribution\n(OOD) samples is that their contents differ from all legal classes in terms of\nsemantics, namely semantic mismatch. There is a recent work that directly\napplies it to OOD detection, which employs a conditional Generative Adversarial\nNetwork (cGAN) to enlarge semantic mismatch in the image space. While achieving\nremarkable OOD detection performance on small datasets, it is not applicable to\nImageNet-scale datasets due to the difficulty in training cGANs with both input\nimages and labels as conditions. As diffusion models are much easier to train\nand amenable to various conditions compared to cGANs, in this work, we propose\nto directly use pre-trained diffusion models for semantic mismatch-guided OOD\ndetection, named DiffGuard. Specifically, given an OOD input image and the\npredicted label from the classifier, we try to enlarge the semantic difference\nbetween the reconstructed OOD image under these conditions and the original\ninput image. We also present several test-time techniques to further strengthen\nsuch differences. Experimental results show that DiffGuard is effective on both\nCifar-10 and hard cases of the large-scale ImageNet, and it can be easily\ncombined with existing OOD detection techniques to achieve state-of-the-art OOD\ndetection results.\n","authors":["Ruiyuan Gao","Chenchen Zhao","Lanqing Hong","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.07687v1.pdf","comment":"Accepted by ICCV2023, with supplementary materials"},{"id":"http://arxiv.org/abs/2012.00188v4","updated":"2023-08-15T10:02:38Z","published":"2020-12-01T00:49:17Z","title":"Fair Densities via Boosting the Sufficient Statistics of Exponential\n Families","summary":" We introduce a boosting algorithm to pre-process data for fairness. Starting\nfrom an initial fair but inaccurate distribution, our approach shifts towards\nbetter data fitting while still ensuring a minimal fairness guarantee. To do\nso, it learns the sufficient statistics of an exponential family with\nboosting-compliant convergence. Importantly, we are able to theoretically prove\nthat the learned distribution will have a representation rate and statistical\nrate data fairness guarantee. Unlike recent optimization based pre-processing\nmethods, our approach can be easily adapted for continuous domain features.\nFurthermore, when the weak learners are specified to be decision trees, the\nsufficient statistics of the learned distribution can be examined to provide\nclues on sources of (un)fairness. 
Empirical results are present to display the\nquality of result on real-world data.\n","authors":["Alexander Soen","Hisham Husain","Richard Nock"],"pdf_url":"https://arxiv.org/pdf/2012.00188v4.pdf","comment":"Published in Proceedings of the 40th International Conference on\n Machine Learning (ICML2023)"},{"id":"http://arxiv.org/abs/2303.04418v2","updated":"2023-08-15T09:58:08Z","published":"2023-03-08T07:45:06Z","title":"FUSQA: Fetal Ultrasound Segmentation Quality Assessment","summary":" Deep learning models have been effective for various fetal ultrasound\nsegmentation tasks. However, generalization to new unseen data has raised\nquestions about their effectiveness for clinical adoption. Normally, a\ntransition to new unseen data requires time-consuming and costly quality\nassurance processes to validate the segmentation performance post-transition.\nSegmentation quality assessment efforts have focused on natural images, where\nthe problem has been typically formulated as a dice score regression task. In\nthis paper, we propose a simplified Fetal Ultrasound Segmentation Quality\nAssessment (FUSQA) model to tackle the segmentation quality assessment when no\nmasks exist to compare with. We formulate the segmentation quality assessment\nprocess as an automated classification task to distinguish between good and\npoor-quality segmentation masks for more accurate gestational age estimation.\nWe validate the performance of our proposed approach on two datasets we collect\nfrom two hospitals using different ultrasound machines. We compare different\narchitectures, with our best-performing architecture achieving over 90%\nclassification accuracy on distinguishing between good and poor-quality\nsegmentation masks from an unseen dataset. Additionally, there was only a\n1.45-day difference between the gestational age reported by doctors and\nestimated based on CRL measurements using well-segmented masks. On the other\nhand, this difference increased and reached up to 7.73 days when we calculated\nCRL from the poorly segmented masks. As a result, AI-based approaches can\npotentially aid fetal ultrasound segmentation quality assessment and might\ndetect poor segmentation in real-time screening in the future.\n","authors":["Sevim Cengiz","Ibrahim Almakky","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2303.04418v2.pdf","comment":"13 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2302.14061v2","updated":"2023-08-15T09:43:14Z","published":"2023-02-27T00:21:43Z","title":"Semantic-aware Node Synthesis for Imbalanced Heterogeneous Information\n Networks","summary":" Heterogeneous graph neural networks (HGNNs) have exhibited exceptional\nefficacy in modeling the complex heterogeneity in heterogeneous information\nnetworks (HINs). The critical advantage of HGNNs is their ability to handle\ndiverse node and edge types in HINs by extracting and utilizing the abundant\nsemantic information for effective representation learning. However, as a\nwidespread phenomenon in many real-world scenarios, the class-imbalance\ndistribution in HINs creates a performance bottleneck for existing HGNNs. Apart\nfrom the quantity imbalance of nodes, another more crucial and distinctive\nchallenge in HINs is semantic imbalance. Minority classes in HINs often lack\ndiverse and sufficient neighbor nodes, resulting in biased and incomplete\nsemantic information. This semantic imbalance further compounds the difficulty\nof accurately classifying minority nodes, leading to the performance\ndegradation of HGNNs. 
To tackle the imbalance of minority classes and\nsupplement their inadequate semantics, we present the first method for the\nsemantic imbalance problem in imbalanced HINs named Semantic-aware Node\nSynthesis (SNS). By assessing the influence on minority classes, SNS adaptively\nselects the heterogeneous neighbor nodes and augments the network with\nsynthetic nodes while preserving the minority semantics. In addition, we\nintroduce two regularization approaches for HGNNs that constrain the\nrepresentation of synthetic nodes from both semantic and class perspectives to\neffectively suppress the potential noises from synthetic nodes, facilitating\nmore expressive embeddings for classification. The comprehensive experimental\nstudy demonstrates that SNS consistently outperforms existing methods by a\nlarge margin in different benchmark datasets.\n","authors":["Xinyi Gao","Wentao Zhang","Tong Chen","Junliang Yu","Hung Quoc Viet Nguyen","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2302.14061v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07126v2","updated":"2023-08-15T09:39:00Z","published":"2023-08-14T13:13:50Z","title":"A Time-aware tensor decomposition for tracking evolving patterns","summary":" Time-evolving data sets can often be arranged as a higher-order tensor with\none of the modes being the time mode. While tensor factorizations have been\nsuccessfully used to capture the underlying patterns in such higher-order data\nsets, the temporal aspect is often ignored, allowing for the reordering of time\npoints. In recent studies, temporal regularizers are incorporated in the time\nmode to tackle this issue. Nevertheless, existing approaches still do not allow\nunderlying patterns to change in time (e.g., spatial changes in the brain,\ncontextual changes in topics). In this paper, we propose temporal PARAFAC2\n(tPARAFAC2): a PARAFAC2-based tensor factorization method with temporal\nregularization to extract gradually evolving patterns from temporal data.\nThrough extensive experiments on synthetic data, we demonstrate that tPARAFAC2\ncan capture the underlying evolving patterns accurately performing better than\nPARAFAC2 and coupled matrix factorization with temporal smoothness\nregularization.\n","authors":["Christos Chatzis","Max Pfeffer","Pedro Lind","Evrim Acar"],"pdf_url":"https://arxiv.org/pdf/2308.07126v2.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.05424v2","updated":"2023-08-15T09:37:05Z","published":"2023-05-09T13:15:52Z","title":"Echo from noise: synthetic ultrasound image generation using diffusion\n models for real image segmentation","summary":" We propose a novel pipeline for the generation of synthetic ultrasound images\nvia Denoising Diffusion Probabilistic Models (DDPMs) guided by cardiac semantic\nlabel maps. We show that these synthetic images can serve as a viable\nsubstitute for real data in the training of deep-learning models for ultrasound\nimage analysis tasks such as cardiac segmentation. To demonstrate the\neffectiveness of this approach, we generated synthetic 2D echocardiograms and\ntrained a neural network for segmenting the left ventricle and left atrium. The\nperformance of the network trained on exclusively synthetic images was\nevaluated on an unseen dataset of real images and yielded mean Dice scores of\n88.6 $\\pm 4.91$ , 91.9 $\\pm 4.22$, 85.2 $\\pm 4.83$ \\% for left ventricular\nendocardium, epicardium and left atrial segmentation respectively. 
This\nrepresents a relative increase of $9.2$, $3.3$ and $13.9$ \% in Dice scores\ncompared to the previous state-of-the-art. The proposed pipeline has potential\nfor application to a wide range of other tasks across various medical imaging\nmodalities.\n","authors":["David Stojanovski","Uxio Hermida","Pablo Lamata","Arian Beqiri","Alberto Gomez"],"pdf_url":"https://arxiv.org/pdf/2305.05424v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07662v1","updated":"2023-08-15T09:25:11Z","published":"2023-08-15T09:25:11Z","title":"Gradient-Based Post-Training Quantization: Challenging the Status Quo","summary":" Quantization has become a crucial step for the efficient deployment of deep\nneural networks, where floating point operations are converted to simpler fixed\npoint operations. In its most naive form, it simply consists in a combination\nof scaling and rounding transformations, leading to either a limited\ncompression rate or a significant accuracy drop. Recently, Gradient-based\npost-training quantization (GPTQ) methods appear to constitute a suitable\ntrade-off between such simple methods and more powerful, yet expensive\nQuantization-Aware Training (QAT) approaches, particularly when attempting to\nquantize LLMs, where scalability of the quantization process is of paramount\nimportance. GPTQ essentially consists in learning the rounding operation using\na small calibration set. In this work, we challenge common choices in GPTQ\nmethods. In particular, we show that the process is, to a certain extent,\nrobust to a number of variables (weight selection, feature augmentation, choice\nof calibration set). More importantly, we derive a number of best practices for\ndesigning more efficient and scalable GPTQ methods, regarding the problem\nformulation (loss, degrees of freedom, use of non-uniform quantization schemes)\nor optimization process (choice of variable and optimizer). Lastly, we propose\na novel importance-based mixed-precision technique. Those guidelines lead to\nsignificant performance improvements on all the tested state-of-the-art GPTQ\nmethods and networks (e.g. +6.819 points on ViT for 4-bit quantization), paving\nthe way for the design of scalable, yet effective quantization methods.\n","authors":["Edouard Yvinec","Arnaud Dapogny","Kevin Bailly"],"pdf_url":"https://arxiv.org/pdf/2308.07662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07661v1","updated":"2023-08-15T09:24:38Z","published":"2023-08-15T09:24:38Z","title":"Attention Is Not All You Need Anymore","summary":" In recent years, the popular Transformer architecture has achieved great\nsuccess in many application areas, including natural language processing and\ncomputer vision. Many existing works aim to reduce the computational and memory\ncomplexity of the self-attention mechanism in the Transformer by trading off\nperformance. However, performance is key for the continuing success of the\nTransformer. In this paper, a drop-in replacement for the self-attention\nmechanism in the Transformer, called the Extractor, is proposed. Experimental\nresults show that replacing the self-attention mechanism with the Extractor\nimproves the performance of the Transformer. Furthermore, the proposed\nExtractor has the potential to run faster than the self-attention since it has\na much shorter critical path of computation. 
Additionally, the sequence\nprediction problem in the context of text generation is formulated using\nvariable-length discrete-time Markov chains, and the Transformer is reviewed\nbased on our understanding.\n","authors":["Zhe Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.10066v2","updated":"2023-08-15T09:12:03Z","published":"2022-02-21T09:26:34Z","title":"Multi-task Representation Learning with Stochastic Linear Bandits","summary":" We study the problem of transfer-learning in the setting of stochastic linear\nbandit tasks. We consider that a low dimensional linear representation is\nshared across the tasks, and study the benefit of learning this representation\nin the multi-task learning setting. Following recent results to design\nstochastic bandit policies, we propose an efficient greedy policy based on\ntrace norm regularization. It implicitly learns a low dimensional\nrepresentation by encouraging the matrix formed by the task regression vectors\nto be of low rank. Unlike previous work in the literature, our policy does not\nneed to know the rank of the underlying matrix. We derive an upper bound on the\nmulti-task regret of our policy, which is, up to logarithmic factors, of order\n$\\sqrt{NdT(T+d)r}$, where $T$ is the number of tasks, $r$ the rank, $d$ the\nnumber of variables and $N$ the number of rounds per task. We show the benefit\nof our strategy compared to the baseline $Td\\sqrt{N}$ obtained by solving each\ntask independently. We also provide a lower bound to the multi-task regret.\nFinally, we corroborate our theoretical findings with preliminary experiments\non synthetic data.\n","authors":["Leonardo Cella","Karim Lounici","Grégoire Pacreau","Massimiliano Pontil"],"pdf_url":"https://arxiv.org/pdf/2202.10066v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07655v1","updated":"2023-08-15T09:10:49Z","published":"2023-08-15T09:10:49Z","title":"From Commit Message Generation to History-Aware Commit Message\n Completion","summary":" Commit messages are crucial to software development, allowing developers to\ntrack changes and collaborate effectively. Despite their utility, most commit\nmessages lack important information since writing high-quality commit messages\nis tedious and time-consuming. The active research on commit message generation\n(CMG) has not yet led to wide adoption in practice. We argue that if we could\nshift the focus from commit message generation to commit message completion and\nuse previous commit history as additional context, we could significantly\nimprove the quality and the personal nature of the resulting commit messages.\n In this paper, we propose and evaluate both of these novel ideas. Since the\nexisting datasets lack historical data, we collect and share a novel dataset\ncalled CommitChronicle, containing 10.7M commits across 20 programming\nlanguages. We use this dataset to evaluate the completion setting and the\nusefulness of the historical context for state-of-the-art CMG models and\nGPT-3.5-turbo. Our results show that in some contexts, commit message\ncompletion shows better results than generation, and that while in general\nGPT-3.5-turbo performs worse, it shows potential for long and detailed\nmessages. 
As for the history, the results show that historical information\nimproves the performance of CMG models in the generation task, and the\nperformance of GPT-3.5-turbo in both generation and completion.\n","authors":["Aleksandra Eliseeva","Yaroslav Sokolov","Egor Bogomolov","Yaroslav Golubev","Danny Dig","Timofey Bryksin"],"pdf_url":"https://arxiv.org/pdf/2308.07655v1.pdf","comment":"Accepted to ASE'23. 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.12398v4","updated":"2023-08-15T09:05:54Z","published":"2023-03-22T09:06:07Z","title":"Multiscale Attention via Wavelet Neural Operators for Vision\n Transformers","summary":" Transformers have achieved widespread success in computer vision. At their\nheart, there is a Self-Attention (SA) mechanism, an inductive bias that\nassociates each token in the input with every other token through a weighted\nbasis. The standard SA mechanism has quadratic complexity with the sequence\nlength, which impedes its utility to long sequences appearing in high\nresolution vision. Recently, inspired by operator learning for PDEs, Adaptive\nFourier Neural Operators (AFNO) were introduced for high resolution attention\nbased on global convolution that is efficiently implemented via FFT. However,\nthe AFNO global filtering cannot well represent small and moderate scale\nstructures that commonly appear in natural images. To leverage the\ncoarse-to-fine scale structures we introduce a Multiscale Wavelet Attention\n(MWA) by leveraging wavelet neural operators which incurs linear complexity in\nthe sequence size. We replace the attention in ViT with MWA and our experiments\nwith CIFAR and Tiny-ImageNet classification demonstrate significant improvement\nover alternative Fourier-based attentions such as AFNO and Global Filter\nNetwork (GFN).\n","authors":["Anahita Nekoozadeh","Mohammad Reza Ahmadzadeh","Zahra Mardani"],"pdf_url":"https://arxiv.org/pdf/2303.12398v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07699v2","updated":"2023-08-15T09:03:39Z","published":"2023-06-13T11:34:36Z","title":"Time-aware Graph Structure Learning via Sequence Prediction on Temporal\n Graphs","summary":" Temporal Graph Learning, which aims to model the time-evolving nature of\ngraphs, has gained increasing attention and achieved remarkable performance\nrecently. However, in reality, graph structures are often incomplete and noisy,\nwhich hinders temporal graph networks (TGNs) from learning informative\nrepresentations. Graph contrastive learning uses data augmentation to generate\nplausible variations of existing data and learn robust representations.\nHowever, rule-based augmentation approaches may be suboptimal as they lack\nlearnability and fail to leverage rich information from downstream tasks. To\naddress these issues, we propose a Time-aware Graph Structure Learning (TGSL)\napproach via sequence prediction on temporal graphs, which learns better graph\nstructures for downstream tasks through adding potential temporal edges. In\nparticular, it predicts time-aware context embedding based on previously\nobserved interactions and uses the Gumble-Top-K to select the closest candidate\nedges to this context embedding. Additionally, several candidate sampling\nstrategies are proposed to ensure both efficiency and diversity. Furthermore,\nwe jointly learn the graph structure and TGNs in an end-to-end manner and\nperform inference on the refined graph. 
Extensive experiments on temporal link\nprediction benchmarks demonstrate that TGSL yields significant gains for the\npopular TGNs such as TGAT and GraphMixer, and it outperforms other contrastive\nlearning methods on temporal graphs. We release the code at\nhttps://github.com/ViktorAxelsen/TGSL.\n","authors":["Haozhen Zhang","Xueting Han","Xi Xiao","Jing Bai"],"pdf_url":"https://arxiv.org/pdf/2306.07699v2.pdf","comment":"Accepted by CIKM 2023. The code is available at\n https://github.com/ViktorAxelsen/TGSL"},{"id":"http://arxiv.org/abs/2111.13180v4","updated":"2023-08-15T08:57:59Z","published":"2021-11-25T17:22:22Z","title":"Variational Gibbs Inference for Statistical Model Estimation from\n Incomplete Data","summary":" Statistical models are central to machine learning with broad applicability\nacross a range of downstream tasks. The models are controlled by free\nparameters that are typically estimated from data by maximum-likelihood\nestimation or approximations thereof. However, when faced with real-world data\nsets many of the models run into a critical issue: they are formulated in terms\nof fully-observed data, whereas in practice the data sets are plagued with\nmissing data. The theory of statistical model estimation from incomplete data\nis conceptually similar to the estimation of latent-variable models, where\npowerful tools such as variational inference (VI) exist. However, in contrast\nto standard latent-variable models, parameter estimation with incomplete data\noften requires estimating exponentially-many conditional distributions of the\nmissing variables, hence making standard VI methods intractable. We address\nthis gap by introducing variational Gibbs inference (VGI), a new\ngeneral-purpose method to estimate the parameters of statistical models from\nincomplete data. We validate VGI on a set of synthetic and real-world\nestimation tasks, estimating important machine learning models such as\nvariational autoencoders and normalising flows from incomplete data. The\nproposed method, whilst general-purpose, achieves competitive or better\nperformance than existing model-specific estimation methods.\n","authors":["Vaidotas Simkus","Benjamin Rhodes","Michael U. Gutmann"],"pdf_url":"https://arxiv.org/pdf/2111.13180v4.pdf","comment":"Published at Journal of Machine Learning Research (JMLR)"},{"id":"http://arxiv.org/abs/2209.14770v2","updated":"2023-08-15T08:49:10Z","published":"2022-09-29T13:28:34Z","title":"R2C-GAN: Restore-to-Classify GANs for Blind X-Ray Restoration and\n COVID-19 Classification","summary":" Restoration of poor quality images with a blended set of artifacts plays a\nvital role for a reliable diagnosis. Existing studies have focused on specific\nrestoration problems such as image deblurring, denoising, and exposure\ncorrection where there is usually a strong assumption on the artifact type and\nseverity. As a pioneer study in blind X-ray restoration, we propose a joint\nmodel for generic image restoration and classification: Restore-to-Classify\nGenerative Adversarial Networks (R2C-GANs). Such a jointly optimized model\nkeeps any disease intact after the restoration. Therefore, this will naturally\nlead to a higher diagnosis performance thanks to the improved X-ray image\nquality. To accomplish this crucial objective, we define the restoration task\nas an Image-to-Image translation problem from poor quality having noisy,\nblurry, or over/under-exposed images to high quality image domain. 
The proposed\nR2C-GAN model is able to learn forward and inverse transforms between the two\ndomains using unpaired training samples. Simultaneously, the joint\nclassification preserves the disease label during restoration. Moreover, the\nR2C-GANs are equipped with operational layers/neurons reducing the network\ndepth and further boosting both restoration and classification performances.\nThe proposed joint model is extensively evaluated over the QaTa-COV19 dataset\nfor Coronavirus Disease 2019 (COVID-19) classification. The proposed\nrestoration approach achieves over 90% F1-Score which is significantly higher\nthan the performance of any deep model. Moreover, in the qualitative analysis,\nthe restoration performance of R2C-GANs is approved by a group of medical\ndoctors. We share the software implementation at\nhttps://github.com/meteahishali/R2C-GAN.\n","authors":["Mete Ahishali","Aysen Degerli","Serkan Kiranyaz","Tahir Hamid","Rashid Mazhar","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2209.14770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07641v1","updated":"2023-08-15T08:46:17Z","published":"2023-08-15T08:46:17Z","title":"Ternary Singular Value Decomposition as a Better Parameterized Form in\n Linear Mapping","summary":" We present a simple yet novel parameterized form of linear mapping to\nachieves remarkable network compression performance: a pseudo SVD called\nTernary SVD (TSVD).\n Unlike vanilla SVD, TSVD limits the $U$ and $V$ matrices in SVD to ternary\nmatrices form in $\\{\\pm 1, 0\\}$. This means that instead of using the expensive\nmultiplication instructions, TSVD only requires addition instructions when\ncomputing $U(\\cdot)$ and $V(\\cdot)$.\n We provide direct and training transition algorithms for TSVD like Post\nTraining Quantization and Quantization Aware Training respectively.\nAdditionally, we analyze the convergence of the direct transition algorithms in\ntheory.\n In experiments, we demonstrate that TSVD can achieve state-of-the-art network\ncompression performance in various types of networks and tasks, including\ncurrent baseline models such as ConvNext, Swim, BERT, and large language model\nlike OPT.\n","authors":["Boyu Chen","Hanxuan Chen","Jiao He","Fengyu Sun","Shangling Jui"],"pdf_url":"https://arxiv.org/pdf/2308.07641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10698v3","updated":"2023-08-15T08:32:35Z","published":"2023-06-19T04:48:36Z","title":"Deep Reinforcement Learning with Multitask Episodic Memory Based on\n Task-Conditioned Hypernetwork","summary":" Deep reinforcement learning algorithms are usually impeded by sampling\ninefficiency, heavily depending on multiple interactions with the environment\nto acquire accurate decision-making capabilities. In contrast, humans rely on\ntheir hippocampus to retrieve relevant information from past experiences of\nrelevant tasks, which guides their decision-making when learning a new task,\nrather than exclusively depending on environmental interactions. Nevertheless,\ndesigning a hippocampus-like module for an agent to incorporate past\nexperiences into established reinforcement learning algorithms presents two\nchallenges. The first challenge involves selecting the most relevant past\nexperiences for the current task, and the second challenge is integrating such\nexperiences into the decision network. 
To address these challenges, we propose\na novel method that utilizes a retrieval network based on task-conditioned\nhypernetwork, which adapts the retrieval network's parameters depending on the\ntask. At the same time, a dynamic modification mechanism enhances the\ncollaborative efforts between the retrieval and decision networks. We evaluate\nthe proposed method on the MiniGrid environment.The experimental results\ndemonstrate that our proposed method significantly outperforms strong\nbaselines.\n","authors":["Yonggang Jin","Chenxu Wang","Liuyu Xiang","Yaodong Yang","Junge Zhang","Jie Fu","Zhaofeng He"],"pdf_url":"https://arxiv.org/pdf/2306.10698v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15199v2","updated":"2023-08-15T08:30:45Z","published":"2023-07-27T21:14:46Z","title":"PromptStyler: Prompt-driven Style Generation for Source-free Domain\n Generalization","summary":" In a joint vision-language space, a text feature (e.g., from \"a photo of a\ndog\") could effectively represent its relevant image features (e.g., from dog\nphotos). Also, a recent study has demonstrated the cross-modal transferability\nphenomenon of this joint space. From these observations, we propose\nPromptStyler which simulates various distribution shifts in the joint space by\nsynthesizing diverse styles via prompts without using any images to deal with\nsource-free domain generalization. The proposed method learns to generate a\nvariety of style features (from \"a S* style of a\") via learnable style word\nvectors for pseudo-words S*. To ensure that learned styles do not distort\ncontent information, we force style-content features (from \"a S* style of a\n[class]\") to be located nearby their corresponding content features (from\n\"[class]\") in the joint vision-language space. After learning style word\nvectors, we train a linear classifier using synthesized style-content features.\nPromptStyler achieves the state of the art on PACS, VLCS, OfficeHome and\nDomainNet, even though it does not require any images for training.\n","authors":["Junhyeong Cho","Gilhyun Nam","Sungyeon Kim","Hunmin Yang","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2307.15199v2.pdf","comment":"Accepted to ICCV 2023, Project Page: https://promptstyler.github.io/"},{"id":"http://arxiv.org/abs/2308.07625v1","updated":"2023-08-15T08:21:20Z","published":"2023-08-15T08:21:20Z","title":"Backpropagation Path Search On Adversarial Transferability","summary":" Deep neural networks are vulnerable to adversarial examples, dictating the\nimperativeness to test the model's robustness before deployment. Transfer-based\nattackers craft adversarial examples against surrogate models and transfer them\nto victim models deployed in the black-box situation. To enhance the\nadversarial transferability, structure-based attackers adjust the\nbackpropagation path to avoid the attack from overfitting the surrogate model.\nHowever, existing structure-based attackers fail to explore the convolution\nmodule in CNNs and modify the backpropagation graph heuristically, leading to\nlimited effectiveness. In this paper, we propose backPropagation pAth Search\n(PAS), solving the aforementioned two problems. We first propose SkipConv to\nadjust the backpropagation path of convolution by structural\nreparameterization. To overcome the drawback of heuristically designed\nbackpropagation paths, we further construct a DAG-based search space, utilize\none-step approximation for path evaluation and employ Bayesian Optimization to\nsearch for the optimal path. 
We conduct comprehensive experiments in a wide\nrange of transfer settings, showing that PAS improves the attack success rate\nby a huge margin for both normally trained and defense models.\n","authors":["Zhuoer Xu","Zhangxuan Gu","Jianping Zhang","Shiwen Cui","Changhua Meng","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07625v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.03291v2","updated":"2023-08-15T08:20:30Z","published":"2023-08-07T04:20:38Z","title":"SynJax: Structured Probability Distributions for JAX","summary":" The development of deep learning software libraries enabled significant\nprogress in the field by allowing users to focus on modeling, while letting the\nlibrary to take care of the tedious and time-consuming task of optimizing\nexecution for modern hardware accelerators. However, this has benefited only\nparticular types of deep learning models, such as Transformers, whose\nprimitives map easily to the vectorized computation. The models that explicitly\naccount for structured objects, such as trees and segmentations, did not\nbenefit equally because they require custom algorithms that are difficult to\nimplement in a vectorized form.\n SynJax directly addresses this problem by providing an efficient vectorized\nimplementation of inference algorithms for structured distributions covering\nalignment, tagging, segmentation, constituency trees and spanning trees. With\nSynJax we can build large-scale differentiable models that explicitly model\nstructure in the data. The code is available at\nhttps://github.com/deepmind/synjax.\n","authors":["Miloš Stanojević","Laurent Sartran"],"pdf_url":"https://arxiv.org/pdf/2308.03291v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12449v2","updated":"2023-08-15T08:11:16Z","published":"2023-02-24T04:31:18Z","title":"SGL-PT: A Strong Graph Learner with Graph Prompt Tuning","summary":" Recently, much exertion has been paid to design graph self-supervised methods\nto obtain generalized pre-trained models, and adapt pre-trained models onto\ndownstream tasks through fine-tuning. However, there exists an inherent gap\nbetween pretext and downstream graph tasks, which insufficiently exerts the\nability of pre-trained models and even leads to negative transfer. Meanwhile,\nprompt tuning has seen emerging success in natural language processing by\naligning pre-training and fine-tuning with consistent training objectives. In\nthis paper, we identify the challenges for graph prompt tuning: The first is\nthe lack of a strong and universal pre-training task across sundry pre-training\nmethods in graph domain. The second challenge lies in the difficulty of\ndesigning a consistent training objective for both pre-training and downstream\ntasks. To overcome above obstacles, we propose a novel framework named SGL-PT\nwhich follows the learning strategy ``Pre-train, Prompt, and Predict''.\nSpecifically, we raise a strong and universal pre-training task coined as SGL\nthat acquires the complementary merits of generative and contrastive\nself-supervised graph learning. And aiming for graph classification task, we\nunify pre-training and fine-tuning by designing a novel verbalizer-free\nprompting function, which reformulates the downstream task in a similar format\nas pretext task. 
Empirical results show that our method surpasses other\nbaselines under unsupervised setting, and our prompt tuning method can greatly\nfacilitate models on biological datasets over fine-tuning methods.\n","authors":["Yun Zhu","Jianhao Guo","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2302.12449v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07616v1","updated":"2023-08-15T07:53:18Z","published":"2023-08-15T07:53:18Z","title":"A Multilayer Perceptron-based Fast Sunlight Assessment for the\n Conceptual Design of Residential Neighborhoods under Chinese Policy","summary":" In Chinese building codes, it is required that residential buildings receive\na minimum number of hours of natural, direct sunlight on a specified winter\nday, which represents the worst sunlight condition in a year. This requirement\nis a prerequisite for obtaining a building permit during the conceptual design\nof a residential project. Thus, officially sanctioned software is usually used\nto assess the sunlight performance of buildings. These software programs\npredict sunlight hours based on repeated shading calculations, which is\ntime-consuming. This paper proposed a multilayer perceptron-based method, a\none-stage prediction approach, which outputs a shading time interval caused by\nthe inputted cuboid-form building. The sunlight hours of a site can be obtained\nby calculating the union of the sunlight time intervals (complement of shading\ntime interval) of all the buildings. Three numerical experiments, i.e.,\nhorizontal level and slope analysis, and simulation-based optimization are\ncarried out; the results show that the method reduces the computation time to\n1/84~1/50 with 96.5%~98% accuracies. A residential neighborhood layout planning\nplug-in for Rhino 7/Grasshopper is also developed based on the proposed model.\nThis paper indicates that deep learning techniques can be adopted to accelerate\nsunlight hour simulations at the conceptual design phase.\n","authors":["Can Jiang","Xiong Liang","Yu-Cheng Zhou","Yong Tian","Shengli Xu","Jia-Rui Lin","Zhiliang Ma","Shiji Yang","Hao Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.07616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12872v4","updated":"2023-08-15T07:48:36Z","published":"2022-11-23T11:26:24Z","title":"μSplit: efficient image decomposition for microscopy data","summary":" We present {\\mu}Split, a dedicated approach for trained image decomposition\nin the context of fluorescence microscopy images. We find that best results\nusing regular deep architectures are achieved when large image patches are used\nduring training, making memory consumption the limiting factor to further\nimproving performance. We therefore introduce lateral contextualization (LC), a\nmemory efficient way to train powerful networks and show that LC leads to\nconsistent and significant improvements on the task at hand. We integrate LC\nwith U-Nets, Hierarchical AEs, and Hierarchical VAEs, for which we formulate a\nmodified ELBO loss. Additionally, LC enables training deeper hierarchical\nmodels than otherwise possible and, interestingly, helps to reduce tiling\nartefacts that are inherently impossible to avoid when using tiled VAE\npredictions. We apply {\\mu}Split to five decomposition tasks, one on a\nsynthetic dataset, four others derived from real microscopy data. 
LC achieves\nSOTA results (average improvements to the best baseline of 2.36 dB PSNR), while\nsimultaneously requiring considerably less GPU memory.\n","authors":[" Ashesh","Alexander Krull","Moises Di Sante","Francesco Silvio Pasqualini","Florian Jug"],"pdf_url":"https://arxiv.org/pdf/2211.12872v4.pdf","comment":"Published at ICCV 2023. 10 pages, 7 figures, 9 pages supplement, 8\n supplementary figures"},{"id":"http://arxiv.org/abs/2206.08242v2","updated":"2023-08-15T07:43:44Z","published":"2022-06-16T15:22:39Z","title":"Catastrophic overfitting can be induced with discriminative non-robust\n features","summary":" Adversarial training (AT) is the de facto method for building robust neural\nnetworks, but it can be computationally expensive. To mitigate this, fast\nsingle-step attacks can be used, but this may lead to catastrophic overfitting\n(CO). This phenomenon appears when networks gain non-trivial robustness during\nthe first stages of AT, but then reach a breaking point where they become\nvulnerable in just a few iterations. The mechanisms that lead to this failure\nmode are still poorly understood. In this work, we study the onset of CO in\nsingle-step AT methods through controlled modifications of typical datasets of\nnatural images. In particular, we show that CO can be induced at much smaller\n$\\epsilon$ values than it was observed before just by injecting images with\nseemingly innocuous features. These features aid non-robust classification but\nare not enough to achieve robustness on their own. Through extensive\nexperiments we analyze this novel phenomenon and discover that the presence of\nthese easy features induces a learning shortcut that leads to CO. Our findings\nprovide new insights into the mechanisms of CO and improve our understanding of\nthe dynamics of AT. The code to reproduce our experiments can be found at\nhttps://github.com/gortizji/co_features.\n","authors":["Guillermo Ortiz-Jiménez","Pau de Jorge","Amartya Sanyal","Adel Bibi","Puneet K. Dokania","Pascal Frossard","Gregory Rogéz","Philip H. S. Torr"],"pdf_url":"https://arxiv.org/pdf/2206.08242v2.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2308.07074v2","updated":"2023-08-15T07:37:32Z","published":"2023-08-14T11:16:28Z","title":"#InsTag: Instruction Tagging for Analyzing Supervised Fine-tuning of\n Large Language Models","summary":" Foundation language models obtain the instruction-following ability through\nsupervised fine-tuning (SFT). Diversity and complexity are considered critical\nfactors of a successful SFT dataset, while their definitions remain obscure and\nlack quantitative analyses. In this work, we propose InsTag, an open-set\nfine-grained tagger, to tag samples within SFT datasets based on semantics and\nintentions and define instruction diversity and complexity regarding tags. We\nobtain 6.6K tags to describe comprehensive user queries. Then we analyze\npopular open-sourced SFT datasets and find that the model ability grows with\nmore diverse and complex data. Based on this observation, we propose a data\nselector based on InsTag to select 6K diverse and complex samples from\nopen-source datasets and fine-tune models on InsTag-selected data. The\nresulting models, TagLM, outperform open-source models based on considerably\nlarger SFT data evaluated by MT-Bench, echoing the importance of query\ndiversity and complexity. 
We open-source InsTag in\nhttps://github.com/OFA-Sys/InsTag.\n","authors":["Keming Lu","Hongyi Yuan","Zheng Yuan","Runji Lin","Junyang Lin","Chuanqi Tan","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.07074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.04513v3","updated":"2023-08-15T07:22:13Z","published":"2021-11-08T13:56:22Z","title":"Clustering and Structural Robustness in Causal Diagrams","summary":" Graphs are commonly used to represent and visualize causal relations. For a\nsmall number of variables, this approach provides a succinct and clear view of\nthe scenario at hand. As the number of variables under study increases, the\ngraphical approach may become impractical, and the clarity of the\nrepresentation is lost. Clustering of variables is a natural way to reduce the\nsize of the causal diagram, but it may erroneously change the essential\nproperties of the causal relations if implemented arbitrarily. We define a\nspecific type of cluster, called transit cluster, that is guaranteed to\npreserve the identifiability properties of causal effects under certain\nconditions. We provide a sound and complete algorithm for finding all transit\nclusters in a given graph and demonstrate how clustering can simplify the\nidentification of causal effects. We also study the inverse problem, where one\nstarts with a clustered graph and looks for extended graphs where the\nidentifiability properties of causal effects remain unchanged. We show that\nthis kind of structural robustness is closely related to transit clusters.\n","authors":["Santtu Tikka","Jouni Helske","Juha Karvanen"],"pdf_url":"https://arxiv.org/pdf/2111.04513v3.pdf","comment":"This is the version published in JMLR"},{"id":"http://arxiv.org/abs/2308.07604v1","updated":"2023-08-15T07:19:54Z","published":"2023-08-15T07:19:54Z","title":"Searching for Novel Chemistry in Exoplanetary Atmospheres using Machine\n Learning for Anomaly Detection","summary":" The next generation of telescopes will yield a substantial increase in the\navailability of high-resolution spectroscopic data for thousands of exoplanets.\nThe sheer volume of data and number of planets to be analyzed greatly motivate\nthe development of new, fast and efficient methods for flagging interesting\nplanets for reobservation and detailed analysis. We advocate the application of\nmachine learning (ML) techniques for anomaly (novelty) detection to exoplanet\ntransit spectra, with the goal of identifying planets with unusual chemical\ncomposition and even searching for unknown biosignatures. We successfully\ndemonstrate the feasibility of two popular anomaly detection methods (Local\nOutlier Factor and One Class Support Vector Machine) on a large public database\nof synthetic spectra. We consider several test cases, each with different\nlevels of instrumental noise. In each case, we use ROC curves to quantify and\ncompare the performance of the two ML techniques.\n","authors":["Roy T. Forestano","Konstantin T. Matchev","Katia Matcheva","Eyup B. Unlu"],"pdf_url":"https://arxiv.org/pdf/2308.07604v1.pdf","comment":"Submitted to AAS Journals, 30 pages, 14 figures"},{"id":"http://arxiv.org/abs/2304.03646v2","updated":"2023-08-15T07:09:10Z","published":"2023-04-07T13:50:57Z","title":"Fairness through Aleatoric Uncertainty","summary":" We propose a simple yet effective solution to tackle the often-competing\ngoals of fairness and utility in classification tasks. 
While fairness ensures\nthat the model's predictions are unbiased and do not discriminate against any\nparticular group or individual, utility focuses on maximizing the model's\npredictive performance. This work introduces the idea of leveraging aleatoric\nuncertainty (e.g., data ambiguity) to improve the fairness-utility trade-off.\nOur central hypothesis is that aleatoric uncertainty is a key factor for\nalgorithmic fairness and samples with low aleatoric uncertainty are modeled\nmore accurately and fairly than those with high aleatoric uncertainty. We then\npropose a principled model to improve fairness when aleatoric uncertainty is\nhigh and improve utility elsewhere. Our approach first intervenes in the data\ndistribution to better decouple aleatoric uncertainty and epistemic\nuncertainty. It then introduces a fairness-utility bi-objective loss defined\nbased on the estimated aleatoric uncertainty. Our approach is theoretically\nguaranteed to improve the fairness-utility trade-off. Experimental results on\nboth tabular and image datasets show that the proposed approach outperforms\nstate-of-the-art methods w.r.t. the fairness-utility trade-off and w.r.t. both\ngroup and individual fairness metrics. This work presents a fresh perspective\non the trade-off between utility and algorithmic fairness and opens a key\navenue for the potential of using prediction uncertainty in fair machine\nlearning.\n","authors":["Anique Tahir","Lu Cheng","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2304.03646v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07598v1","updated":"2023-08-15T06:58:19Z","published":"2023-08-15T06:58:19Z","title":"Generating Personas for Games with Multimodal Adversarial Imitation\n Learning","summary":" Reinforcement learning has been widely successful in producing agents capable\nof playing games at a human level. However, this requires complex reward\nengineering, and the agent's resulting policy is often unpredictable. Going\nbeyond reinforcement learning is necessary to model a wide range of human\nplaystyles, which can be difficult to represent with a reward function. This\npaper presents a novel imitation learning approach to generate multiple persona\npolicies for playtesting. Multimodal Generative Adversarial Imitation Learning\n(MultiGAIL) uses an auxiliary input parameter to learn distinct personas using\na single-agent model. MultiGAIL is based on generative adversarial imitation\nlearning and uses multiple discriminators as reward models, inferring the\nenvironment reward by comparing the agent and distinct expert policies. 
The\nreward from each discriminator is weighted according to the auxiliary input.\nOur experimental analysis demonstrates the effectiveness of our technique in\ntwo environments with continuous and discrete action spaces.\n","authors":["William Ahlberg","Alessandro Sestini","Konrad Tollmar","Linus Gisslén"],"pdf_url":"https://arxiv.org/pdf/2308.07598v1.pdf","comment":"Published in CoG 2023"},{"id":"http://arxiv.org/abs/2308.07588v1","updated":"2023-08-15T06:19:31Z","published":"2023-08-15T06:19:31Z","title":"High-Probability Risk Bounds via Sequential Predictors","summary":" Online learning methods yield sequential regret bounds under minimal\nassumptions and provide in-expectation risk bounds for statistical learning.\nHowever, despite the apparent advantage of online guarantees over their\nstatistical counterparts, recent findings indicate that in many important\ncases, regret bounds may not guarantee tight high-probability risk bounds in\nthe statistical setting. In this work we show that online to batch conversions\napplied to general online learning algorithms can bypass this limitation. Via a\ngeneral second-order correction to the loss function defining the regret, we\nobtain nearly optimal high-probability risk bounds for several classical\nstatistical estimation problems, such as discrete distribution estimation,\nlinear regression, logistic regression, and conditional density estimation. Our\nanalysis relies on the fact that many online learning algorithms are improper,\nas they are not restricted to use predictors from a given reference class. The\nimproper nature of our estimators enables significant improvements in the\ndependencies on various problem parameters. Finally, we discuss some\ncomputational advantages of our sequential algorithms over their existing batch\ncounterparts.\n","authors":["Dirk van der Hoeven","Nikita Zhivotovskiy","Nicolò Cesa-Bianchi"],"pdf_url":"https://arxiv.org/pdf/2308.07588v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2308.07221v2","updated":"2023-08-15T06:00:03Z","published":"2023-08-14T15:47:25Z","title":"AudioFormer: Audio Transformer learns audio feature representations from\n discrete acoustic codes","summary":" We propose a method named AudioFormer,which learns audio feature\nrepresentations through the acquisition of discrete acoustic codes and\nsubsequently fine-tunes them for audio classification tasks. Initially,we\nintroduce a novel perspective by considering the audio classification task as a\nform of natural language understanding (NLU). Leveraging an existing neural\naudio codec model,we generate discrete acoustic codes and utilize them to train\na masked language model (MLM),thereby obtaining audio feature representations.\nFurthermore,we pioneer the integration of a Multi-Positive sample Contrastive\n(MPC) learning approach. This method enables the learning of joint\nrepresentations among multiple discrete acoustic codes within the same audio\ninput. In our experiments,we treat discrete acoustic codes as textual data and\ntrain a masked language model using a cloze-like methodology,ultimately\nderiving high-quality audio representations. Notably,the MPC learning technique\neffectively captures collaborative representations among distinct positive\nsamples. 
Our research outcomes demonstrate that AudioFormer attains\nsignificantly improved performance compared to prevailing monomodal audio\nclassification models across multiple datasets,and even outperforms\naudio-visual multimodal classification models on select datasets.\nSpecifically,our approach achieves remarkable results on datasets including\nAudioSet (2M,20K),and FSD50K,with performance scores of 53.9,45.1,and\n65.6,respectively. We have openly shared both the code and models:\nhttps://github.com/LZH-0225/AudioFormer.git.\n","authors":["Zhaohui Li","Haitao Wang","Xinghua Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.07221v2.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2302.06433v2","updated":"2023-08-15T05:09:29Z","published":"2023-02-13T15:12:15Z","title":"Label-efficient Time Series Representation Learning: A Review","summary":" The scarcity of labeled data is one of the main challenges of applying deep\nlearning models on time series data in the real world. Therefore, several\napproaches, e.g., transfer learning, self-supervised learning, and\nsemi-supervised learning, have been recently developed to promote the learning\ncapability of deep learning models from the limited time series labels. In this\nsurvey, for the first time, we provide a novel taxonomy to categorize existing\napproaches that address the scarcity of labeled data problem in time series\ndata based on their dependency on external data sources. Moreover, we present a\nreview of the recent advances in each approach and conclude the limitations of\nthe current works and provide future directions that could yield better\nprogress in the field.\n","authors":["Emadeldeen Eldele","Mohamed Ragab","Zhenghua Chen","Min Wu","Chee-Keong Kwoh","Xiaoli Li"],"pdf_url":"https://arxiv.org/pdf/2302.06433v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2308.07575v1","updated":"2023-08-15T05:08:12Z","published":"2023-08-15T05:08:12Z","title":"Story Visualization by Online Text Augmentation with Context Memory","summary":" Story visualization (SV) is a challenging text-to-image generation task for\nthe difficulty of not only rendering visual details from the text descriptions\nbut also encoding a long-term context across multiple sentences. While prior\nefforts mostly focus on generating a semantically relevant image for each\nsentence, encoding a context spread across the given paragraph to generate\ncontextually convincing images (e.g., with a correct character or with a proper\nbackground of the scene) remains a challenge. To this end, we propose a novel\nmemory architecture for the Bi-directional Transformers with an online text\naugmentation that generates multiple pseudo-descriptions as supplementary\nsupervision during training, for better generalization to the language\nvariation at inference. 
In extensive experiments on the two popular SV\nbenchmarks, i.e., the Pororo-SV and Flintstones-SV, the proposed method\nsignificantly outperforms the state of the arts in various evaluation metrics\nincluding FID, character F1, frame accuracy, BLEU-2/3, and R-precision with\nsimilar or less computational complexity.\n","authors":["Daechul Ahn","Daneul Kim","Gwangmo Song","Seung Hwan Kim","Honglak Lee","Dongyeop Kang","Jonghyun Choi"],"pdf_url":"https://arxiv.org/pdf/2308.07575v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07571v1","updated":"2023-08-15T04:49:11Z","published":"2023-08-15T04:49:11Z","title":"Ske2Grid: Skeleton-to-Grid Representation Learning for Action\n Recognition","summary":" This paper presents Ske2Grid, a new representation learning framework for\nimproved skeleton-based action recognition. In Ske2Grid, we define a regular\nconvolution operation upon a novel grid representation of human skeleton, which\nis a compact image-like grid patch constructed and learned through three novel\ndesigns. Specifically, we propose a graph-node index transform (GIT) to\nconstruct a regular grid patch through assigning the nodes in the skeleton\ngraph one by one to the desired grid cells. To ensure that GIT is a bijection\nand enrich the expressiveness of the grid representation, an up-sampling\ntransform (UPT) is learned to interpolate the skeleton graph nodes for filling\nthe grid patch to the full. To resolve the problem when the one-step UPT is\naggressive and further exploit the representation capability of the grid patch\nwith increasing spatial size, a progressive learning strategy (PLS) is proposed\nwhich decouples the UPT into multiple steps and aligns them to multiple paired\nGITs through a compact cascaded design learned progressively. We construct\nnetworks upon prevailing graph convolution networks and conduct experiments on\nsix mainstream skeleton-based action recognition datasets. Experiments show\nthat our Ske2Grid significantly outperforms existing GCN-based solutions under\ndifferent benchmark settings, without bells and whistles. Code and models are\navailable at https://github.com/OSVAI/Ske2Grid\n","authors":["Dongqi Cai","Yangyuxuan Kang","Anbang Yao","Yurong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07571v1.pdf","comment":"The paper of Ske2Grid is published at ICML 2023. Code and models are\n available at https://github.com/OSVAI/Ske2Grid"},{"id":"http://arxiv.org/abs/2308.02804v2","updated":"2023-08-15T04:29:40Z","published":"2023-08-05T06:29:46Z","title":"MiAMix: Enhancing Image Classification through a Multi-stage Augmented\n Mixed Sample Data Augmentation Method","summary":" Despite substantial progress in the field of deep learning, overfitting\npersists as a critical challenge, and data augmentation has emerged as a\nparticularly promising approach due to its capacity to enhance model\ngeneralization in various computer vision tasks. While various strategies have\nbeen proposed, Mixed Sample Data Augmentation (MSDA) has shown great potential\nfor enhancing model performance and generalization. We introduce a novel mixup\nmethod called MiAMix, which stands for Multi-stage Augmented Mixup. MiAMix\nintegrates image augmentation into the mixup framework, utilizes multiple\ndiversified mixing methods concurrently, and improves the mixing method by\nrandomly selecting mixing mask augmentation methods. 
Recent methods utilize\nsaliency information and the MiAMix is designed for computational efficiency as\nwell, reducing additional overhead and offering easy integration into existing\ntraining pipelines. We comprehensively evaluate MiaMix using four image\nbenchmarks and pitting it against current state-of-the-art mixed sample data\naugmentation techniques to demonstrate that MIAMix improves performance without\nheavy computational overhead.\n","authors":["Wen Liang","Youzhi Liang","Jianguo Jia"],"pdf_url":"https://arxiv.org/pdf/2308.02804v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03669v3","updated":"2023-08-15T04:21:36Z","published":"2023-08-07T15:40:34Z","title":"Diffusion Model in Causal Inference with Unmeasured Confounders","summary":" We study how to extend the use of the diffusion model to answer the causal\nquestion from the observational data under the existence of unmeasured\nconfounders. In Pearl's framework of using a Directed Acyclic Graph (DAG) to\ncapture the causal intervention, a Diffusion-based Causal Model (DCM) was\nproposed incorporating the diffusion model to answer the causal questions more\naccurately, assuming that all of the confounders are observed. However,\nunmeasured confounders in practice exist, which hinders DCM from being\napplicable. To alleviate this limitation of DCM, we propose an extended model\ncalled Backdoor Criterion based DCM (BDCM), whose idea is rooted in the\nBackdoor criterion to find the variables in DAG to be included in the decoding\nprocess of the diffusion model so that we can extend DCM to the case with\nunmeasured confounders. Synthetic data experiment demonstrates that our\nproposed model captures the counterfactual distribution more precisely than DCM\nunder the unmeasured confounders.\n","authors":["Tatsuhiro Shimizu"],"pdf_url":"https://arxiv.org/pdf/2308.03669v3.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2212.01197v3","updated":"2023-08-15T04:11:05Z","published":"2022-12-02T14:24:53Z","title":"FedALA: Adaptive Local Aggregation for Personalized Federated Learning","summary":" A key challenge in federated learning (FL) is the statistical heterogeneity\nthat impairs the generalization of the global model on each client. To address\nthis, we propose a method Federated learning with Adaptive Local Aggregation\n(FedALA) by capturing the desired information in the global model for client\nmodels in personalized FL. The key component of FedALA is an Adaptive Local\nAggregation (ALA) module, which can adaptively aggregate the downloaded global\nmodel and local model towards the local objective on each client to initialize\nthe local model before training in each iteration. To evaluate the\neffectiveness of FedALA, we conduct extensive experiments with five benchmark\ndatasets in computer vision and natural language processing domains. 
FedALA\noutperforms eleven state-of-the-art baselines by up to 3.27% in test accuracy.\nFurthermore, we also apply ALA module to other federated learning methods and\nachieve up to 24.19% improvement in test accuracy.\n","authors":["Jianqing Zhang","Yang Hua","Hao Wang","Tao Song","Zhengui Xue","Ruhui Ma","Haibing Guan"],"pdf_url":"https://arxiv.org/pdf/2212.01197v3.pdf","comment":"Accepted by AAAI 2023"},{"id":"http://arxiv.org/abs/2308.07562v1","updated":"2023-08-15T04:09:53Z","published":"2023-08-15T04:09:53Z","title":"Semi-Supervised Learning with Multiple Imputations on Non-Random Missing\n Labels","summary":" Semi-Supervised Learning (SSL) is implemented when algorithms are trained on\nboth labeled and unlabeled data. This is a very common application of ML as it\nis unrealistic to obtain a fully labeled dataset. Researchers have tackled\nthree main issues: missing at random (MAR), missing completely at random\n(MCAR), and missing not at random (MNAR). The MNAR problem is the most\nchallenging of the three as one cannot safely assume that all class\ndistributions are equal. Existing methods, including Class-Aware Imputation\n(CAI) and Class-Aware Propensity (CAP), mostly overlook the non-randomness in\nthe unlabeled data. This paper proposes two new methods of combining multiple\nimputation models to achieve higher accuracy and less bias. 1) We use multiple\nimputation models, create confidence intervals, and apply a threshold to ignore\npseudo-labels with low confidence. 2) Our new method, SSL with De-biased\nImputations (SSL-DI), aims to reduce bias by filtering out inaccurate data and\nfinding a subset that is accurate and reliable. This subset of the larger\ndataset could be imputed into another SSL model, which will be less biased. The\nproposed models have been shown to be effective in both MCAR and MNAR\nsituations, and experimental results show that our methodology outperforms\nexisting methods in terms of classification accuracy and reducing bias.\n","authors":["Jason Lu","Michael Ma","Huaze Xu","Zixi Xu"],"pdf_url":"https://arxiv.org/pdf/2308.07562v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2202.08510v4","updated":"2023-08-15T04:07:36Z","published":"2022-02-17T08:33:52Z","title":"Multi-Scale Hybrid Vision Transformer for Learning Gastric Histology:\n AI-Based Decision Support System for Gastric Cancer Treatment","summary":" Gastric endoscopic screening is an effective way to decide appropriate\ngastric cancer (GC) treatment at an early stage, reducing GC-associated\nmortality rate. Although artificial intelligence (AI) has brought a great\npromise to assist pathologist to screen digitalized whole slide images,\nexisting AI systems are limited in fine-grained cancer subclassifications and\nhave little usability in planning cancer treatment. We propose a practical AI\nsystem that enables five subclassifications of GC pathology, which can be\ndirectly matched to general GC treatment guidance. The AI system is designed to\nefficiently differentiate multi-classes of GC through multi-scale\nself-attention mechanism using 2-stage hybrid Vision Transformer (ViT)\nnetworks, by mimicking the way how human pathologists understand histology. The\nAI system demonstrates reliable diagnostic performance by achieving\nclass-average sensitivity of above 0.85 on a total of 1,212 slides from\nmulticentric cohort. 
Furthermore, AI-assisted pathologists show significantly\nimproved diagnostic sensitivity by 12% in addition to 18% reduced screening\ntime compared to human pathologists. Our results demonstrate that AI-assisted\ngastric endoscopic screening has a great potential for providing presumptive\npathologic opinion and appropriate cancer treatment of gastric cancer in\npractical clinical settings.\n","authors":["Yujin Oh","Go Eun Bae","Kyung-Hee Kim","Min-Kyung Yeo","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2202.08510v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07556v1","updated":"2023-08-15T03:49:59Z","published":"2023-08-15T03:49:59Z","title":"A User-Centered Evaluation of Spanish Text Simplification","summary":" We present an evaluation of text simplification (TS) in Spanish for a\nproduction system, by means of two corpora focused in both complex-sentence and\ncomplex-word identification. We compare the most prevalent Spanish-specific\nreadability scores with neural networks, and show that the latter are\nconsistently better at predicting user preferences regarding TS. As part of our\nanalysis, we find that multilingual models underperform against equivalent\nSpanish-only models on the same task, yet all models focus too often on\nspurious statistical features, such as sentence length. We release the corpora\nin our evaluation to the broader community with the hopes of pushing forward\nthe state-of-the-art in Spanish natural language processing.\n","authors":["Adrian de Wynter","Anthony Hevia","Si-Qing Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07556v1.pdf","comment":"Data at https://github.com/microsoft/BrevE-CLaro"},{"id":"http://arxiv.org/abs/2308.07553v1","updated":"2023-08-15T03:46:41Z","published":"2023-08-15T03:46:41Z","title":"Enhancing the Antidote: Improved Pointwise Certifications against\n Poisoning Attacks","summary":" Poisoning attacks can disproportionately influence model behaviour by making\nsmall changes to the training corpus. While defences against specific poisoning\nattacks do exist, they in general do not provide any guarantees, leaving them\npotentially countered by novel attacks. In contrast, by examining worst-case\nbehaviours Certified Defences make it possible to provide guarantees of the\nrobustness of a sample against adversarial attacks modifying a finite number of\ntraining samples, known as pointwise certification. We achieve this by\nexploiting both Differential Privacy and the Sampled Gaussian Mechanism to\nensure the invariance of prediction for each testing instance against finite\nnumbers of poisoned examples. In doing so, our model provides guarantees of\nadversarial robustness that are more than twice as large as those provided by\nprior certifications.\n","authors":["Shijie Liu","Andrew C. Cullen","Paul Montague","Sarah M. Erfani","Benjamin I. P. Rubinstein"],"pdf_url":"https://arxiv.org/pdf/2308.07553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02822v2","updated":"2023-08-15T03:13:38Z","published":"2023-06-05T12:20:40Z","title":"Discovering Dynamic Causal Space for DAG Structure Learning","summary":" Discovering causal structure from purely observational data (i.e., causal\ndiscovery), aiming to identify causal relationships among variables, is a\nfundamental task in machine learning. The recent invention of differentiable\nscore-based DAG learners is a crucial enabler, which reframes the combinatorial\noptimization problem into a differentiable optimization with a DAG constraint\nover directed graph space. 
Despite their great success, these cutting-edge DAG\nlearners incorporate DAG-ness independent score functions to evaluate the\ndirected graph candidates, lacking in considering graph structure. As a result,\nmeasuring the data fitness alone regardless of DAG-ness inevitably leads to\ndiscovering suboptimal DAGs and model vulnerabilities. Towards this end, we\npropose a dynamic causal space for DAG structure learning, coined CASPER, that\nintegrates the graph structure into the score function as a new measure in the\ncausal space to faithfully reflect the causal distance between estimated and\nground truth DAG. CASPER revises the learning process as well as enhances the\nDAG structure learning via adaptive attention to DAG-ness. Grounded by\nempirical visualization, CASPER, as a space, satisfies a series of desired\nproperties, such as structure awareness and noise robustness. Extensive\nexperiments on both synthetic and real-world datasets clearly validate the\nsuperiority of our CASPER over the state-of-the-art causal discovery methods in\nterms of accuracy and robustness.\n","authors":["Fangfu Liu","Wenchang Ma","An Zhang","Xiang Wang","Yueqi Duan","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2306.02822v2.pdf","comment":"Accepted by KDD 2023. Our codes are available at\n https://github.com/liuff19/CASPER"},{"id":"http://arxiv.org/abs/2308.07538v1","updated":"2023-08-15T02:40:32Z","published":"2023-08-15T02:40:32Z","title":"Domain Adaptation via Minimax Entropy for Real/Bogus Classification of\n Astronomical Alerts","summary":" Time domain astronomy is advancing towards the analysis of multiple massive\ndatasets in real time, prompting the development of multi-stream machine\nlearning models. In this work, we study Domain Adaptation (DA) for real/bogus\nclassification of astronomical alerts using four different datasets: HiTS, DES,\nATLAS, and ZTF. We study the domain shift between these datasets, and improve a\nnaive deep learning classification model by using a fine tuning approach and\nsemi-supervised deep DA via Minimax Entropy (MME). We compare the balanced\naccuracy of these models for different source-target scenarios. We find that\nboth the fine tuning and MME models improve significantly the base model with\nas few as one labeled item per class coming from the target dataset, but that\nthe MME does not compromise its performance on the source dataset.\n","authors":["Guillermo Cabrera-Vives","César Bolivar","Francisco Förster","Alejandra M. Muñoz Arancibia","Manuel Pérez-Carrasco","Esteban Reyes"],"pdf_url":"https://arxiv.org/pdf/2308.07538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07536v1","updated":"2023-08-15T02:37:11Z","published":"2023-08-15T02:37:11Z","title":"Projection-Free Methods for Stochastic Simple Bilevel Optimization with\n Convex Lower-level Problem","summary":" In this paper, we study a class of stochastic bilevel optimization problems,\nalso known as stochastic simple bilevel optimization, where we minimize a\nsmooth stochastic objective function over the optimal solution set of another\nstochastic convex optimization problem. We introduce novel stochastic bilevel\noptimization methods that locally approximate the solution set of the\nlower-level problem via a stochastic cutting plane, and then run a conditional\ngradient update with variance reduction techniques to control the error induced\nby using stochastic gradients. 
For the case that the upper-level function is\nconvex, our method requires\n$\\tilde{\\mathcal{O}}(\\max\\{1/\\epsilon_f^{2},1/\\epsilon_g^{2}\\}) $ stochastic\noracle queries to obtain a solution that is $\\epsilon_f$-optimal for the\nupper-level and $\\epsilon_g$-optimal for the lower-level. This guarantee\nimproves the previous best-known complexity of\n$\\mathcal{O}(\\max\\{1/\\epsilon_f^{4},1/\\epsilon_g^{4}\\})$. Moreover, for the\ncase that the upper-level function is non-convex, our method requires at most\n$\\tilde{\\mathcal{O}}(\\max\\{1/\\epsilon_f^{3},1/\\epsilon_g^{3}\\}) $ stochastic\noracle queries to find an $(\\epsilon_f, \\epsilon_g)$-stationary point. In the\nfinite-sum setting, we show that the number of stochastic oracle calls required\nby our method is $\\tilde{\\mathcal{O}}(\\sqrt{n}/\\epsilon)$ and\n$\\tilde{\\mathcal{O}}(\\sqrt{n}/\\epsilon^{2})$ for the convex and non-convex\nsettings, respectively, where $\\epsilon=\\min \\{\\epsilon_f,\\epsilon_g\\}$.\n","authors":["Jincheng Cao","Ruichen Jiang","Nazanin Abolfazli","Erfan Yazdandoost Hamedani","Aryan Mokhtari"],"pdf_url":"https://arxiv.org/pdf/2308.07536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.03758v4","updated":"2023-08-15T02:31:59Z","published":"2021-02-07T09:45:15Z","title":"Non-stationary Online Learning with Memory and Non-stochastic Control","summary":" We study the problem of Online Convex Optimization (OCO) with memory, which\nallows loss functions to depend on past decisions and thus captures temporal\neffects of learning problems. In this paper, we introduce dynamic policy regret\nas the performance measure to design algorithms robust to non-stationary\nenvironments, which competes algorithms' decisions with a sequence of changing\ncomparators. We propose a novel algorithm for OCO with memory that provably\nenjoys an optimal dynamic policy regret in terms of time horizon,\nnon-stationarity measure, and memory length. The key technical challenge is how\nto control the switching cost, the cumulative movements of player's decisions,\nwhich is neatly addressed by a novel switching-cost-aware online ensemble\napproach equipped with a new meta-base decomposition of dynamic policy regret\nand a careful design of meta-learner and base-learner that explicitly\nregularizes the switching cost. The results are further applied to tackle\nnon-stationarity in online non-stochastic control (Agarwal et al., 2019), i.e.,\ncontrolling a linear dynamical system with adversarial disturbance and convex\ncost functions. We derive a novel gradient-based controller with dynamic policy\nregret guarantees, which is the first controller provably competitive to a\nsequence of changing policies for online non-stochastic control.\n","authors":["Peng Zhao","Yu-Hu Yan","Yu-Xiang Wang","Zhi-Hua Zhou"],"pdf_url":"https://arxiv.org/pdf/2102.03758v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08325v3","updated":"2023-08-15T02:22:17Z","published":"2023-06-14T07:54:53Z","title":"GCformer: An Efficient Framework for Accurate and Scalable Long-Term\n Multivariate Time Series Forecasting","summary":" Transformer-based models have emerged as promising tools for time series\nforecasting.\n However, these models cannot make accurate predictions for long input time\nseries. On the one hand, they fail to capture global dependencies within time\nseries data. 
On the other hand, the long input sequence usually leads to large\nmodel size and high time complexity.\n To address these limitations, we present GCformer, which combines a\nstructured global convolutional branch for processing long input sequences with\na local Transformer-based branch for capturing short, recent signals. A\ncohesive framework for a global convolution kernel has been introduced,\nutilizing three distinct parameterization methods. The selected structured\nconvolutional kernel in the global branch has been specifically crafted with\nsublinear complexity, thereby allowing for the efficient and effective\nprocessing of lengthy and noisy input signals. Empirical studies on six\nbenchmark datasets demonstrate that GCformer outperforms state-of-the-art\nmethods, reducing MSE error in multivariate time series benchmarks by 4.38% and\nmodel parameters by 61.92%. In particular, the global convolutional branch can\nserve as a plug-in block to enhance the performance of other models, with an\naverage improvement of 31.93\\%, including various recently published\nTransformer-based models. Our code is publicly available at\nhttps://github.com/zyj-111/GCformer.\n","authors":["YanJun Zhao","Ziqing Ma","Tian Zhou","Liang Sun","Mengni Ye","Yi Qian"],"pdf_url":"https://arxiv.org/pdf/2306.08325v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07893v2","updated":"2023-08-15T02:21:20Z","published":"2023-07-15T22:13:36Z","title":"Anomaly Detection in Automated Fibre Placement: Learning with Data\n Limitations","summary":" Conventional defect detection systems in Automated Fibre Placement (AFP)\ntypically rely on end-to-end supervised learning, necessitating a substantial\nnumber of labelled defective samples for effective training. However, the\nscarcity of such labelled data poses a challenge. To overcome this limitation,\nwe present a comprehensive framework for defect detection and localization in\nAutomated Fibre Placement. Our approach combines unsupervised deep learning and\nclassical computer vision algorithms, eliminating the need for labelled data or\nmanufacturing defect samples. It efficiently detects various surface issues\nwhile requiring fewer images of composite parts for training. Our framework\nemploys an innovative sample extraction method leveraging AFP's inherent\nsymmetry to expand the dataset. By inputting a depth map of the fibre layup\nsurface, we extract local samples aligned with each composite strip (tow).\nThese samples are processed through an autoencoder, trained on normal samples\nfor precise reconstructions, highlighting anomalies through reconstruction\nerrors. Aggregated values form an anomaly map for insightful visualization. The\nframework employs blob detection on this map to locate manufacturing defects.\nThe experimental findings reveal that despite training the autoencoder with a\nlimited number of images, our proposed method exhibits satisfactory detection\naccuracy and accurately identifies defect locations. 
Our framework demonstrates\ncomparable performance to existing methods, while also offering the advantage\nof detecting all types of anomalies without relying on an extensive labelled\ndataset of defects.\n","authors":["Assef Ghamisi","Todd Charter","Li Ji","Maxime Rivard","Gil Lund","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2307.07893v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07527v1","updated":"2023-08-15T01:48:11Z","published":"2023-08-15T01:48:11Z","title":"FeatGeNN: Improving Model Performance for Tabular Data with\n Correlation-based Feature Extraction","summary":" Automated Feature Engineering (AutoFE) has become an important task for any\nmachine learning project, as it can help improve model performance and gain\nmore information for statistical analysis. However, most current approaches for\nAutoFE rely on manual feature creation or use methods that can generate a large\nnumber of features, which can be computationally intensive and lead to\noverfitting. To address these challenges, we propose a novel convolutional\nmethod called FeatGeNN that extracts and creates new features using correlation\nas a pooling function. Unlike traditional pooling functions like max-pooling,\ncorrelation-based pooling considers the linear relationship between the\nfeatures in the data matrix, making it more suitable for tabular data. We\nevaluate our method on various benchmark datasets and demonstrate that FeatGeNN\noutperforms existing AutoFE approaches regarding model performance. Our results\nsuggest that correlation-based pooling can be a promising alternative to\nmax-pooling for AutoFE in tabular data applications.\n","authors":["Sammuel Ramos Silva","Rodrigo Silva"],"pdf_url":"https://arxiv.org/pdf/2308.07527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10656v2","updated":"2023-08-15T01:26:57Z","published":"2023-06-19T00:42:35Z","title":"Virtual Human Generative Model: Masked Modeling Approach for Learning\n Human Characteristics","summary":" Identifying the relationship between healthcare attributes, lifestyles, and\npersonality is vital for understanding and improving physical and mental\nconditions. Machine learning approaches are promising for modeling their\nrelationships and offering actionable suggestions. In this paper, we propose\nVirtual Human Generative Model (VHGM), a machine learning model for estimating\nattributes about healthcare, lifestyles, and personalities. VHGM is a deep\ngenerative model trained with masked modeling to learn the joint distribution\nof attributes conditioned on known ones. Using heterogeneous tabular datasets,\nVHGM learns more than 1,800 attributes efficiently. We numerically evaluate the\nperformance of VHGM and its training techniques. 
As a proof-of-concept of VHGM,\nwe present several applications demonstrating user scenarios, such as virtual\nmeasurements of healthcare attributes and hypothesis verifications of\nlifestyles.\n","authors":["Kenta Oono","Nontawat Charoenphakdee","Kotatsu Bito","Zhengyan Gao","Yoshiaki Ota","Shoichiro Yamaguchi","Yohei Sugawara","Shin-ichi Maeda","Kunihiko Miyoshi","Yuki Saito","Koki Tsuda","Hiroshi Maruyama","Kohei Hayashi"],"pdf_url":"https://arxiv.org/pdf/2306.10656v2.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.07523v1","updated":"2023-08-15T01:25:35Z","published":"2023-08-15T01:25:35Z","title":"Potential of Deep Operator Networks in Digital Twin-enabling Technology\n for Nuclear System","summary":" This research introduces the Deep Operator Network (DeepONet) as a robust\nsurrogate modeling method within the context of digital twin (DT) systems for\nnuclear engineering. With the increasing importance of nuclear energy as a\ncarbon-neutral solution, adopting DT technology has become crucial to enhancing\noperational efficiencies, safety, and predictive capabilities in nuclear\nengineering applications. DeepONet exhibits remarkable prediction accuracy,\noutperforming traditional ML methods. Through extensive benchmarking and\nevaluation, this study showcases the scalability and computational efficiency\nof DeepONet in solving a challenging particle transport problem. By taking\nfunctions as input data and constructing the operator $G$ from training data,\nDeepONet can handle diverse and complex scenarios effectively. However, the\napplication of DeepONet also reveals challenges related to optimal sensor\nplacement and model evaluation, critical aspects of real-world implementation.\nAddressing these challenges will further enhance the method's practicality and\nreliability. Overall, DeepONet presents a promising and transformative tool for\nnuclear engineering research and applications. Its accurate prediction and\ncomputational efficiency capabilities can revolutionize DT systems, advancing\nnuclear engineering research. This study marks an important step towards\nharnessing the power of surrogate modeling techniques in critical engineering\ndomains.\n","authors":["Kazuma Kobayashi","Syed Bahauddin Alam"],"pdf_url":"https://arxiv.org/pdf/2308.07523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07520v1","updated":"2023-08-15T01:23:42Z","published":"2023-08-15T01:23:42Z","title":"Nonlinearity, Feedback and Uniform Consistency in Causal Structural\n Learning","summary":" The goal of Causal Discovery is to find automated search methods for learning\ncausal structures from observational data. In some cases, all variables of the\ncausal mechanism of interest are measured, and the task is to predict the\neffects one measured variable has on another. In contrast, sometimes the\nvariables of primary interest are not directly observable but instead inferred\nfrom their manifestations in the data. These are referred to as latent\nvariables. One commonly known example is the psychological construct of\nintelligence, which cannot be directly measured, so researchers try to assess it\nthrough various indicators such as IQ tests. In this case, causal discovery\nalgorithms can uncover underlying patterns and structures to reveal the causal\nconnections between the latent variables and between the latent and observed\nvariables. 
This thesis focuses on two questions in causal discovery: providing\nan alternative definition of k-Triangle Faithfulness that (i) is weaker than\nstrong faithfulness when applied to the Gaussian family of distributions, (ii)\ncan be applied to non-Gaussian families of distributions, and (iii) under the\nassumption that the modified version of Strong Faithfulness holds, can be used\nto show the uniform consistency of a modified causal discovery algorithm;\nrelaxing the sufficiency assumption to learn causal structures with latent\nvariables. Given the importance of inferring cause-and-effect relationships for\nunderstanding and forecasting complex systems, the work in this thesis of\nrelaxing various simplification assumptions is expected to extend the causal\ndiscovery method to be applicable in a wider range with diversified causal\nmechanism and statistical phenomena.\n","authors":["Shuyan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08413v4","updated":"2023-08-15T01:11:26Z","published":"2022-11-15T18:51:20Z","title":"Decentralized Federated Learning: Fundamentals, State of the Art,\n Frameworks, Trends, and Challenges","summary":" In recent years, Federated Learning (FL) has gained relevance in training\ncollaborative models without sharing sensitive data. Since its birth,\nCentralized FL (CFL) has been the most common approach in the literature, where\na central entity creates a global model. However, a centralized approach leads\nto increased latency due to bottlenecks, heightened vulnerability to system\nfailures, and trustworthiness concerns affecting the entity responsible for the\nglobal model creation. Decentralized Federated Learning (DFL) emerged to\naddress these concerns by promoting decentralized model aggregation and\nminimizing reliance on centralized architectures. However, despite the work\ndone in DFL, the literature has not (i) studied the main aspects\ndifferentiating DFL and CFL; (ii) analyzed DFL frameworks to create and\nevaluate new solutions; and (iii) reviewed application scenarios using DFL.\nThus, this article identifies and analyzes the main fundamentals of DFL in\nterms of federation architectures, topologies, communication mechanisms,\nsecurity approaches, and key performance indicators. Additionally, the paper at\nhand explores existing mechanisms to optimize critical DFL fundamentals. Then,\nthe most relevant features of the current DFL frameworks are reviewed and\ncompared. After that, it analyzes the most used DFL application scenarios,\nidentifying solutions based on the fundamentals and frameworks previously\ndefined. Finally, the evolution of existing DFL solutions is studied to provide\na list of trends, lessons learned, and open challenges.\n","authors":["Enrique Tomás Martínez Beltrán","Mario Quiles Pérez","Pedro Miguel Sánchez Sánchez","Sergio López Bernal","Gérôme Bovet","Manuel Gil Pérez","Gregorio Martínez Pérez","Alberto Huertas Celdrán"],"pdf_url":"https://arxiv.org/pdf/2211.08413v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06801v2","updated":"2023-08-15T01:08:11Z","published":"2023-08-13T16:04:03Z","title":"SAILOR: Structural Augmentation Based Tail Node Representation Learning","summary":" Graph Neural Networks (GNNs) have achieved state-of-the-art performance in\nrepresentation learning for graphs recently. However, the effectiveness of\nGNNs, which capitalize on the key operation of message propagation, highly\ndepends on the quality of the topology structure. 
Most of the graphs in\nreal-world scenarios follow a long-tailed distribution on their node degrees,\nthat is, a vast majority of the nodes in the graph are tail nodes with only a\nfew connected edges. GNNs produce inferior node representations for tail nodes\nsince they lack structural information. In the pursuit of promoting the\nexpressiveness of GNNs for tail nodes, we explore how the deficiency of\nstructural information deteriorates the performance of tail nodes and propose a\ngeneral Structural Augmentation based taIL nOde Representation learning\nframework, dubbed as SAILOR, which can jointly learn to augment the graph\nstructure and extract more informative representations for tail nodes.\nExtensive experiments on public benchmark datasets demonstrate that SAILOR can\nsignificantly improve the tail node representations and outperform the\nstate-of-the-art baselines.\n","authors":["Jie Liao","Jintang Li","Liang Chen","Bingzhe Wu","Yatao Bian","Zibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.06801v2.pdf","comment":"Accepted by CIKM 2023; Code is available at\n https://github.com/Jie-Re/SAILOR"},{"id":"http://arxiv.org/abs/2301.09637v2","updated":"2023-08-15T01:05:21Z","published":"2023-01-23T18:59:59Z","title":"InfiniCity: Infinite-Scale City Synthesis","summary":" Toward infinite-scale 3D city synthesis, we propose a novel framework,\nInfiniCity, which constructs and renders an unconstrainedly large and\n3D-grounded environment from random noises. InfiniCity decomposes the seemingly\nimpractical task into three feasible modules, taking advantage of both 2D and\n3D data. First, an infinite-pixel image synthesis module generates\narbitrary-scale 2D maps from the bird's-eye view. Next, an octree-based voxel\ncompletion module lifts the generated 2D map to 3D octrees. Finally, a\nvoxel-based neural rendering module texturizes the voxels and renders 2D\nimages. InfiniCity can thus synthesize arbitrary-scale and traversable 3D city\nenvironments, and allow flexible and interactive editing from users. We\nquantitatively and qualitatively demonstrate the efficacy of the proposed\nframework. Project page: https://hubert0527.github.io/infinicity/\n","authors":["Chieh Hubert Lin","Hsin-Ying Lee","Willi Menapace","Menglei Chai","Aliaksandr Siarohin","Ming-Hsuan Yang","Sergey Tulyakov"],"pdf_url":"https://arxiv.org/pdf/2301.09637v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18378v2","updated":"2023-08-15T00:57:01Z","published":"2023-05-28T06:30:29Z","title":"Disentanglement via Latent Quantization","summary":" In disentangled representation learning, a model is asked to tease apart a\ndataset's underlying sources of variation and represent them independently of\none another. Since the model is provided with no ground truth information about\nthese sources, inductive biases take a paramount role in enabling\ndisentanglement. In this work, we construct an inductive bias towards encoding\nto and decoding from an organized latent space. Concretely, we do this by (i)\nquantizing the latent space into discrete code vectors with a separate\nlearnable scalar codebook per dimension and (ii) applying strong model\nregularization via an unusually high weight decay. Intuitively, the latent\nspace design forces the encoder to combinatorially construct codes from a small\nnumber of distinct scalar values, which in turn enables the decoder to assign a\nconsistent meaning to each value. Regularization then serves to drive the model\ntowards this parsimonious strategy. 
We demonstrate the broad applicability of\nthis approach by adding it to both basic data-reconstructing (vanilla\nautoencoder) and latent-reconstructing (InfoGAN) generative models. For\nreliable evaluation, we also propose InfoMEC, a new set of metrics for\ndisentanglement that is cohesively grounded in information theory and fixes\nwell-established shortcomings in previous metrics. Together with\nregularization, latent quantization dramatically improves the modularity and\nexplicitness of learned representations on a representative suite of benchmark\ndatasets. In particular, our quantized-latent autoencoder (QLAE) consistently\noutperforms strong methods from prior work in these key disentanglement\nproperties without compromising data reconstruction.\n","authors":["Kyle Hsu","Will Dorrell","James C. R. Whittington","Jiajun Wu","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2305.18378v2.pdf","comment":"25 pages, 15 figures, code available at\n https://github.com/kylehkhsu/disentangle"},{"id":"http://arxiv.org/abs/2308.07511v1","updated":"2023-08-15T00:30:58Z","published":"2023-08-15T00:30:58Z","title":"Distilling Knowledge from Resource Management Algorithms to Neural\n Networks: A Unified Training Assistance Approach","summary":" As a fundamental problem, numerous methods are dedicated to the optimization\nof signal-to-interference-plus-noise ratio (SINR), in a multi-user setting.\nAlthough traditional model-based optimization methods achieve strong\nperformance, the high complexity raises the research of neural network (NN)\nbased approaches to trade-off the performance and complexity. To fully leverage\nthe high performance of traditional model-based methods and the low complexity\nof the NN-based method, a knowledge distillation (KD) based algorithm\ndistillation (AD) method is proposed in this paper to improve the performance\nand convergence speed of the NN-based method, where traditional SINR\noptimization methods are employed as ``teachers\" to assist the training of NNs,\nwhich are ``students\", thus enhancing the performance of unsupervised and\nreinforcement learning techniques. This approach aims to alleviate common\nissues encountered in each of these training paradigms, including the\ninfeasibility of obtaining optimal solutions as labels and overfitting in\nsupervised learning, ensuring higher convergence performance in unsupervised\nlearning, and improving training efficiency in reinforcement learning.\nSimulation results demonstrate the enhanced performance of the proposed\nAD-based methods compared to traditional learning methods. Remarkably, this\nresearch paves the way for the integration of traditional optimization insights\nand emerging NN techniques in wireless communication system optimization.\n","authors":["Longfei Ma","Nan Cheng","Xiucheng Wang","Zhisheng Yin","Haibo Zhou","Wei Quan"],"pdf_url":"https://arxiv.org/pdf/2308.07511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07505v1","updated":"2023-08-15T00:08:43Z","published":"2023-08-15T00:08:43Z","title":"Data Race Detection Using Large Language Models","summary":" Large language models (LLMs) are demonstrating significant promise as an\nalternate strategy to facilitate analyses and optimizations of high-performance\ncomputing programs, circumventing the need for resource-intensive manual tool\ncreation. In this paper, we explore a novel LLM-based data race detection\napproach combining prompting engineering and fine-tuning techniques. 
We create\na dedicated dataset named DRB-ML, which is derived from DataRaceBench, with\nfine-grain labels showing the presence of data race pairs and their associated\nvariables, line numbers, and read/write information. DRB-ML is then used to\nevaluate representative LLMs and fine-tune open-source ones. Our experiment\nshows that LLMs can be a viable approach to data race detection. However, they\nstill cannot compete with traditional data race detection tools when we need\ndetailed information about variable pairs causing data races.\n","authors":["Le Chen","Xianzhong Ding","Murali Emani","Tristan Vanderbruggen","Pei-hung Lin","Chuanhua Liao"],"pdf_url":"https://arxiv.org/pdf/2308.07505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08072v1","updated":"2023-08-15T23:56:44Z","published":"2023-08-15T23:56:44Z","title":"Decentralized Graph Neural Network for Privacy-Preserving Recommendation","summary":" Building a graph neural network (GNN)-based recommender system without\nviolating user privacy proves challenging. Existing methods can be divided into\nfederated GNNs and decentralized GNNs. But both methods have undesirable\neffects, i.e., low communication efficiency and privacy leakage. This paper\nproposes DGREC, a novel decentralized GNN for privacy-preserving\nrecommendations, where users can choose to publicize their interactions. It\nincludes three stages, i.e., graph construction, local gradient calculation,\nand global gradient passing. The first stage builds a local inner-item\nhypergraph for each user and a global inter-user graph. The second stage models\nuser preference and calculates gradients on each local device. The third stage\ndesigns a local differential privacy mechanism named secure gradient-sharing,\nwhich proves strong privacy-preserving of users' private data. We conduct\nextensive experiments on three public datasets to validate the consistent\nsuperiority of our framework.\n","authors":["Xiaolin Zheng","Zhongyu Wang","Chaochao Chen","Jiashu Qian","Yao Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08071v1","updated":"2023-08-15T23:49:07Z","published":"2023-08-15T23:49:07Z","title":"Freshness or Accuracy, Why Not Both? Addressing Delayed Feedback via\n Dynamic Graph Neural Networks","summary":" The delayed feedback problem is one of the most pressing challenges in\npredicting the conversion rate since users' conversions are always delayed in\nonline commercial systems. Although new data are beneficial for continuous\ntraining, without complete feedback information, i.e., conversion labels,\ntraining algorithms may suffer from overwhelming fake negatives. Existing\nmethods tend to use multitask learning or design data pipelines to solve the\ndelayed feedback problem. However, these methods have a trade-off between data\nfreshness and label accuracy. In this paper, we propose Delayed Feedback\nModeling by Dynamic Graph Neural Network (DGDFEM). It includes three stages,\ni.e., preparing a data pipeline, building a dynamic graph, and training a CVR\nprediction model. In the model training, we propose a novel graph convolutional\nmethod named HLGCN, which leverages both high-pass and low-pass filters to deal\nwith conversion and non-conversion relationships. The proposed method achieves\nboth data freshness and label accuracy. 
We conduct extensive experiments on\nthree industry datasets, which validate the consistent superiority of our\nmethod.\n","authors":["Xiaolin Zheng","Zhongyu Wang","Chaochao Chen","Feng Zhu","Jiashu Qian"],"pdf_url":"https://arxiv.org/pdf/2308.08071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08070v1","updated":"2023-08-15T23:46:44Z","published":"2023-08-15T23:46:44Z","title":"Max-affine regression via first-order methods","summary":" We consider regression of a max-affine model that produces a piecewise linear\nmodel by combining affine models via the max function. The max-affine model\nubiquitously arises in applications in signal processing and statistics\nincluding multiclass classification, auction problems, and convex regression.\nIt also generalizes phase retrieval and learning rectifier linear unit\nactivation functions. We present a non-asymptotic convergence analysis of\ngradient descent (GD) and mini-batch stochastic gradient descent (SGD) for\nmax-affine regression when the model is observed at random locations following\nthe sub-Gaussianity and an anti-concentration with additive sub-Gaussian noise.\nUnder these assumptions, a suitably initialized GD and SGD converge linearly to\na neighborhood of the ground truth specified by the corresponding error bound.\nWe provide numerical results that corroborate the theoretical finding.\nImportantly, SGD not only converges faster in run time with fewer observations\nthan alternating minimization and GD in the noiseless scenario but also\noutperforms them in low-sample scenarios with noise.\n","authors":["Seonho Kim","Kiryung Lee"],"pdf_url":"https://arxiv.org/pdf/2308.08070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08069v1","updated":"2023-08-15T23:25:52Z","published":"2023-08-15T23:25:52Z","title":"A Reinforcement Learning Approach for Performance-aware Reduction in\n Power Consumption of Data Center Compute Nodes","summary":" As Exascale computing becomes a reality, the energy needs of compute nodes in\ncloud data centers will continue to grow. A common approach to reducing this\nenergy demand is to limit the power consumption of hardware components when\nworkloads are experiencing bottlenecks elsewhere in the system. However,\ndesigning a resource controller capable of detecting and limiting power\nconsumption on-the-fly is a complex issue and can also adversely impact\napplication performance. In this paper, we explore the use of Reinforcement\nLearning (RL) to design a power capping policy on cloud compute nodes using\nobservations on current power consumption and instantaneous application\nperformance (heartbeats). By leveraging the Argo Node Resource Management (NRM)\nsoftware stack in conjunction with the Intel Running Average Power Limit (RAPL)\nhardware control mechanism, we design an agent to control the maximum supplied\npower to processors without compromising on application performance. 
Employing\na Proximal Policy Optimization (PPO) agent to learn an optimal policy on a\nmathematical model of the compute nodes, we demonstrate and evaluate using the\nSTREAM benchmark how a trained agent running on actual hardware can take\nactions by balancing power consumption and application performance.\n","authors":["Akhilesh Raj","Swann Perarnau","Aniruddha Gokhale"],"pdf_url":"https://arxiv.org/pdf/2308.08069v1.pdf","comment":"This manuscript consists of a total of 10 pages with 8 figures and 3\n tables and is awaiting its publication at IC2E-2023"},{"id":"http://arxiv.org/abs/2210.01549v4","updated":"2023-08-15T23:10:57Z","published":"2022-10-04T12:20:21Z","title":"Diffusion Models for Graphs Benefit From Discrete State Spaces","summary":" Denoising diffusion probabilistic models and score-matching models have\nproven to be very powerful for generative tasks. While these approaches have\nalso been applied to the generation of discrete graphs, they have, so far,\nrelied on continuous Gaussian perturbations. Instead, in this work, we suggest\nusing discrete noise for the forward Markov process. This ensures that in every\nintermediate step the graph remains discrete. Compared to the previous\napproach, our experimental results on four datasets and multiple architectures\nshow that using a discrete noising process results in higher quality generated\nsamples indicated with an average MMDs reduced by a factor of 1.5. Furthermore,\nthe number of denoising steps is reduced from 1000 to 32 steps, leading to a 30\ntimes faster sampling procedure.\n","authors":["Kilian Konstantin Haefeli","Karolis Martinkus","Nathanaël Perraudin","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2210.01549v4.pdf","comment":"Presented at the First Learning on Graphs Conference (LoG 2022) and\n the NeurIPS 2022 New Frontiers in Graph Learning Workshop (NeurIPS\n GLFrontiers 2022)"},{"id":"http://arxiv.org/abs/2212.01729v2","updated":"2023-08-15T23:08:24Z","published":"2022-12-04T02:59:32Z","title":"Time-Synchronized Full System State Estimation Considering Practical\n Implementation Challenges","summary":" As phasor measurement units (PMUs) are usually placed on the highest voltage\nbuses, many lower voltage levels of the bulk power system are not observed by\nthem. This lack of visibility makes time-synchronized state estimation of the\nfull system a challenging problem. We propose a Deep Neural network-based State\nEstimator (DeNSE) to overcome this problem. The DeNSE employs a Bayesian\nframework to indirectly combine inferences drawn from slow timescale but\nwidespread supervisory control and data acquisition (SCADA) data with fast\ntimescale but local PMU data to attain sub-second situational awareness of the\nentire system. The practical utility of the proposed approach is demonstrated\nby considering topology changes, non-Gaussian measurement noise, and bad data\ndetection and correction. The results obtained using the IEEE 118-bus system\nshow the superiority of the DeNSE over a purely SCADA state estimator, a\nSCADA-PMU hybrid state estimator, and a PMU-only linear state estimator from a\ntechno-economic viability perspective. 
Lastly, the scalability of the DeNSE is\nproven by performing state estimation on a large and realistic 2000-bus\nSynthetic Texas system.\n","authors":["Antos Cheeramban Varghese","Hritik Shah","Behrouz Azimian","Anamitra Pal","Evangelos Farantatos"],"pdf_url":"https://arxiv.org/pdf/2212.01729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00864v2","updated":"2023-08-15T22:31:40Z","published":"2023-08-01T22:25:40Z","title":"PeRP: Personalized Residual Policies For Congestion Mitigation Through\n Co-operative Advisory Systems","summary":" Intelligent driving systems can be used to mitigate congestion through simple\nactions, thus improving many socioeconomic factors such as commute time and gas\ncosts. However, these systems assume precise control over autonomous vehicle\nfleets, and are hence limited in practice as they fail to account for\nuncertainty in human behavior. Piecewise Constant (PC) Policies address these\nissues by structurally modeling the likeness of human driving to reduce traffic\ncongestion in dense scenarios to provide action advice to be followed by human\ndrivers. However, PC policies assume that all drivers behave similarly. To this\nend, we develop a co-operative advisory system based on PC policies with a\nnovel driver trait conditioned Personalized Residual Policy, PeRP. PeRP advises\ndrivers to behave in ways that mitigate traffic congestion. We first infer the\ndriver's intrinsic traits on how they follow instructions in an unsupervised\nmanner with a variational autoencoder. Then, a policy conditioned on the\ninferred trait adapts the action of the PC policy to provide the driver with a\npersonalized recommendation. Our system is trained in simulation with novel\ndriver modeling of instruction adherence. We show that our approach\nsuccessfully mitigates congestion while adapting to different driver behaviors,\nwith 4 to 22% improvement in average speed over baselines.\n","authors":["Aamir Hasan","Neeloy Chakraborty","Haonan Chen","Jung-Hoon Cho","Cathy Wu","Katherine Driggs-Campbell"],"pdf_url":"https://arxiv.org/pdf/2308.00864v2.pdf","comment":"Accepted to ITSC 2023. Additional material and code is available at\n the project webpage: https://sites.google.com/illinois.edu/perp"},{"id":"http://arxiv.org/abs/2308.08061v1","updated":"2023-08-15T22:26:58Z","published":"2023-08-15T22:26:58Z","title":"The Costly Dilemma: Generalization, Evaluation and Cost-Optimal\n Deployment of Large Language Models","summary":" When deploying machine learning models in production for any\nproduct/application, there are three properties that are commonly desired.\nFirst, the models should be generalizable, in that we can extend them to further\nuse cases as our knowledge of the domain area develops. Second, they should be\nevaluable, so that there are clear metrics for performance and the calculation\nof those metrics in production settings is feasible. Finally, the deployment\nshould be cost-optimal as far as possible. In this paper we propose that these\nthree objectives (i.e. generalization, evaluation and cost-optimality) can\noften be relatively orthogonal and that for large language models, despite\ntheir performance over conventional NLP models, enterprises need to carefully\nassess all the three factors before making substantial investments in this\ntechnology. 
We propose a framework for generalization, evaluation and\ncost-modeling specifically tailored to large language models, offering insights\ninto the intricacies of development, deployment and management for these large\nlanguage models.\n","authors":["Abi Aryan","Aakash Kumar Nain","Andrew McMahon","Lucas Augusto Meyer","Harpreet Singh Sahota"],"pdf_url":"https://arxiv.org/pdf/2308.08061v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2308.08060v1","updated":"2023-08-15T22:25:15Z","published":"2023-08-15T22:25:15Z","title":"Robust Bayesian Tensor Factorization with Zero-Inflated Poisson Model\n and Consensus Aggregation","summary":" Tensor factorizations (TF) are powerful tools for the efficient\nrepresentation and analysis of multidimensional data. However, classic TF\nmethods based on maximum likelihood estimation underperform when applied to\nzero-inflated count data, such as single-cell RNA sequencing (scRNA-seq) data.\nAdditionally, the stochasticity inherent in TFs results in factors that vary\nacross repeated runs, making interpretation and reproducibility of the results\nchallenging. In this paper, we introduce Zero Inflated Poisson Tensor\nFactorization (ZIPTF), a novel approach for the factorization of\nhigh-dimensional count data with excess zeros. To address the challenge of\nstochasticity, we introduce Consensus Zero Inflated Poisson Tensor\nFactorization (C-ZIPTF), which combines ZIPTF with a consensus-based\nmeta-analysis. We evaluate our proposed ZIPTF and C-ZIPTF on synthetic\nzero-inflated count data and synthetic and real scRNA-seq data. ZIPTF\nconsistently outperforms baseline matrix and tensor factorization methods in\nterms of reconstruction accuracy for zero-inflated data. When the probability\nof excess zeros is high, ZIPTF achieves up to $2.4\\times$ better accuracy.\nAdditionally, C-ZIPTF significantly improves the consistency and accuracy of\nthe factorization. When tested on both synthetic and real scRNA-seq data, ZIPTF\nand C-ZIPTF consistently recover known and biologically meaningful gene\nexpression programs.\n","authors":["Daniel Chafamo","Vignesh Shanmugam","Neriman Tokcan"],"pdf_url":"https://arxiv.org/pdf/2308.08060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08055v1","updated":"2023-08-15T21:50:40Z","published":"2023-08-15T21:50:40Z","title":"Simple online learning with consistency oracle","summary":" We consider online learning in the model where a learning algorithm can\naccess the class only via the consistency oracle -- an oracle, that, at any\nmoment, can give a function from the class that agrees with all examples seen\nso far. This model was recently considered by Assos et al. (COLT'23). It is\nmotivated by the fact that standard methods of online learning rely on\ncomputing the Littlestone dimension of subclasses, a problem that is\ncomputationally intractable. Assos et al. gave an online learning algorithm in\nthis model that makes at most $C^d$ mistakes on classes of Littlestone\ndimension $d$, for some absolute unspecified constant $C > 0$. We give a novel\nalgorithm that makes at most $O(256^d)$ mistakes. Our proof is significantly\nsimpler and uses only very basic properties of the Littlestone dimension. We\nalso observe that there exists no algorithm in this model that makes at most\n$2^{d+1}-2$ mistakes. We also observe that our algorithm (as well as the\nalgorithm of Assos et al.) solves an open problem by Hasrati and Ben-David\n(ALT'23). 
Namely, it demonstrates that every class of finite Littlestone\ndimension with recursively enumerable representation admits a computable online\nlearner (that may be undefined on unrealizable samples).\n","authors":["Alexander Kozachinskiy","Tomasz Steifer"],"pdf_url":"https://arxiv.org/pdf/2308.08055v1.pdf","comment":"submitted to conference"},{"id":"http://arxiv.org/abs/2308.08053v1","updated":"2023-08-15T21:43:11Z","published":"2023-08-15T21:43:11Z","title":"Natural Evolution Strategies as a Black Box Estimator for Stochastic\n Variational Inference","summary":" Stochastic variational inference and its derivatives in the form of\nvariational autoencoders enjoy the ability to perform Bayesian inference on\nlarge datasets in an efficient manner. However, performing inference with a VAE\nrequires a certain design choice (i.e. reparameterization trick) to allow\nunbiased and low variance gradient estimation, restricting the types of models\nthat can be created. To overcome this challenge, an alternative estimator based\non natural evolution strategies is proposed. This estimator does not make\nassumptions about the kind of distributions used, allowing for the creation of\nmodels that would otherwise not have been possible under the VAE framework.\n","authors":["Ahmad Ayaz Amin"],"pdf_url":"https://arxiv.org/pdf/2308.08053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08051v1","updated":"2023-08-15T21:35:44Z","published":"2023-08-15T21:35:44Z","title":"Unbiased Decisions Reduce Regret: Adversarial Domain Adaptation for the\n Bank Loan Problem","summary":" In many real world settings binary classification decisions are made based on\nlimited data in near real-time, e.g. when assessing a loan application. We\nfocus on a class of these problems that share a common feature: the true label\nis only observed when a data point is assigned a positive label by the\nprincipal, e.g. we only find out whether an applicant defaults if we accepted\ntheir loan application. As a consequence, the false rejections become\nself-reinforcing and cause the labelled training set, that is being\ncontinuously updated by the model decisions, to accumulate bias. Prior work\nmitigates this effect by injecting optimism into the model, however this comes\nat the cost of increased false acceptance rate. We introduce adversarial\noptimism (AdOpt) to directly address bias in the training set using adversarial\ndomain adaptation. The goal of AdOpt is to learn an unbiased but informative\nrepresentation of past data, by reducing the distributional shift between the\nset of accepted data points and all data points seen thus far. AdOpt\nsignificantly exceeds state-of-the-art performance on a set of challenging\nbenchmark problems. Our experiments also provide initial evidence that the\nintroduction of adversarial domain adaptation improves fairness in this\nsetting.\n","authors":["Elena Gal","Shaun Singh","Aldo Pacchiano","Ben Walker","Terry Lyons","Jakob Foerster"],"pdf_url":"https://arxiv.org/pdf/2308.08051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08046v1","updated":"2023-08-15T21:20:24Z","published":"2023-08-15T21:20:24Z","title":"Regret Lower Bounds in Multi-agent Multi-armed Bandit","summary":" Multi-armed Bandit motivates methods with provable upper bounds on regret and\nalso the counterpart lower bounds have been extensively studied in this\ncontext. 
Recently, Multi-agent Multi-armed Bandit has gained significant\ntraction in various domains, where individual clients face bandit problems in a\ndistributed manner and the objective is the overall system performance,\ntypically measured by regret. While efficient algorithms with regret upper\nbounds have emerged, limited attention has been given to the corresponding\nregret lower bounds, except for a recent lower bound for adversarial settings,\nwhich, however, has a gap with the known upper bounds. To this end, we herein\nprovide the first comprehensive study on regret lower bounds across different\nsettings and establish their tightness. Specifically, when the graphs exhibit\ngood connectivity properties and the rewards are stochastically distributed, we\ndemonstrate a lower bound of order $O(\\log T)$ for instance-dependent bounds\nand $\\sqrt{T}$ for mean-gap independent bounds, both of which are tight. Assuming\nadversarial rewards, we establish a lower bound $O(T^{\\frac{2}{3}})$ for\nconnected graphs, thereby bridging the gap between the lower and upper bound in\nthe prior work. We also show a linear regret lower bound when the graph is\ndisconnected. While previous works have explored these settings with upper\nbounds, we provide a thorough study on tight lower bounds.\n","authors":["Mengfan Xu","Diego Klabjan"],"pdf_url":"https://arxiv.org/pdf/2308.08046v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2210.02614v4","updated":"2023-08-15T21:03:32Z","published":"2022-10-06T00:27:16Z","title":"Federated Learning with Server Learning: Enhancing Performance for\n Non-IID Data","summary":" Federated Learning (FL) has emerged as a means of distributed learning using\nlocal data stored at clients with a coordinating server. Recent studies showed\nthat FL can suffer from poor performance and slower convergence when training\ndata at clients are not independent and identically distributed. Here we\nconsider a new complementary approach to mitigating this performance\ndegradation by allowing the server to perform auxiliary learning from a small\ndataset. Our analysis and experiments show that this new approach can achieve\nsignificant improvements in both model accuracy and convergence time even when\nthe server dataset is small and its distribution differs from that of the\naggregated data from all clients.\n","authors":["Van Sy Mai","Richard J. La","Tao Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.02614v4.pdf","comment":"22 pages, 11 figures, 3 tables"},{"id":"http://arxiv.org/abs/2304.13017v2","updated":"2023-08-15T21:02:34Z","published":"2023-04-25T17:47:48Z","title":"DuETT: Dual Event Time Transformer for Electronic Health Records","summary":" Electronic health records (EHRs) recorded in hospital settings typically\ncontain a wide range of numeric time series data that is characterized by high\nsparsity and irregular observations. Effective modelling for such data must\nexploit its time series nature, the semantic relationship between different\ntypes of observations, and information in the sparsity structure of the data.\nSelf-supervised Transformers have shown outstanding performance in a variety of\nstructured tasks in NLP and computer vision. But multivariate time series data\ncontains structured relationships over two dimensions: time and recorded event\ntype, and straightforward applications of Transformers to time series data do\nnot leverage this distinct structure. 
The quadratic scaling of self-attention\nlayers can also significantly limit the input sequence length without\nappropriate input engineering. We introduce the DuETT architecture, an\nextension of Transformers designed to attend over both time and event type\ndimensions, yielding robust representations from EHR data. DuETT uses an\naggregated input where sparse time series are transformed into a regular\nsequence with fixed length; this lowers the computational complexity relative\nto previous EHR Transformer models and, more importantly, enables the use of\nlarger and deeper neural networks. When trained with self-supervised prediction\ntasks, that provide rich and informative signals for model pre-training, our\nmodel outperforms state-of-the-art deep learning models on multiple downstream\ntasks from the MIMIC-IV and PhysioNet-2012 EHR datasets.\n","authors":["Alex Labach","Aslesha Pokhrel","Xiao Shi Huang","Saba Zuberi","Seung Eun Yi","Maksims Volkovs","Tomi Poutanen","Rahul G. Krishnan"],"pdf_url":"https://arxiv.org/pdf/2304.13017v2.pdf","comment":"Accepted at MLHC 2023, camera-ready version"},{"id":"http://arxiv.org/abs/2304.01973v2","updated":"2023-08-15T20:47:51Z","published":"2023-04-04T17:31:15Z","title":"ERM++: An Improved Baseline for Domain Generalization","summary":" Multi-source Domain Generalization (DG) measures a classifier's ability to\ngeneralize to new distributions of data it was not trained on, given several\ntraining domains. While several multi-source DG methods have been proposed,\nthey incur additional complexity during training by using domain labels. Recent\nwork has shown that a well-tuned Empirical Risk Minimization (ERM) training\nprocedure, that is simply minimizing the empirical risk on the source domains,\ncan outperform most existing DG methods. We identify several key candidate\ntechniques to further improve ERM performance, such as better utilization of\ntraining data, model parameter selection, and weight-space regularization. We\ncall the resulting method ERM++, and show it significantly improves the\nperformance of DG on five multi-source datasets by over 5% compared to standard\nERM, and beats state-of-the-art despite being less computationally expensive.\nAdditionally, we demonstrate the efficacy of ERM++ on the WILDS-FMOW dataset, a\nchallenging DG benchmark. We hope that ERM++ becomes a strong baseline for\nfuture DG research. Code is released at\nhttps://github.com/piotr-teterwak/erm_plusplus.\n","authors":["Piotr Teterwak","Kuniaki Saito","Theodoros Tsiligkaridis","Kate Saenko","Bryan A. Plummer"],"pdf_url":"https://arxiv.org/pdf/2304.01973v2.pdf","comment":"An improved baseline for Domain Generalization"},{"id":"http://arxiv.org/abs/2308.08030v1","updated":"2023-08-15T20:40:42Z","published":"2023-08-15T20:40:42Z","title":"Classification of Data Generated by Gaussian Mixture Models Using Deep\n ReLU Networks","summary":" This paper studies the binary classification of unbounded data from ${\\mathbb\nR}^d$ generated under Gaussian Mixture Models (GMMs) using deep ReLU neural\nnetworks. We obtain $\\unicode{x2013}$ for the first time $\\unicode{x2013}$\nnon-asymptotic upper bounds and convergence rates of the excess risk (excess\nmisclassification error) for the classification without restrictions on model\nparameters. The convergence rates we derive do not depend on dimension $d$,\ndemonstrating that deep ReLU networks can overcome the curse of dimensionality\nin classification. 
While the majority of existing generalization analysis of\nclassification algorithms relies on a bounded domain, we consider an unbounded\ndomain by leveraging the analyticity and fast decay of Gaussian distributions.\nTo facilitate our analysis, we give a novel approximation error bound for\ngeneral analytic functions using ReLU networks, which may be of independent\ninterest. Gaussian distributions can be adopted nicely to model data arising in\napplications, e.g., speeches, images, and texts; our results provide a\ntheoretical verification of the observed efficiency of deep neural networks in\npractical classification problems.\n","authors":["Tian-Yi Zhou","Xiaoming Huo"],"pdf_url":"https://arxiv.org/pdf/2308.08030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08029v1","updated":"2023-08-15T20:39:23Z","published":"2023-08-15T20:39:23Z","title":"Planning to Learn: A Novel Algorithm for Active Learning during\n Model-Based Planning","summary":" Active Inference is a recent framework for modeling planning under\nuncertainty. Empirical and theoretical work have now begun to evaluate the\nstrengths and weaknesses of this approach and how it might be improved. A\nrecent extension - the sophisticated inference (SI) algorithm - improves\nperformance on multi-step planning problems through recursive decision tree\nsearch. However, little work to date has been done to compare SI to other\nestablished planning algorithms. SI was also developed with a focus on\ninference as opposed to learning. The present paper has two aims. First, we\ncompare performance of SI to Bayesian reinforcement learning (RL) schemes\ndesigned to solve similar problems. Second, we present an extension of SI -\nsophisticated learning (SL) - that more fully incorporates active learning\nduring planning. SL maintains beliefs about how model parameters would change\nunder the future observations expected under each policy. This allows a form of\ncounterfactual retrospective inference in which the agent considers what could\nbe learned from current or past observations given different future\nobservations. To accomplish these aims, we make use of a novel, biologically\ninspired environment designed to highlight the problem structure for which SL\noffers a unique solution. Here, an agent must continually search for available\n(but changing) resources in the presence of competing affordances for\ninformation gain. Our simulations show that SL outperforms all other algorithms\nin this context - most notably, Bayes-adaptive RL and upper confidence bound\nalgorithms, which aim to solve multi-step planning problems using similar\nprinciples (i.e., directed exploration and counterfactual reasoning). These\nresults provide added support for the utility of Active Inference in solving\nthis class of biologically-relevant problems and offer added tools for testing\nhypotheses about human cognition.\n","authors":["Rowan Hodson","Bruce Bassett","Charel van Hoof","Benjamin Rosman","Mark Solms","Jonathan P. Shock","Ryan Smith"],"pdf_url":"https://arxiv.org/pdf/2308.08029v1.pdf","comment":"31 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.08025v1","updated":"2023-08-15T20:30:52Z","published":"2023-08-15T20:30:52Z","title":"Potential Energy Advantage of Quantum Economy","summary":" Energy cost is increasingly crucial in the modern computing industry with the\nwide deployment of large-scale machine learning models and language models. 
For\nthe firms that provide computing services, low energy consumption is important\nboth from the perspective of their own market growth and the government's\nregulations. In this paper, we study the energy benefits of quantum computing\nvis-a-vis classical computing. Deviating from the conventional notion of\nquantum advantage based solely on computational complexity, we redefine\nadvantage in an energy efficiency context. Through a Cournot competition model\nconstrained by energy usage, we demonstrate quantum computing firms can\noutperform classical counterparts in both profitability and energy efficiency\nat Nash equilibrium. Therefore quantum computing may represent a more\nsustainable pathway for the computing industry. Moreover, we discover that the\nenergy benefits of quantum computing economies are contingent on large-scale\ncomputation. Based on real physical parameters, we further illustrate the scale\nof operation necessary for realizing this energy efficiency advantage.\n","authors":["Junyu Liu","Hansheng Jiang","Zuo-Jun Max Shen"],"pdf_url":"https://arxiv.org/pdf/2308.08025v1.pdf","comment":"23 pages, many figures"},{"id":"http://arxiv.org/abs/2308.08017v1","updated":"2023-08-15T20:17:26Z","published":"2023-08-15T20:17:26Z","title":"Active Inverse Learning in Stackelberg Trajectory Games","summary":" Game-theoretic inverse learning is the problem of inferring the players'\nobjectives from their actions. We formulate an inverse learning problem in a\nStackelberg game between a leader and a follower, where each player's action is\nthe trajectory of a dynamical system. We propose an active inverse learning\nmethod for the leader to infer which hypothesis among a finite set of\ncandidates describes the follower's objective function. Instead of using\npassively observed trajectories like existing methods, the proposed method\nactively maximizes the differences in the follower's trajectories under\ndifferent hypotheses to accelerate the leader's inference. We demonstrate the\nproposed method in a receding-horizon repeated trajectory game. Compared with\nuniformly random inputs, the leader inputs provided by the proposed method\naccelerate the convergence of the probability of different hypotheses\nconditioned on the follower's trajectory by orders of magnitude.\n","authors":["Yue Yu","Jacob Levy","Negar Mehr","David Fridovich-Keil","Ufuk Topcu"],"pdf_url":"https://arxiv.org/pdf/2308.08017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11562v4","updated":"2023-08-15T20:01:26Z","published":"2023-01-27T06:52:04Z","title":"Is My Prediction Arbitrary? The Confounding Effects of Variance in Fair\n Classification Benchmarks","summary":" Variance in predictions across different trained models is a significant,\nunder-explored source of error in fair classification. In practice, the\nvariance on some data examples is so large that decisions can be effectively\narbitrary. To investigate this problem, we take an experimental approach and\nmake four overarching contributions: We 1) Define a metric called\nself-consistency, derived from variance, which we use as a proxy for measuring\nand reducing arbitrariness; 2) Develop an ensembling algorithm that abstains\nfrom classification when a prediction would be arbitrary; 3) Conduct the\nlargest to-date empirical study of the role of variance (vis-a-vis\nself-consistency and arbitrariness) in fair classification; and, 4) Release a\ntoolkit that makes the US Home Mortgage Disclosure Act (HMDA) datasets easily\nusable for future research. 
Altogether, our experiments reveal shocking\ninsights about the reliability of conclusions on benchmark datasets. Most\nfairness classification benchmarks are close-to-fair when taking into account\nthe amount of arbitrariness present in predictions -- before we even try to\napply common fairness interventions. This finding calls into question the\npractical utility of common algorithmic fairness methods, and in turn suggests\nthat we should fundamentally reconsider how we choose to measure fairness in\nmachine learning.\n","authors":["A. Feder Cooper","Katherine Lee","Madiha Zahrah Choksi","Solon Barocas","Christopher De Sa","James Grimmelmann","Jon Kleinberg","Siddhartha Sen","Baobao Zhang"],"pdf_url":"https://arxiv.org/pdf/2301.11562v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08010v1","updated":"2023-08-15T19:50:07Z","published":"2023-08-15T19:50:07Z","title":"GRINN: A Physics-Informed Neural Network for solving hydrodynamic\n systems in the presence of self-gravity","summary":" Modeling self-gravitating gas flows is essential to answering many\nfundamental questions in astrophysics. This spans many topics including\nplanet-forming disks, star-forming clouds, galaxy formation, and the\ndevelopment of large-scale structures in the Universe. However, the nonlinear\ninteraction between gravity and fluid dynamics offers a formidable challenge to\nsolving the resulting time-dependent partial differential equations (PDEs) in\nthree dimensions (3D). By leveraging the universal approximation capabilities\nof a neural network within a mesh-free framework, physics informed neural\nnetworks (PINNs) offer a new way of addressing this challenge. We introduce the\ngravity-informed neural network (GRINN), a PINN-based code, to simulate 3D\nself-gravitating hydrodynamic systems. Here, we specifically study\ngravitational instability and wave propagation in an isothermal gas. Our\nresults match a linear analytic solution to within 1\\% in the linear regime and\na conventional grid code solution to within 5\\% as the disturbance grows into\nthe nonlinear regime. We find that the computation time of the GRINN does not\nscale with the number of dimensions. This is in contrast to the scaling of the\ngrid-based code for the hydrodynamic and self-gravity calculations as the\nnumber of dimensions is increased. Our results show that the GRINN computation\ntime is longer than the grid code in one- and two- dimensional calculations but\nis an order of magnitude lesser than the grid code in 3D with similar accuracy.\nPhysics-informed neural networks like GRINN thus show promise for advancing our\nability to model 3D astrophysical flows.\n","authors":["Sayantan Auddy","Ramit Dey","Neal J. Turner","Shantanu Basu"],"pdf_url":"https://arxiv.org/pdf/2308.08010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08003v1","updated":"2023-08-15T19:36:19Z","published":"2023-08-15T19:36:19Z","title":"BI-LAVA: Biocuration with Hierarchical Image Labeling through Active\n Learning and Visual Analysis","summary":" In the biomedical domain, taxonomies organize the acquisition modalities of\nscientific images in hierarchical structures. Such taxonomies leverage large\nsets of correct image labels and provide essential information about the\nimportance of a scientific publication, which could then be used in biocuration\ntasks. 
However, the hierarchical nature of the labels, the overhead of\nprocessing images, the absence or incompleteness of labeled data, and the\nexpertise required to label this type of data impede the creation of useful\ndatasets for biocuration. From a multi-year collaboration with biocurators and\ntext-mining researchers, we derive an iterative visual analytics and active\nlearning strategy to address these challenges. We implement this strategy in a\nsystem called BI-LAVA Biocuration with Hierarchical Image Labeling through\nActive Learning and Visual Analysis. BI-LAVA leverages a small set of image\nlabels, a hierarchical set of image classifiers, and active learning to help\nmodel builders deal with incomplete ground-truth labels, target a hierarchical\ntaxonomy of image modalities, and classify a large pool of unlabeled images.\nBI-LAVA's front end uses custom encodings to represent data distributions,\ntaxonomies, image projections, and neighborhoods of image thumbnails, which\nhelp model builders explore an unfamiliar image dataset and taxonomy and\ncorrect and generate labels. An evaluation with machine learning practitioners\nshows that our mixed human-machine approach successfully supports domain\nexperts in understanding the characteristics of classes within the taxonomy, as\nwell as validating and improving data quality in labeled and unlabeled\ncollections.\n","authors":["Juan Trelles","Andrew Wentzel","William Berrios","G. Elisabeta Marai"],"pdf_url":"https://arxiv.org/pdf/2308.08003v1.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.07250v2","updated":"2023-08-15T19:08:36Z","published":"2023-08-14T16:34:47Z","title":"LCE: An Augmented Combination of Bagging and Boosting in Python","summary":" lcensemble is a high-performing, scalable and user-friendly Python package\nfor the general tasks of classification and regression. The package implements\nLocal Cascade Ensemble (LCE), a machine learning method that further enhances\nthe prediction performance of the current state-of-the-art methods Random\nForest and XGBoost. LCE combines their strengths and adopts a complementary\ndiversification approach to obtain a better generalizing predictor. The package\nis compatible with scikit-learn, therefore it can interact with scikit-learn\npipelines and model selection tools. It is distributed under the Apache 2.0\nlicense, and its source code is available at\nhttps://github.com/LocalCascadeEnsemble/LCE.\n","authors":["Kevin Fauvel","Élisa Fromont","Véronique Masson","Philippe Faverdin","Alexandre Termier"],"pdf_url":"https://arxiv.org/pdf/2308.07250v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12670v3","updated":"2023-08-15T18:55:42Z","published":"2022-01-29T21:48:48Z","title":"SMGRL: Scalable Multi-resolution Graph Representation Learning","summary":" Graph convolutional networks (GCNs) allow us to learn topologically-aware\nnode embeddings, which can be useful for classification or link prediction.\nHowever, they are unable to capture long-range dependencies between nodes\nwithout adding additional layers -- which in turn leads to over-smoothing and\nincreased time and space complexity. Further, the complex dependencies between\nnodes make mini-batching challenging, limiting their applicability to large\ngraphs. We propose a Scalable Multi-resolution Graph Representation Learning\n(SMGRL) framework that enables us to learn multi-resolution node embeddings\nefficiently. Our framework is model-agnostic and can be applied to any existing\nGCN model. 
We dramatically reduce training costs by training only on a\nreduced-dimension coarsening of the original graph, then exploit\nself-similarity to apply the resulting algorithm at multiple resolutions. The\nresulting multi-resolution embeddings can be aggregated to yield high-quality\nnode embeddings that capture both long- and short-range dependencies. Our\nexperiments show that this leads to improved classification accuracy, without\nincurring high computational costs.\n","authors":["Reza Namazi","Elahe Ghalebi","Sinead Williamson","Hamidreza Mahyar"],"pdf_url":"https://arxiv.org/pdf/2201.12670v3.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2308.07983v1","updated":"2023-08-15T18:32:00Z","published":"2023-08-15T18:32:00Z","title":"Monte Carlo guided Diffusion for Bayesian linear inverse problems","summary":" Ill-posed linear inverse problems that combine knowledge of the forward\nmeasurement model with prior models arise frequently in various applications,\nfrom computational photography to medical imaging. Recent research has focused\non solving these problems with score-based generative models (SGMs) that\nproduce perceptually plausible images, especially in inpainting problems. In\nthis study, we exploit the particular structure of the prior defined in the SGM\nto formulate recovery in a Bayesian framework as a Feynman--Kac model adapted\nfrom the forward diffusion model used to construct score-based diffusion. To\nsolve this Feynman--Kac problem, we propose the use of Sequential Monte Carlo\nmethods. The proposed algorithm, MCGdiff, is shown to be theoretically grounded\nand we provide numerical simulations showing that it outperforms competing\nbaselines when dealing with ill-posed inverse problems.\n","authors":["Gabriel Cardoso","Yazid Janati El Idrissi","Sylvain Le Corff","Eric Moulines"],"pdf_url":"https://arxiv.org/pdf/2308.07983v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2308.07980v1","updated":"2023-08-15T18:28:22Z","published":"2023-08-15T18:28:22Z","title":"An Adaptive Approach for Probabilistic Wind Power Forecasting Based on\n Meta-Learning","summary":" This paper studies an adaptive approach for probabilistic wind power\nforecasting (WPF) including offline and online learning procedures. In the\noffline learning stage, a base forecast model is trained via inner and outer\nloop updates of meta-learning, which endows the base forecast model with\nexcellent adaptability to different forecast tasks, i.e., probabilistic WPF\nwith different lead times or locations. In the online learning stage, the base\nforecast model is applied to online forecasting combined with incremental\nlearning techniques. On this basis, the online forecast takes full advantage of\nrecent information and the adaptability of the base forecast model. Two\napplications are developed based on our proposed approach concerning\nforecasting with different lead times (temporal adaptation) and forecasting for\nnewly established wind farms (spatial adaptation), respectively. Numerical\ntests were conducted on real-world wind power data sets. 
Simulation results\nvalidate the advantages in adaptivity of the proposed methods compared with\nexisting alternatives.\n","authors":["Zichao Meng","Ye Guo","Hongbin Sun"],"pdf_url":"https://arxiv.org/pdf/2308.07980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07971v1","updated":"2023-08-15T18:18:34Z","published":"2023-08-15T18:18:34Z","title":"MultiSChuBERT: Effective Multimodal Fusion for Scholarly Document\n Quality Prediction","summary":" Automatic assessment of the quality of scholarly documents is a difficult\ntask with high potential impact. Multimodality, in particular the addition of\nvisual information next to text, has been shown to improve the performance on\nscholarly document quality prediction (SDQP) tasks. We propose the multimodal\npredictive model MultiSChuBERT. It combines a textual model based on chunking\nfull paper text and aggregating computed BERT chunk-encodings (SChuBERT), with\na visual model based on Inception V3. Our work contributes to the current\nstate-of-the-art in SDQP in three ways. First, we show that the method of\ncombining visual and textual embeddings can substantially influence the\nresults. Second, we demonstrate that gradual unfreezing of the weights of the\nvisual sub-model reduces its tendency to overfit the data, improving results.\nThird, we show the retained benefit of multimodality when replacing standard\nBERT$_{\\textrm{BASE}}$ embeddings with more recent state-of-the-art text\nembedding models.\n Using BERT$_{\\textrm{BASE}}$ embeddings, on the (log) number of citations\nprediction task with the ACL-BiblioMetry dataset, our MultiSChuBERT\n(text+visual) model obtains an $R^{2}$ score of 0.454 compared to 0.432 for the\nSChuBERT (text only) model. Similar improvements are obtained on the PeerRead\naccept/reject prediction task. In our experiments using SciBERT, scincl,\nSPECTER and SPECTER2.0 embeddings, we show that each of these tailored\nembeddings adds further improvements over the standard BERT$_{\\textrm{BASE}}$\nembeddings, with the SPECTER2.0 embeddings performing best.\n","authors":["Gideon Maillette de Buy Wenniger","Thomas van Dongen","Lambert Schomaker"],"pdf_url":"https://arxiv.org/pdf/2308.07971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10310v2","updated":"2023-08-15T18:03:07Z","published":"2023-03-18T02:42:18Z","title":"Pseudo Supervised Metrics: Evaluating Unsupervised Image to Image\n Translation Models In Unsupervised Cross-Domain Classification Frameworks","summary":" The ability to classify images accurately and efficiently is dependent on\nhaving access to large labeled datasets and testing on data from the same\ndomain that the model is trained on. Classification becomes more challenging\nwhen dealing with new data from a different domain, where collecting a large\nlabeled dataset and training a new classifier from scratch is time-consuming,\nexpensive, and sometimes infeasible or impossible. Cross-domain classification\nframeworks were developed to handle this data domain shift problem by utilizing\nunsupervised image-to-image (UI2I) translation models to translate an input\nimage from the unlabeled domain to the labeled domain. The problem with these\nunsupervised models lies in their unsupervised nature. For lack of annotations,\nit is not possible to use the traditional supervised metrics to evaluate these\ntranslation models to pick the best-saved checkpoint model. 
In this paper, we\nintroduce a new method called Pseudo Supervised Metrics that was designed\nspecifically to support cross-domain classification applications contrary to\nother typically used metrics such as the FID which was designed to evaluate the\nmodel in terms of the quality of the generated image from a human-eye\nperspective. We show that our metric not only outperforms unsupervised metrics\nsuch as the FID, but is also highly correlated with the true supervised\nmetrics, robust, and explainable. Furthermore, we demonstrate that it can be\nused as a standard metric for future research in this field by applying it to a\ncritical real-world problem (the boiling crisis problem).\n","authors":["Firas Al-Hindawi","Md Mahfuzur Rahman Siddiquee","Teresa Wu","Han Hu","Ying Sun"],"pdf_url":"https://arxiv.org/pdf/2303.10310v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2212.09107"}],"Multimedia":[{"id":"http://arxiv.org/abs/2211.16198v4","updated":"2023-08-15T13:31:15Z","published":"2022-11-28T16:48:41Z","title":"SuS-X: Training-Free Name-Only Transfer of Vision-Language Models","summary":" Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet\neffective way to train large-scale vision-language models. CLIP demonstrates\nimpressive zero-shot classification and retrieval on diverse downstream tasks.\nHowever, to leverage its full potential, fine-tuning still appears to be\nnecessary. Fine-tuning the entire CLIP model can be resource-intensive and\nunstable. Moreover, recent methods that aim to circumvent this need for\nfine-tuning still require access to images from the target distribution. In\nthis paper, we pursue a different approach and explore the regime of\ntraining-free \"name-only transfer\" in which the only knowledge we possess about\nthe downstream task comprises the names of downstream target categories. We\npropose a novel method, SuS-X, consisting of two key building blocks -- SuS and\nTIP-X, that requires neither intensive fine-tuning nor costly labelled data.\nSuS-X achieves state-of-the-art zero-shot classification results on 19\nbenchmark datasets. We further show the utility of TIP-X in the training-free\nfew-shot setting, where we again achieve state-of-the-art results over strong\ntraining-free baselines. Code is available at\nhttps://github.com/vishaal27/SuS-X.\n","authors":["Vishaal Udandarao","Ankush Gupta","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2211.16198v4.pdf","comment":"Accepted at ICCV2023"},{"id":"http://arxiv.org/abs/2308.07733v1","updated":"2023-08-15T12:17:46Z","published":"2023-08-15T12:17:46Z","title":"Dynamic Low-Rank Instance Adaptation for Universal Neural Image\n Compression","summary":" The latest advancements in neural image compression show great potential in\nsurpassing the rate-distortion performance of conventional standard codecs.\nNevertheless, there exists an indelible domain gap between the datasets\nutilized for training (i.e., natural images) and those utilized for inference\n(e.g., artistic images). Our proposal involves a low-rank adaptation approach\naimed at addressing the rate-distortion drop observed in out-of-domain\ndatasets. Specifically, we perform low-rank matrix decomposition to update\ncertain adaptation parameters of the client's decoder. These updated\nparameters, along with image latents, are encoded into a bitstream and\ntransmitted to the decoder in practical scenarios. 
Due to the low-rank\nconstraint imposed on the adaptation parameters, the resulting bit rate\noverhead is small. Furthermore, the bit rate allocation of low-rank adaptation\nis \\emph{non-trivial}, considering the diverse inputs require varying\nadaptation bitstreams. We thus introduce a dynamic gating network on top of the\nlow-rank adaptation method, in order to decide which decoder layer should\nemploy adaptation. The dynamic adaptation network is optimized end-to-end using\nrate-distortion loss. Our proposed method exhibits universality across diverse\nimage datasets. Extensive results demonstrate that this paradigm significantly\nmitigates the domain gap, surpassing non-adaptive methods with an average\nBD-rate improvement of approximately $19\\%$ across out-of-domain images.\nFurthermore, it outperforms the most advanced instance adaptive methods by\nroughly $5\\%$ BD-rate. Ablation studies confirm our method's ability to\nuniversally enhance various image compression architectures.\n","authors":["Yue Lv","Jinxi Xiang","Jun Zhang","Wenming Yang","Xiao Han","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2308.07733v1.pdf","comment":"Accepted by ACM MM 2023, 13 pages, 12 figures"},{"id":"http://arxiv.org/abs/2106.14136v2","updated":"2023-08-15T08:29:31Z","published":"2021-06-27T03:54:36Z","title":"Text-to-Audio based Event Detection Towards Intelligent Vehicle Road\n Cooperation","summary":" In this paper, we target at the text-to-audio grounding issue, namely,\ngrounding the segments of the sound event described by a natural language query\nin the untrimmed audio. This is a newly proposed but challenging audio-language\ntask, since it requires to not only precisely localize all the on- and off-sets\nof the desired segments in the audio, but also perform comprehensive acoustic\nand linguistic understandings and reason the multimodal interactions between\nthe audio and query. To tackle those problems, the existing methods often\nholistically treat the query as a single unit by a global query representation.\nWe argue that this approach suffers from several limitations. Motivated by the\nabove considerations, we propose a novel Cross-modal Graph Interaction (CGI)\nmodel, which comprehensively models the comprehensive relations between the\nwords in a query through a novel language graph. To capture the fine-grained\ninteractions between the audio and query, a cross-modal attention module is\nintroduced to assign higher weights to the keywords with more important\nsemantics and generate the snippet-specific query representations. Furthermore,\nwe design a cross-gating module to emphasize the crucial parts and weaken the\nirrelevant ones in the audio and query. We extensively evaluate the proposed\nCGI model on the public Audiogrounding dataset with significant improvements\nover several state-of-the-art methods. 
The ablation study demonstrates the\nconsistent effectiveness of different modules in our model.\n","authors":["Haoyu Tang","Yunxiao Wang","Jihua Zhu","Shuaike Zhang","Mingzhu Xu","Yupeng Hu","Qinghai Zheng"],"pdf_url":"https://arxiv.org/pdf/2106.14136v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.07622v1","updated":"2023-08-15T08:13:14Z","published":"2023-08-15T08:13:14Z","title":"EMID: An Emotional Aligned Dataset in Audio-Visual Modality","summary":" In this paper, we propose Emotionally paired Music and Image Dataset (EMID),\na novel dataset designed for the emotional matching of music and images, to\nfacilitate auditory-visual cross-modal tasks such as generation and retrieval.\nUnlike existing approaches that primarily focus on semantic correlations or\nroughly divided emotional relations, EMID emphasizes the significance of\nemotional consistency between music and images using an advanced 13-dimension\nemotional model. By incorporating emotional alignment into the dataset, it aims\nto establish pairs that closely align with human perceptual understanding,\nthereby raising the performance of auditory-visual cross-modal tasks. We also\ndesign a supplemental module named EMI-Adapter to optimize existing cross-modal\nalignment methods. To validate the effectiveness of the EMID, we conduct a\npsychological experiment, which has demonstrated that considering the emotional\nrelationship between the two modalities effectively improves the accuracy of\nmatching from an abstract perspective. This research lays the foundation for future\ncross-modal research in domains such as psychotherapy and contributes to\nadvancing the understanding and utilization of emotions in cross-modal\nalignment. The EMID dataset is available at https://github.com/ecnu-aigc/EMID.\n","authors":["Jialing Zou","Jiahao Mei","Guangze Ye","Tianyu Huai","Qiwei Shen","Daoguo Dong"],"pdf_url":"https://arxiv.org/pdf/2308.07622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07605v1","updated":"2023-08-15T07:20:22Z","published":"2023-08-15T07:20:22Z","title":"SGDiff: A Style Guided Diffusion Model for Fashion Synthesis","summary":" This paper reports on the development of \\textbf{a novel style guided\ndiffusion model (SGDiff)} which overcomes certain weaknesses inherent in\nexisting models for image synthesis. The proposed SGDiff combines image\nmodality with a pretrained text-to-image diffusion model to facilitate creative\nfashion image synthesis. It addresses the limitations of text-to-image\ndiffusion models by incorporating supplementary style guidance, substantially\nreducing training costs, and overcoming the difficulties of controlling\nsynthesized styles with text-only inputs. This paper also introduces a new\ndataset -- SG-Fashion, specifically designed for fashion image synthesis\napplications, offering high-resolution images and an extensive range of garment\ncategories. By means of a comprehensive ablation study, we examine the\napplication of classifier-free guidance to a variety of conditions and validate\nthe effectiveness of the proposed model for generating fashion images of the\ndesired categories, product attributes, and styles. The contributions of this\npaper include a novel classifier-free guidance method for multi-modal feature\nfusion, a comprehensive dataset for fashion image synthesis application, a\nthorough investigation on conditioned text-to-image synthesis, and valuable\ninsights for future research in the text-to-image synthesis domain. 
The code\nand dataset are available at: \\url{https://github.com/taited/SGDiff}.\n","authors":["Zhengwentai Sun","Yanghong Zhou","Honghong He","P. Y. Mok"],"pdf_url":"https://arxiv.org/pdf/2308.07605v1.pdf","comment":"Accepted by ACM MM'23"},{"id":"http://arxiv.org/abs/2308.07593v1","updated":"2023-08-15T06:38:38Z","published":"2023-08-15T06:38:38Z","title":"AKVSR: Audio Knowledge Empowered Visual Speech Recognition by\n Compressing Audio Knowledge of a Pretrained Model","summary":" Visual Speech Recognition (VSR) is the task of predicting spoken words from\nsilent lip movements. VSR is regarded as a challenging task because of the\ninsufficient information on lip movements. In this paper, we propose an Audio\nKnowledge empowered Visual Speech Recognition framework (AKVSR) to complement\nthe insufficient speech information of visual modality by using audio modality.\nDifferent from the previous methods, the proposed AKVSR 1) utilizes rich audio\nknowledge encoded by a large-scale pretrained audio model, 2) saves the\nlinguistic information of audio knowledge in compact audio memory by discarding\nthe non-linguistic information from the audio through quantization, and 3)\nincludes an Audio Bridging Module which can find the best-matched audio features\nfrom the compact audio memory, which makes our training possible without audio\ninputs, once the compact audio memory is composed. We validate the\neffectiveness of the proposed method through extensive experiments, and achieve\nnew state-of-the-art performances on the widely-used datasets, LRS2 and LRS3.\n","authors":["Jeong Hun Yeo","Minsu Kim","Jeongsoo Choi","Dae Hoe Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2308.07593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07578v1","updated":"2023-08-15T05:33:48Z","published":"2023-08-15T05:33:48Z","title":"Understanding User Behavior in Volumetric Video Watching: Dataset,\n Analysis and Prediction","summary":" Volumetric video has emerged as a new attractive video paradigm in recent years\nsince it provides an immersive and interactive 3D viewing experience with six\ndegree-of-freedom (DoF). Unlike traditional 2D or panoramic videos, volumetric\nvideos require dense point clouds, voxels, meshes, or huge neural models to\ndepict volumetric scenes, which results in a prohibitively high bandwidth\nburden for video delivery. Users' behavior analysis, especially the viewport\nand gaze analysis, then plays a significant role in prioritizing the content\nstreaming within users' viewport and degrading the remaining content to\nmaximize user QoE with limited bandwidth. Although understanding user behavior\nis crucial, to the best of our knowledge, there are no available 3D\nvolumetric video viewing datasets containing fine-grained user interactivity\nfeatures, not to mention further analysis and behavior prediction. In this\npaper, we for the first time release a volumetric video viewing behavior\ndataset, with a large scale, multiple dimensions, and diverse conditions. We\nconduct an in-depth analysis to understand user behaviors when viewing\nvolumetric videos. Interesting findings on user viewport, gaze, and motion\npreference related to different videos and users are revealed. We finally\ndesign a transformer-based viewport prediction model that fuses the features of\nboth gaze and motion, which is able to achieve high accuracy at various\nconditions. Our prediction model is expected to further benefit volumetric\nvideo streaming optimization. 
Our dataset, along with the corresponding\nvisualization tools, is accessible at\nhttps://cuhksz-inml.github.io/user-behavior-in-vv-watching/\n","authors":["Kaiyuan Hu","Haowen Yang","Yili Jin","Junhua Liu","Yongting Chen","Miao Zhang","Fangxin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07970v1","updated":"2023-08-15T18:17:16Z","published":"2023-08-15T18:17:16Z","title":"Introducing a New Evaluation Criteria for EMD-Base Steganography Method","summary":" Steganography is a technique to hide the presence of secret communication.\nWhen one of the communication elements is under the influence of the enemy, it\ncan be used. The main measure to evaluate steganography methods in a certain\ncapacity is security. Therefore, in a certain capacity, reducing the amount of\nchanges in the cover media creates a higher embedding efficiency and thus more\nsecurity for a steganography method. Mostly, security and capacity are in\nconflict with each other: the increase of one leads to the decrease of the\nother. The presence of a single criterion that represents security and capacity\nat the same time would be useful in comparing steganography methods. EMD and the\nrelevant methods are a group of steganography techniques, which optimize the\namount of changes resulting from embedding (security). The present paper is\naimed at providing an evaluation criterion for this group of steganography\nmethods. In this study, after a general review and comparison of EMD-based\nsteganography techniques, we present a method to compare them exactly, from the\nperspective of embedding efficiency. First, a formula is presented to determine\nthe value of embedding efficiency, which indicates the effect of one or more\nchanges on one or more pixels. The results demonstrate that the proposed\nembedding efficiency formula shows the performance of the methods better when\nseveral changes are made on a pixel compared to the existing criteria. In the\nsecond step, we have obtained an upper bound, which determines the best\nefficiency for each certain capacity. Finally, based on the introduced bound,\nanother evaluation criterion for a better comparison of the methods is\npresented.\n","authors":["Hanieh Rafiee","Mojtaba Mahdavi","AhmadReza NaghshNilchi"],"pdf_url":"https://arxiv.org/pdf/2308.07970v1.pdf","comment":null}]},"2023-08-16T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.15780v2","updated":"2023-08-16T17:59:07Z","published":"2023-07-24T18:47:38Z","title":"LLM-Rec: Personalized Recommendation via Prompting Large Language Models","summary":" We investigate various prompting strategies for enhancing personalized\nrecommendation performance with large language models (LLMs) through input\naugmentation. Our proposed approach, termed LLM-Rec, encompasses four distinct\nprompting strategies: (1) basic prompting, (2) recommendation-driven prompting,\n(3) engagement-guided prompting, and (4) recommendation-driven +\nengagement-guided prompting. Our empirical experiments show that incorporating\nthe augmented input text generated by LLM leads to improved recommendation\nperformance. Recommendation-driven and engagement-guided prompting strategies\nare found to elicit LLM's understanding of global and local item\ncharacteristics. 
This finding highlights the importance of leveraging diverse\nprompts and input augmentation techniques to enhance the recommendation\ncapabilities with LLMs.\n","authors":["Hanjia Lyu","Song Jiang","Hanqing Zeng","Qifan Wang","Si Zhang","Ren Chen","Chris Leung","Jiajie Tang","Yinglong Xia","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2307.15780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04306v2","updated":"2023-08-16T17:44:59Z","published":"2023-06-07T10:11:09Z","title":"Allophant: Cross-lingual Phoneme Recognition with Articulatory\n Attributes","summary":" This paper proposes Allophant, a multilingual phoneme recognizer. It requires\nonly a phoneme inventory for cross-lingual transfer to a target language,\nallowing for low-resource recognition. The architecture combines a\ncompositional phone embedding approach with individually supervised phonetic\nattribute classifiers in a multi-task architecture. We also introduce\nAllophoible, an extension of the PHOIBLE database. When combined with a\ndistance based mapping approach for grapheme-to-phoneme outputs, it allows us\nto train on PHOIBLE inventories directly. By training and evaluating on 34\nlanguages, we found that the addition of multi-task learning improves the\nmodel's capability of being applied to unseen phonemes and phoneme inventories.\nOn supervised languages we achieve phoneme error rate improvements of 11\npercentage points (pp.) compared to a baseline without multi-task learning.\nEvaluation of zero-shot transfer on 84 languages yielded a decrease in PER of\n2.63 pp. over the baseline.\n","authors":["Kevin Glocker","Aaricia Herygers","Munir Georges"],"pdf_url":"https://arxiv.org/pdf/2306.04306v2.pdf","comment":"5 pages, 2 figures, 2 tables, accepted to INTERSPEECH 2023; published\n version"},{"id":"http://arxiv.org/abs/2308.08493v1","updated":"2023-08-16T16:48:57Z","published":"2023-08-16T16:48:57Z","title":"Time Travel in LLMs: Tracing Data Contamination in Large Language Models","summary":" Data contamination, i.e., the presence of test data from downstream tasks in\nthe training data of large language models (LLMs), is a potential major issue\nin understanding LLMs' effectiveness on other tasks. We propose a\nstraightforward yet effective method for identifying data contamination within\nLLMs. At its core, our approach starts by identifying potential contamination\nin individual instances that are drawn from a small random sample; using this\ninformation, our approach then assesses if an entire dataset partition is\ncontaminated. To estimate contamination of individual instances, we employ\n\"guided instruction:\" a prompt consisting of the dataset name, partition type,\nand the initial segment of a reference instance, asking the LLM to complete it.\nAn instance is flagged as contaminated if the LLM's output either exactly or\nclosely matches the latter segment of the reference. To understand if an entire\npartition is contaminated, we propose two ideas. The first idea marks a dataset\npartition as contaminated if the average overlap score with the reference\ninstances (as measured by ROUGE or BLEURT) is statistically significantly\nbetter with the guided instruction vs. a general instruction that does not\ninclude the dataset and partition name. The second idea marks a dataset as\ncontaminated if a classifier based on GPT-4 with in-context learning prompting\nmarks multiple instances as contaminated. 
Our best method achieves an accuracy\nbetween 92% and 100% in detecting if an LLM is contaminated with seven\ndatasets, containing train and test/validation partitions, when contrasted with\nmanual evaluation by human expert. Further, our findings indicate that GPT-4 is\ncontaminated with AG News, WNLI, and XSum datasets.\n","authors":["Shahriar Golchin","Mihai Surdeanu"],"pdf_url":"https://arxiv.org/pdf/2308.08493v1.pdf","comment":"v1 preprint"},{"id":"http://arxiv.org/abs/2304.01752v2","updated":"2023-08-16T15:54:54Z","published":"2023-04-04T12:42:29Z","title":"Black Box Few-Shot Adaptation for Vision-Language models","summary":" Vision-Language (V-L) models trained with contrastive learning to align the\nvisual and language modalities have been shown to be strong few-shot learners.\nSoft prompt learning is the method of choice for few-shot downstream adaption\naiming to bridge the modality gap caused by the distribution shift induced by\nthe new domain. While parameter-efficient, prompt learning still requires\naccess to the model weights and can be computationally infeasible for large\nmodels with billions of parameters. To address these shortcomings, in this\nwork, we describe a black-box method for V-L few-shot adaptation that (a)\noperates on pre-computed image and text features and hence works without access\nto the model's weights, (b) it is orders of magnitude faster at training time,\n(c) it is amenable to both supervised and unsupervised training, and (d) it can\nbe even used to align image and text features computed from uni-modal models.\nTo achieve this, we propose Linear Feature Alignment (LFA), a simple linear\napproach for V-L re-alignment in the target domain. LFA is initialized from a\nclosed-form solution to a least-squares problem and then it is iteratively\nupdated by minimizing a re-ranking loss. Despite its simplicity, our approach\ncan even surpass soft-prompt learning methods as shown by extensive experiments\non 11 image and 2 video datasets.\n","authors":["Yassine Ouali","Adrian Bulat","Brais Martinez","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2304.01752v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2306.01102v2","updated":"2023-08-16T15:49:48Z","published":"2023-06-01T19:33:21Z","title":"LLMatic: Neural Architecture Search via Large Language Models and\n Quality-Diversity Optimization","summary":" Large Language Models (LLMs) have emerged as powerful tools capable of\naccomplishing a broad spectrum of tasks. Their abilities span numerous areas,\nand one area where they have made a significant impact is in the domain of code\ngeneration. In this context, we view LLMs as mutation and crossover tools.\nMeanwhile, Quality-Diversity (QD) algorithms are known to discover diverse and\nrobust solutions. By merging the code-generating abilities of LLMs with the\ndiversity and robustness of QD solutions, we introduce LLMatic, a Neural\nArchitecture Search (NAS) algorithm. While LLMs struggle to conduct NAS\ndirectly through prompts, LLMatic uses a procedural approach, leveraging QD for\nprompts and network architecture to create diverse and highly performant\nnetworks. We test LLMatic on the CIFAR-10 image classification benchmark,\ndemonstrating that it can produce competitive networks with just $2,000$\nsearches, even without prior knowledge of the benchmark domain or exposure to\nany previous top-performing models for the benchmark.\n","authors":["Muhammad U. 
Nasir","Sam Earle","Julian Togelius","Steven James","Christopher Cleghorn"],"pdf_url":"https://arxiv.org/pdf/2306.01102v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08442v1","updated":"2023-08-16T15:49:36Z","published":"2023-08-16T15:49:36Z","title":"Mitigating the Exposure Bias in Sentence-Level Grapheme-to-Phoneme (G2P)\n Transduction","summary":" Text-to-Text Transfer Transformer (T5) has recently been considered for the\nGrapheme-to-Phoneme (G2P) transduction. As a follow-up, a tokenizer-free\nbyte-level model based on T5 referred to as ByT5, recently gave promising\nresults on word-level G2P conversion by representing each input character with\nits corresponding UTF-8 encoding. Although it is generally understood that\nsentence-level or paragraph-level G2P can improve usability in real-world\napplications as it is better suited to perform on heteronyms and linking sounds\nbetween words, we find that using ByT5 for these scenarios is nontrivial. Since\nByT5 operates on the character level, it requires longer decoding steps, which\ndeteriorates the performance due to the exposure bias commonly observed in\nauto-regressive generation models. This paper shows that the performance of\nsentence-level and paragraph-level G2P can be improved by mitigating such\nexposure bias using our proposed loss-based sampling method.\n","authors":["Eunseop Yoon","Hee Suk Yoon","Dhananjaya Gowda","SooHwan Eom","Daehyeok Kim","John Harvill","Heting Gao","Mark Hasegawa-Johnson","Chanwoo Kim","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2308.08442v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2308.08413v1","updated":"2023-08-16T14:58:12Z","published":"2023-08-16T14:58:12Z","title":"Knowledge-Enhanced Multi-Label Few-Shot Product Attribute-Value\n Extraction","summary":" Existing attribute-value extraction (AVE) models require large quantities of\nlabeled data for training. However, new products with new attribute-value pairs\nenter the market every day in real-world e-Commerce. Thus, we formulate AVE in\nmulti-label few-shot learning (FSL), aiming to extract unseen attribute value\npairs based on a small number of training examples. We propose a\nKnowledge-Enhanced Attentive Framework (KEAF) based on prototypical networks,\nleveraging the generated label description and category information to learn\nmore discriminative prototypes. Besides, KEAF integrates with hybrid attention\nto reduce noise and capture more informative semantics for each class by\ncalculating the label-relevant and query-related weights. To achieve\nmulti-label inference, KEAF further learns a dynamic threshold by integrating\nthe semantic information from both the support set and the query set. Extensive\nexperiments with ablation studies conducted on two datasets demonstrate that\nKEAF outperforms other SOTA models for information extraction in FSL. The code\ncan be found at: https://github.com/gjiaying/KEAF\n","authors":["Jiaying Gong","Wei-Te Chen","Hoda Eldardiry"],"pdf_url":"https://arxiv.org/pdf/2308.08413v1.pdf","comment":"6 pages, 2 figures, published in CIKM 2023"},{"id":"http://arxiv.org/abs/2307.07889v2","updated":"2023-08-16T14:55:35Z","published":"2023-07-15T22:02:12Z","title":"LLM Comparative Assessment: Zero-shot NLG Evaluation through Pairwise\n Comparisons using Large Language Models","summary":" Current developments in large language models (LLMs) have enabled impressive\nzero-shot capabilities across various natural language tasks. 
An interesting\napplication of these systems is in the automated assessment of natural language\ngeneration (NLG), a highly challenging area with great practical benefit. In\nthis paper, we explore two options for exploiting the emergent abilities of\nLLMs for zero-shot NLG assessment: absolute score prediction, and comparative\nassessment which uses relative comparisons between pairs of candidates. Though\ncomparative assessment has not been extensively studied in NLG assessment, we\nnote that humans often find it more intuitive to compare two options rather\nthan scoring each one independently. This work examines comparative assessment\nfrom multiple perspectives: performance compared to absolute grading;\npositional biases in the prompt; and efficient ranking in terms of the number\nof comparisons. We illustrate that LLM comparative assessment is a simple,\ngeneral and effective approach for NLG assessment. For moderate-sized\nopen-source LLMs, such as FlanT5 and Llama2-chat, comparative assessment is\nsuperior to prompt scoring, and in many cases can achieve performance\ncompetitive with state-of-the-art methods. Additionally, we demonstrate that\nLLMs often exhibit strong positional biases when making pairwise comparisons,\nand we propose debiasing methods that can further improve performance.\n","authors":["Adian Liusie","Potsawee Manakul","Mark J. F. Gales"],"pdf_url":"https://arxiv.org/pdf/2307.07889v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2307.11787v2","updated":"2023-08-16T14:03:03Z","published":"2023-07-20T16:22:36Z","title":"LLM Cognitive Judgements Differ From Human","summary":" Large Language Models (LLMs) have lately been in the spotlight of\nresearchers, businesses, and consumers alike. While the linguistic capabilities\nof such models have been studied extensively, there is growing interest in\ninvestigating them as cognitive subjects. In the present work I examine GPT-3\nand ChatGPT capabilities on a limited-data inductive reasoning task from the\ncognitive science literature. The results suggest that these models' cognitive\njudgements are not human-like.\n","authors":["Sotiris Lamprinidis"],"pdf_url":"https://arxiv.org/pdf/2307.11787v2.pdf","comment":"7 pages, 1 figure. License changed to CC BY-NC-SA"},{"id":"http://arxiv.org/abs/2308.08378v1","updated":"2023-08-16T14:01:25Z","published":"2023-08-16T14:01:25Z","title":"Advancing continual lifelong learning in neural information retrieval:\n definition, dataset, framework, and empirical evaluation","summary":" Continual learning refers to the capability of a machine learning model to\nlearn and adapt to new information, without compromising its performance on\npreviously learned tasks. Although several studies have investigated continual\nlearning methods for information retrieval tasks, a well-defined task\nformulation is still lacking, and it is unclear how typical learning strategies\nperform in this context. To address this challenge, a systematic task\nformulation of continual neural information retrieval is presented, along with\na multiple-topic dataset that simulates continuous information retrieval. A\ncomprehensive continual neural information retrieval framework consisting of\ntypical retrieval models and continual learning strategies is then proposed.\nEmpirical evaluations illustrate that the proposed framework can successfully\nprevent catastrophic forgetting in neural information retrieval and enhance\nperformance on previously learned tasks. 
The results indicate that\nembedding-based retrieval models experience a decline in their continual\nlearning performance as the topic shift distance and dataset volume of new\ntasks increase. In contrast, pretraining-based models do not show any such\ncorrelation. Adopting suitable learning strategies can mitigate the effects of\ntopic shift and data augmentation.\n","authors":["Jingrui Hou","Georgina Cosma","Axel Finke"],"pdf_url":"https://arxiv.org/pdf/2308.08378v1.pdf","comment":"Submitted to Information Sciences"},{"id":"http://arxiv.org/abs/2308.08363v1","updated":"2023-08-16T13:39:06Z","published":"2023-08-16T13:39:06Z","title":"SummHelper: Collaborative Human-Computer Summarization","summary":" Current approaches for text summarization are predominantly automatic, with\nrather limited space for human intervention and control over the process. In\nthis paper, we introduce SummHelper, a 2-phase summarization assistant designed\nto foster human-machine collaboration. The initial phase involves content\nselection, where the system recommends potential content, allowing users to\naccept, modify, or introduce additional selections. The subsequent phase,\ncontent consolidation, involves SummHelper generating a coherent summary from\nthese selections, which users can then refine using visual mappings between the\nsummary and the source text. Small-scale user studies reveal the effectiveness\nof our application, with participants being especially appreciative of the\nbalance between automated guidance and opportunities for personal input.\n","authors":["Aviv Slobodkin","Niv Nachum","Shmuel Amar","Ori Shapira","Ido Dagan"],"pdf_url":"https://arxiv.org/pdf/2308.08363v1.pdf","comment":"Demo paper"},{"id":"http://arxiv.org/abs/2305.09781v2","updated":"2023-08-16T13:33:06Z","published":"2023-05-16T20:12:59Z","title":"SpecInfer: Accelerating Generative Large Language Model Serving with\n Speculative Inference and Token Tree Verification","summary":" The high computational and memory requirements of generative large language\nmodels (LLMs) make it challenging to serve them quickly and cheaply. This paper\nintroduces SpecInfer, an LLM serving system that accelerates generative LLM\ninference with speculative inference and token tree verification. A key insight\nbehind Specinfer is to combine various collectively boost-tuned small language\nmodels to jointly predict the LLM's outputs; the predictions are organized as a\ntoken tree, whose nodes each represent a candidate token sequence. The\ncorrectness of all candidate token sequences represented by a token tree is\nverified against the LLM in parallel using a novel tree-based parallel decoding\nmechanism. SpecInfer uses an LLM as a token tree verifier instead of an\nincremental decoder, which significantly reduces the end-to-end latency and\ncomputational requirement for serving generative LLMs while provably preserving\nmodel quality. Our evaluation shows that SpecInfer outperforms existing LLM\nserving systems by 1.3-2.4x for distributed LLM inference and by 2.6-3.5x for\noffloading-based LLM inference, while preserving the same generative\nperformance. 
SpecInfer is publicly available at\nhttps://github.com/flexflow/FlexFlow/tree/inference.\n","authors":["Xupeng Miao","Gabriele Oliaro","Zhihao Zhang","Xinhao Cheng","Zeyu Wang","Rae Ying Yee Wong","Alan Zhu","Lijie Yang","Xiaoxiang Shi","Chunan Shi","Zhuoming Chen","Daiyaan Arfeen","Reyna Abhyankar","Zhihao Jia"],"pdf_url":"https://arxiv.org/pdf/2305.09781v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03453v3","updated":"2023-08-16T12:50:46Z","published":"2023-05-05T11:56:30Z","title":"T-SciQ: Teaching Multimodal Chain-of-Thought Reasoning via Large\n Language Model Signals for Science Question Answering","summary":" Large Language Models (LLMs) have recently demonstrated exceptional\nperformance in various Natural Language Processing (NLP) tasks. They have also\nshown the ability to perform chain-of-thought (CoT) reasoning to solve complex\nproblems. Recent studies have explored CoT reasoning in complex multimodal\nscenarios, such as the science question answering task, by fine-tuning\nmultimodal models with high-quality human-annotated CoT rationales. However,\ncollecting high-quality COT rationales is usually time-consuming and costly.\nBesides, the annotated rationales are hardly accurate due to the external\nessential information missed. To address these issues, we propose a novel\nmethod termed \\emph{T-SciQ} that aims at teaching science question answering\nwith LLM signals. The T-SciQ approach generates high-quality CoT rationales as\nteaching signals and is advanced to train much smaller models to perform CoT\nreasoning in complex modalities. Additionally, we introduce a novel data mixing\nstrategy to produce more effective teaching data samples by policy for simple\nand complex science question answer problems. Extensive experimental results\nshow that our T-SciQ method achieves a new state-of-the-art performance on the\nScienceQA benchmark, with an accuracy of 96.18\\%. Moreover, our approach\noutperforms the most powerful fine-tuned baseline by 4.5\\%.\n","authors":["Lei Wang","Yi Hu","Jiabang He","Xing Xu","Ning Liu","Hui Liu","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2305.03453v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14036v2","updated":"2023-08-16T12:03:47Z","published":"2023-02-27T18:47:55Z","title":"Text-only domain adaptation for end-to-end ASR using integrated\n text-to-mel-spectrogram generator","summary":" We propose an end-to-end Automatic Speech Recognition (ASR) system that can\nbe trained on transcribed speech data, text-only data, or a mixture of both.\nThe proposed model uses an integrated auxiliary block for text-based training.\nThis block combines a non-autoregressive multi-speaker text-to-mel-spectrogram\ngenerator with a GAN-based enhancer to improve the spectrogram quality. The\nproposed system can generate a mel-spectrogram dynamically during training. It\ncan be used to adapt the ASR model to a new domain by using text-only data from\nthis domain. We demonstrate that the proposed training method significantly\nimproves ASR accuracy compared to the system trained on transcribed speech\nonly. 
It also surpasses cascade TTS systems with the vocoder in the adaptation\nquality and training speed.\n","authors":["Vladimir Bataev","Roman Korostik","Evgeny Shabalin","Vitaly Lavrukhin","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2302.14036v2.pdf","comment":"Accepted to INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2304.01622v2","updated":"2023-08-16T11:58:38Z","published":"2023-04-04T08:25:12Z","title":"An interpretability framework for Similar case matching","summary":" Similar Case Matching (SCM) plays a pivotal role in the legal system by\nfacilitating the efficient identification of similar cases for legal\nprofessionals. While previous research has primarily concentrated on enhancing\nthe performance of SCM models, the aspect of interpretability has been\nneglected. To bridge the gap, this study proposes an integrated pipeline\nframework for interpretable SCM. The framework comprises four modules: judicial\nfeature sentence identification, case matching, feature sentence alignment, and\nconflict resolution. In contrast to current SCM methods, our framework first\nextracts feature sentences within a legal case that contain essential\ninformation. Then it conducts case matching based on these extracted features.\nSubsequently, our framework aligns the corresponding sentences in two legal\ncases to provide evidence of similarity. In instances where the results of case\nmatching and feature sentence alignment exhibit conflicts, the conflict\nresolution module resolves these inconsistencies. The experimental results show\nthe effectiveness of our proposed framework, establishing a new benchmark for\ninterpretable SCM.\n","authors":["Nankai Lin","Haonan Liu","Jiajun Fang","Dong Zhou","Aimin Yang"],"pdf_url":"https://arxiv.org/pdf/2304.01622v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08295v1","updated":"2023-08-16T11:50:38Z","published":"2023-08-16T11:50:38Z","title":"Detoxify Language Model Step-by-Step","summary":" Detoxification for LLMs is challenging since it requires models to avoid\ngenerating harmful content while maintaining the generation capability. To\nensure the safety of generations, previous detoxification methods detoxify the\nmodels by changing the data distributions or constraining the generations from\ndifferent aspects in a single-step manner. However, these approaches will\ndramatically affect the generation quality of LLMs, e.g., discourse coherence\nand semantic consistency, since language models tend to generate along the\ntoxic prompt while detoxification methods work in the opposite direction. To\nhandle such a conflict, we decompose the detoxification process into different\nsub-steps, where the detoxification is concentrated in the input stage and the\nsubsequent continual generation is based on the non-toxic prompt. Besides, we\nalso calibrate the strong reasoning ability of LLMs by designing a Detox-Chain\nto connect the above sub-steps in an orderly manner, which allows LLMs to\ndetoxify the text step-by-step. Automatic and human evaluation on two\nbenchmarks reveals that by training with Detox-Chain, six LLMs scaling from 1B\nto 33B can obtain significant detoxification and generation improvement. Our\ncode and data are available at https://github.com/CODINNLG/Detox-CoT. 
Warning:\nexamples in the paper may contain uncensored offensive content.\n","authors":["Zecheng Tang","Keyan Zhou","Pinzheng Wang","Yuyang Ding","Juntao Li"," Minzhang"],"pdf_url":"https://arxiv.org/pdf/2308.08295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08285v1","updated":"2023-08-16T11:10:43Z","published":"2023-08-16T11:10:43Z","title":"Pre-training with Large Language Model-based Document Expansion for\n Dense Passage Retrieval","summary":" In this paper, we systematically study the potential of pre-training with\nLarge Language Model (LLM)-based document expansion for dense passage retrieval.\nConcretely, we leverage the capabilities of LLMs for document expansion, i.e.\nquery generation, and effectively transfer expanded knowledge to retrievers\nusing pre-training strategies tailored for passage retrieval. These strategies\ninclude contrastive learning and bottlenecked query generation. Furthermore, we\nincorporate a curriculum learning strategy to reduce the reliance on LLM\ninferences. Experimental results demonstrate that pre-training with LLM-based\ndocument expansion significantly boosts the retrieval performance on\nlarge-scale web-search tasks. Our work shows strong zero-shot and out-of-domain\nretrieval abilities, making it more widely applicable for retrieval when\ninitializing with no human-labeled data.\n","authors":["Guangyuan Ma","Xing Wu","Peng Wang","Zijia Lin","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.08285v1.pdf","comment":"10 pages, 3 tables, 4 figures, under review"},{"id":"http://arxiv.org/abs/2301.10405v5","updated":"2023-08-16T10:57:58Z","published":"2023-01-25T04:45:06Z","title":"Editing Language Model-based Knowledge Graph Embeddings","summary":" Recent decades have witnessed the empirical success of framing Knowledge\nGraph (KG) embeddings via language models. However, language model-based KG\nembeddings are usually deployed as static artifacts, making them difficult to\nmodify after deployment without re-training. To address this\nissue, we propose a new task of editing language model-based KG embeddings in\nthis paper. This task is designed to facilitate rapid, data-efficient updates\nto KG embeddings without compromising the performance of other aspects. We\nbuild four new datasets: E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and\nevaluate several knowledge editing baselines demonstrating the limited ability\nof previous models to handle the proposed challenging task. We further propose\na simple yet strong baseline dubbed KGEditor, which utilizes additional\nparametric layers of the hyper network to edit/add facts. Our comprehensive\nexperimental results reveal that KGEditor excels in updating specific facts\nwithout impacting the overall performance, even when faced with limited\ntraining resources. 
Code and datasets are available in\nhttps://github.com/zjunlp/PromptKG/tree/main/deltaKG.\n","authors":["Siyuan Cheng","Ningyu Zhang","Bozhong Tian","Xi Chen","Qingbing Liu","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2301.10405v5.pdf","comment":"Work in progress and the project website is\n https://zjunlp.github.io/project/KGE_Editing/"},{"id":"http://arxiv.org/abs/2304.08862v2","updated":"2023-08-16T10:48:12Z","published":"2023-04-18T09:52:11Z","title":"Approximate Nearest Neighbour Phrase Mining for Contextual Speech\n Recognition","summary":" This paper presents an extension to train end-to-end Context-Aware\nTransformer Transducer ( CATT ) models by using a simple, yet efficient method\nof mining hard negative phrases from the latent space of the context encoder.\nDuring training, given a reference query, we mine a number of similar phrases\nusing approximate nearest neighbour search. These sampled phrases are then used\nas negative examples in the context list alongside random and ground truth\ncontextual information. By including approximate nearest neighbour phrases\n(ANN-P) in the context list, we encourage the learned representation to\ndisambiguate between similar, but not identical, biasing phrases. This improves\nbiasing accuracy when there are several similar phrases in the biasing\ninventory. We carry out experiments in a large-scale data regime obtaining up\nto 7% relative word error rate reductions for the contextual portion of test\ndata. We also extend and evaluate CATT approach in streaming applications.\n","authors":["Maurits Bleeker","Pawel Swietojanski","Stefan Braun","Xiaodan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2304.08862v2.pdf","comment":"Accepted to Interspeech 2023. 5 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.08253v1","updated":"2023-08-16T09:45:06Z","published":"2023-08-16T09:45:06Z","title":"Benchmarking Neural Network Generalization for Grammar Induction","summary":" How well do neural networks generalize? Even for grammar induction tasks,\nwhere the target generalization is fully known, previous works have left the\nquestion open, testing very limited ranges beyond the training set and using\ndifferent success criteria. We provide a measure of neural network\ngeneralization based on fully specified formal languages. Given a model and a\nformal grammar, the method assigns a generalization score representing how well\na model generalizes to unseen samples in inverse relation to the amount of data\nit was trained on. The benchmark includes languages such as $a^nb^n$,\n$a^nb^nc^n$, $a^nb^mc^{n+m}$, and Dyck-1 and 2. We evaluate selected\narchitectures using the benchmark and find that networks trained with a Minimum\nDescription Length objective (MDL) generalize better and using less data than\nnetworks trained using standard loss functions. The benchmark is available at\nhttps://github.com/taucompling/bliss.\n","authors":["Nur Lan","Emmanuel Chemla","Roni Katzir"],"pdf_url":"https://arxiv.org/pdf/2308.08253v1.pdf","comment":"10 pages, 4 figures, 2 tables. Conference: Learning with Small Data\n 2023"},{"id":"http://arxiv.org/abs/2112.08637v3","updated":"2023-08-16T09:20:12Z","published":"2021-12-16T05:36:08Z","title":"Analyzing the Limits of Self-Supervision in Handling Bias in Language","summary":" Prompting inputs with natural language task descriptions has emerged as a\npopular mechanism to elicit reasonably accurate outputs from large-scale\ngenerative language models with little to no in-context supervision. 
This also\nhelps gain insight into how well language models capture the semantics of a\nwide range of downstream tasks purely from self-supervised pre-training on\nmassive corpora of unlabeled text. Such models have naturally also been exposed\nto a lot of undesirable content like racist and sexist language and there is\nlimited work on awareness of models along these dimensions. In this paper, we\ndefine and comprehensively evaluate how well such language models capture the\nsemantics of four tasks for bias: diagnosis, identification, extraction and\nrephrasing. We define three broad classes of task descriptions for these tasks:\nstatement, question, and completion, with numerous lexical variants within each\nclass. We study the efficacy of prompting for each task using these classes and\nthe null task description across several decoding methods and few-shot\nexamples. Our analyses indicate that language models are capable of performing\nthese tasks to widely varying degrees across different bias dimensions, such as\ngender and political affiliation. We believe our work is an important step\ntowards unbiased language models by quantifying the limits of current\nself-supervision objectives at accomplishing such sociologically challenging\ntasks.\n","authors":["Lisa Bauer","Karthik Gopalakrishnan","Spandana Gella","Yang Liu","Mohit Bansal","Dilek Hakkani-Tur"],"pdf_url":"https://arxiv.org/pdf/2112.08637v3.pdf","comment":"Accepted at Findings of the Conference on Empirical Methods in\n Natural Language Processing (EMNLP) 2022"},{"id":"http://arxiv.org/abs/2308.08241v1","updated":"2023-08-16T09:16:02Z","published":"2023-08-16T09:16:02Z","title":"TEST: Text Prototype Aligned Embedding to Activate LLM's Ability for\n Time Series","summary":" This work summarizes two strategies for completing time-series (TS) tasks\nusing today's language model (LLM): LLM-for-TS, design and train a fundamental\nlarge model for TS data; TS-for-LLM, enable the pre-trained LLM to handle TS\ndata. Considering the insufficient data accumulation, limited resources, and\nsemantic context requirements, this work focuses on TS-for-LLM methods, where\nwe aim to activate LLM's ability for TS data by designing a TS embedding method\nsuitable for LLM. The proposed method is named TEST. It first tokenizes TS,\nbuilds an encoder to embed them by instance-wise, feature-wise, and\ntext-prototype-aligned contrast, and then creates prompts to make LLM more open\nto embeddings, and finally implements TS tasks. Experiments are carried out on\nTS classification and forecasting tasks using 8 LLMs with different structures\nand sizes. Although its results cannot significantly outperform the current\nSOTA models customized for TS tasks, by treating LLM as the pattern machine, it\ncan endow LLM's ability to process TS data without compromising the language\nability. This paper is intended to serve as a foundational work that will\ninspire further research.\n","authors":["Chenxi Sun","Yaliang Li","Hongyan Li","Shenda Hong"],"pdf_url":"https://arxiv.org/pdf/2308.08241v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.08239v1","updated":"2023-08-16T09:15:18Z","published":"2023-08-16T09:15:18Z","title":"MemoChat: Tuning LLMs to Use Memos for Consistent Long-Range Open-Domain\n Conversation","summary":" We propose MemoChat, a pipeline for refining instructions that enables large\nlanguage models (LLMs) to effectively employ self-composed memos for\nmaintaining consistent long-range open-domain conversations. 
We demonstrate a\nlong-range open-domain conversation through iterative\n\"memorization-retrieval-response\" cycles. This requires us to carefully design\ntailored tuning instructions for each distinct stage. The instructions are\nreconstructed from a collection of public datasets to teach the LLMs to\nmemorize and retrieve past dialogues with structured memos, leading to enhanced\nconsistency when participating in future conversations. We invite experts to\nmanually annotate a test set designed to evaluate the consistency of long-range\nconversations questions. Experiments on three testing scenarios involving both\nopen-source and API-accessible chatbots at scale verify the efficacy of\nMemoChat, which outperforms strong baselines.\n","authors":["Junru Lu","Siyu An","Mingbao Lin","Gabriele Pergola","Yulan He","Di Yin","Xing Sun","Yunsheng Wu"],"pdf_url":"https://arxiv.org/pdf/2308.08239v1.pdf","comment":"Codes, data and models will be available soon"},{"id":"http://arxiv.org/abs/2308.08234v1","updated":"2023-08-16T09:11:00Z","published":"2023-08-16T09:11:00Z","title":"Challenges and Opportunities of Using Transformer-Based Multi-Task\n Learning in NLP Through ML Lifecycle: A Survey","summary":" The increasing adoption of natural language processing (NLP) models across\nindustries has led to practitioners' need for machine learning systems to\nhandle these models efficiently, from training to serving them in production.\nHowever, training, deploying, and updating multiple models can be complex,\ncostly, and time-consuming, mainly when using transformer-based pre-trained\nlanguage models. Multi-Task Learning (MTL) has emerged as a promising approach\nto improve efficiency and performance through joint training, rather than\ntraining separate models. Motivated by this, we first provide an overview of\ntransformer-based MTL approaches in NLP. Then, we discuss the challenges and\nopportunities of using MTL approaches throughout typical ML lifecycle phases,\nspecifically focusing on the challenges related to data engineering, model\ndevelopment, deployment, and monitoring phases. This survey focuses on\ntransformer-based MTL architectures and, to the best of our knowledge, is novel\nin that it systematically analyses how transformer-based MTL in NLP fits into\nML lifecycle phases. Furthermore, we motivate research on the connection\nbetween MTL and continual learning (CL), as this area remains unexplored. We\nbelieve it would be practical to have a model that can handle both MTL and CL,\nas this would make it easier to periodically re-train the model, update it due\nto distribution shifts, and add new capabilities to meet real-world\nrequirements.\n","authors":["Lovre Torbarina","Tin Ferkovic","Lukasz Roguski","Velimir Mihelcic","Bruno Sarlija","Zeljko Kraljevic"],"pdf_url":"https://arxiv.org/pdf/2308.08234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09095v2","updated":"2023-08-16T09:09:53Z","published":"2022-12-18T14:36:07Z","title":"Rethinking the Role of Scale for In-Context Learning: An\n Interpretability-based Case Study at 66 Billion Scale","summary":" Language models have been shown to perform better with an increase in scale\non a wide variety of tasks via the in-context learning paradigm. In this paper,\nwe investigate the hypothesis that the ability of a large language model to\nin-context learn-perform a task is not uniformly spread across all of its\nunderlying components. 
Using a 66 billion parameter language model (OPT-66B)\nacross a diverse set of 14 downstream tasks, we find this is indeed the case:\n$\\sim$70% of attention heads and $\\sim$20% of feed forward networks can be\nremoved with minimal decline in task performance. We find substantial overlap\nin the set of attention heads (un)important for in-context learning across\ntasks and number of in-context examples. We also address our hypothesis through\na task-agnostic lens, finding that a small set of attention heads in OPT-66B\nscore highly on their ability to perform primitive induction operations\nassociated with in-context learning, namely, prefix matching and copying. These\ninduction heads overlap with task-specific important heads, reinforcing\narguments by Olsson et al. (arXiv:2209.11895) regarding induction head\ngenerality to more sophisticated behaviors associated with in-context learning.\nOverall, our study provides several insights that indicate large language\nmodels may be under-trained for in-context learning and opens up questions on\nhow to pre-train language models to more effectively perform in-context\nlearning.\n","authors":["Hritik Bansal","Karthik Gopalakrishnan","Saket Dingliwal","Sravan Bodapati","Katrin Kirchhoff","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2212.09095v2.pdf","comment":"Accepted at Annual Meeting of the Association for Computational\n Linguistics (ACL) 2023, Main Proceedings"},{"id":"http://arxiv.org/abs/2307.16082v2","updated":"2023-08-16T09:00:25Z","published":"2023-07-29T21:37:55Z","title":"EnrichEvent: Enriching Social Data with Contextual Information for\n Emerging Event Extraction","summary":" Social platforms have emerged as crucial platforms for disseminating\ninformation and discussing real-life social events, which offers an excellent\nopportunity for researchers to design and implement novel event detection\nframeworks. However, most existing approaches merely exploit keyword burstiness\nor network structures to detect unspecified events. Thus, they often fail to\nidentify unspecified events regarding the challenging nature of events and\nsocial data. Social data, e.g., tweets, is characterized by misspellings,\nincompleteness, word sense ambiguation, and irregular language, as well as\nvariation in aspects of opinions. Moreover, extracting discriminative features\nand patterns for evolving events by exploiting the limited structural knowledge\nis almost infeasible. To address these challenges, in this thesis, we propose a\nnovel framework, namely EnrichEvent, that leverages the lexical and contextual\nrepresentations of streaming social data. In particular, we leverage contextual\nknowledge, as well as lexical knowledge, to detect semantically related tweets\nand enhance the effectiveness of the event detection approaches. Eventually,\nour proposed framework produces cluster chains for each event to show the\nevolving variation of the event through time. 
We conducted extensive\nexperiments to evaluate our framework, validating its high performance and\neffectiveness in detecting and distinguishing unspecified social events.\n","authors":["Mohammadali Sefidi Esfahani","Mohammad Akbari"],"pdf_url":"https://arxiv.org/pdf/2307.16082v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17558v2","updated":"2023-08-16T08:46:52Z","published":"2023-06-30T11:21:40Z","title":"Towards the extraction of robust sign embeddings for low resource sign\n language recognition","summary":" Isolated Sign Language Recognition (SLR) has mostly been applied on datasets\ncontaining signs executed slowly and clearly by a limited group of signers. In\nreal-world scenarios, however, we are met with challenging visual conditions,\ncoarticulated signing, small datasets, and the need for signer independent\nmodels. To tackle this difficult problem, we require a robust feature extractor\nto process the sign language videos. One could expect human pose estimators to\nbe ideal candidates. However, due to a domain mismatch with their training sets\nand challenging poses in sign language, they lack robustness on sign language\ndata and image-based models often still outperform keypoint-based models.\nFurthermore, whereas the common practice of transfer learning with image-based\nmodels yields even higher accuracy, keypoint-based models are typically trained\nfrom scratch on every SLR dataset. These factors limit their usefulness for\nSLR. From the existing literature, it is also not clear which, if any, pose\nestimator performs best for SLR. We compare the three most popular pose\nestimators for SLR: OpenPose, MMPose and MediaPipe. We show that through\nkeypoint normalization, missing keypoint imputation, and learning a pose\nembedding, we can obtain significantly better results and enable transfer\nlearning. We show that keypoint-based embeddings contain cross-lingual\nfeatures: they can transfer between sign languages and achieve competitive\nperformance even when fine-tuning only the classifier layer of an SLR model on\na target sign language. We furthermore achieve better performance using\nfine-tuned transferred embeddings than models trained only on the target sign\nlanguage. The embeddings can also be learned in a multilingual fashion. The\napplication of these embeddings could prove particularly useful for low\nresource sign languages in the future.\n","authors":["Mathieu De Coster","Ellen Rushe","Ruth Holmes","Anthony Ventresque","Joni Dambre"],"pdf_url":"https://arxiv.org/pdf/2306.17558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12267v4","updated":"2023-08-16T08:44:08Z","published":"2023-07-23T08:47:51Z","title":"Towards Automatic Boundary Detection for Human-AI Collaborative Hybrid\n Essay in Education","summary":" The recent large language models (LLMs), e.g., ChatGPT, have been able to\ngenerate human-like and fluent responses when provided with specific\ninstructions. While admitting the convenience brought by technological\nadvancement, educators also have concerns that students might leverage LLMs to\ncomplete their writing assignments and pass them off as their original work.\nAlthough many AI content detection studies have been conducted as a result of\nsuch concerns, most of these prior studies modeled AI content detection as a\nclassification problem, assuming that a text is either entirely human-written\nor entirely AI-generated. 
In this study, we investigated AI content detection\nin a rarely explored yet realistic setting where the text to be detected is\ncollaboratively written by human and generative LLMs (i.e., hybrid text). We\nfirst formalized the detection task as identifying the transition points\nbetween human-written content and AI-generated content from a given hybrid text\n(boundary detection). Then we proposed a two-step approach where we (1)\nseparated AI-generated content from human-written content during the encoder\ntraining process; and (2) calculated the distances between every two adjacent\nprototypes and assumed that the boundaries exist between the two adjacent\nprototypes that have the furthest distance from each other. Through extensive\nexperiments, we observed the following main findings: (1) the proposed approach\nconsistently outperformed the baseline methods across different experiment\nsettings; (2) the encoder training process can significantly boost the\nperformance of the proposed approach; (3) when detecting boundaries for\nsingle-boundary hybrid essays, the proposed approach could be enhanced by\nadopting a relatively large prototype size, leading to a 22% improvement in the\nIn-Domain evaluation and an 18% improvement in the Out-of-Domain evaluation.\n","authors":["Zijie Zeng","Lele Sha","Yuheng Li","Kaixun Yang","Dragan Gašević","Guanliang Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12267v4.pdf","comment":"9 pages including references, 2 figures"},{"id":"http://arxiv.org/abs/2303.09713v2","updated":"2023-08-16T08:17:02Z","published":"2023-03-17T01:10:33Z","title":"CHAMPAGNE: Learning Real-world Conversation from Large-Scale Web Videos","summary":" Visual information is central to conversation: body gestures and physical\nbehaviour, for example, contribute to meaning that transcends words alone. To\ndate, however, most neural conversational models are limited to just text. We\nintroduce CHAMPAGNE, a generative model of conversations that can account for\nvisual contexts. To train CHAMPAGNE, we collect and release YTD-18M, a\nlarge-scale corpus of 18M video-based dialogues. YTD-18M is constructed from\nweb videos: crucial to our data collection pipeline is a pretrained language\nmodel that converts error-prone automatic transcripts to a cleaner dialogue\nformat while maintaining meaning. Human evaluation reveals that YTD-18M is more\nsensible and specific than prior resources (MMDialog, 1M dialogues), while\nmaintaining visual-groundedness. Experiments demonstrate that 1) CHAMPAGNE\nlearns to conduct conversation from YTD-18M; and 2) when fine-tuned, it\nachieves state-of-the-art results on four vision-language tasks focused on\nreal-world conversations. We release data, models, and code.\n","authors":["Seungju Han","Jack Hessel","Nouha Dziri","Yejin Choi","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2303.09713v2.pdf","comment":"ICCV 2023, Project page: https://seungjuhan.me/champagne"},{"id":"http://arxiv.org/abs/2308.08204v1","updated":"2023-08-16T08:09:10Z","published":"2023-08-16T08:09:10Z","title":"MoCoSA: Momentum Contrast for Knowledge Graph Completion with\n Structure-Augmented Pre-trained Language Models","summary":" Knowledge Graph Completion (KGC) aims to conduct reasoning on the facts\nwithin knowledge graphs and automatically infer missing links. Existing methods\ncan mainly be categorized into structure-based or description-based. On the one\nhand, structure-based methods effectively represent relational facts in\nknowledge graphs using entity embeddings. 
However, they struggle with\nsemantically rich real-world entities due to limited structural information and\nfail to generalize to unseen entities. On the other hand, description-based\nmethods leverage pre-trained language models (PLMs) to understand textual\ninformation. They exhibit strong robustness towards unseen entities. However,\nthey have difficulty with larger negative sampling and often lag behind\nstructure-based methods. To address these issues, in this paper, we propose\nMomentum Contrast for knowledge graph completion with Structure-Augmented\npre-trained language models (MoCoSA), which allows the PLM to perceive the\nstructural information by the adaptable structure encoder. To improve learning\nefficiency, we proposed momentum hard negative and intra-relation negative\nsampling. Experimental results demonstrate that our approach achieves\nstate-of-the-art performance in terms of mean reciprocal rank (MRR), with\nimprovements of 2.5% on WN18RR and 21% on OpenBG500.\n","authors":["Jiabang He","Liu Jia","Lei Wang","Xiyao Li","Xing Xu"],"pdf_url":"https://arxiv.org/pdf/2308.08204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08181v1","updated":"2023-08-16T07:21:01Z","published":"2023-08-16T07:21:01Z","title":"ChinaTelecom System Description to VoxCeleb Speaker Recognition\n Challenge 2023","summary":" This technical report describes ChinaTelecom system for Track 1 (closed) of\nthe VoxCeleb2023 Speaker Recognition Challenge (VoxSRC 2023). Our system\nconsists of several ResNet variants trained only on VoxCeleb2, which were fused\nfor better performance later. Score calibration was also applied for each\nvariant and the fused system. The final submission achieved minDCF of 0.1066\nand EER of 1.980%.\n","authors":["Mengjie Du","Xiang Fang","Jie Li"],"pdf_url":"https://arxiv.org/pdf/2308.08181v1.pdf","comment":"System description of VoxSRC 2023"},{"id":"http://arxiv.org/abs/2308.08176v1","updated":"2023-08-16T07:12:23Z","published":"2023-08-16T07:12:23Z","title":"RSpell: Retrieval-augmented Framework for Domain Adaptive Chinese\n Spelling Check","summary":" Chinese Spelling Check (CSC) refers to the detection and correction of\nspelling errors in Chinese texts. In practical application scenarios, it is\nimportant to make CSC models have the ability to correct errors across\ndifferent domains. In this paper, we propose a retrieval-augmented spelling\ncheck framework called RSpell, which searches corresponding domain terms and\nincorporates them into CSC models. Specifically, we employ pinyin fuzzy\nmatching to search for terms, which are combined with the input and fed into\nthe CSC model. Then, we introduce an adaptive process control mechanism to\ndynamically adjust the impact of external knowledge on the model. Additionally,\nwe develop an iterative strategy for the RSpell framework to enhance reasoning\ncapabilities. We conducted experiments on CSC datasets in three domains: law,\nmedicine, and official document writing. The results demonstrate that RSpell\nachieves state-of-the-art performance in both zero-shot and fine-tuning\nscenarios, demonstrating the effectiveness of the retrieval-augmented CSC\nframework. 
Our code is available at https://github.com/47777777/Rspell.\n","authors":["Siqi Song","Qi Lv","Lei Geng","Ziqiang Cao","Guohong Fu"],"pdf_url":"https://arxiv.org/pdf/2308.08176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03266v2","updated":"2023-08-16T07:03:42Z","published":"2023-08-07T03:12:27Z","title":"SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and\n Effective Hotword Customization Ability","summary":" Hotword customization is one of the important issues remaining in the ASR\nfield - it is of value to enable users of ASR systems to customize names of\nentities, persons and other phrases. The past few years have seen both implicit\nand explicit modeling strategies for ASR contextualization developed. While\nthese approaches have performed adequately, they still exhibit certain\nshortcomings such as instability in effectiveness. In this paper we propose\nSemantic-augmented Contextual-Paraformer (SeACo-Paraformer), a novel NAR-based\nASR system with flexible and effective hotword customization ability. It\ncombines the accuracy of the AED-based model, the efficiency of the NAR model,\nand excellent performance in contextualization. In experiments on 50,000 hours\nof industrial data, our proposed model outperforms strong baselines in\ncustomization and general ASR tasks. Besides, we explore an efficient way to\nfilter large-scale incoming hotwords for further improvement. The source code\nand the industrial models proposed and compared are open-sourced, along with\ntwo hotword test sets.\n","authors":["Xian Shi","Yexin Yang","Zerui Li","Shiliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03266v2.pdf","comment":"updated draft"},{"id":"http://arxiv.org/abs/2308.08169v1","updated":"2023-08-16T06:52:10Z","published":"2023-08-16T06:52:10Z","title":"Enhancing Performance on Seen and Unseen Dialogue Scenarios using\n Retrieval-Augmented End-to-End Task-Oriented System","summary":" End-to-end task-oriented dialogue (TOD) systems have achieved promising\nperformance by leveraging the sophisticated natural language understanding and\nnatural language generation capabilities of pre-trained models. This work\nenables TOD systems with more flexibility through a simple cache. The cache\nprovides the flexibility to dynamically update the TOD systems and handle both\nexisting and unseen dialogue scenarios. Towards this end, we first fine-tune a\nretrieval module to effectively retrieve the most relevant information entries\nfrom the cache. We then train end-to-end TOD models that can refer to and\nground on both dialogue history and retrieved information during TOD\ngeneration. The cache is straightforward to construct, and the backbone models\nof TOD systems are compatible with existing pre-trained generative models.\nExtensive experiments demonstrate the superior performance of our framework,\nwith a notable improvement in non-empty joint goal accuracy of 6.7% compared to\nstrong baselines.\n","authors":["Jianguo Zhang","Stephen Roller","Kun Qian","Zhiwei Liu","Rui Meng","Shelby Heinecke","Huan Wang","Silvio Savarese","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.08169v1.pdf","comment":"Accepted by SIGDIAL 2023 as a long paper"},{"id":"http://arxiv.org/abs/2307.14385v2","updated":"2023-08-16T06:04:48Z","published":"2023-07-26T06:00:50Z","title":"Mental-LLM: Leveraging Large Language Models for Mental Health\n Prediction via Online Text Data","summary":" Advances in large language models (LLMs) have empowered a variety of\napplications. 
However, there is still a significant gap in research when it\ncomes to understanding and enhancing the capabilities of LLMs in the field of\nmental health. In this work, we present the first comprehensive evaluation of\nmultiple LLMs, including Alpaca, Alpaca-LoRA, FLAN-T5, GPT-3.5, and GPT-4, on\nvarious mental health prediction tasks via online text data. We conduct a broad\nrange of experiments, covering zero-shot prompting, few-shot prompting, and\ninstruction fine-tuning. The results indicate a promising yet limited\nperformance of LLMs with zero-shot and few-shot prompt designs for the mental\nhealth tasks. More importantly, our experiments show that instruction\nfinetuning can significantly boost the performance of LLMs for all tasks\nsimultaneously. Our best-finetuned models, Mental-Alpaca and Mental-FLAN-T5,\noutperform the best prompt design of GPT-3.5 (25 and 15 times bigger) by 10.9%\non balanced accuracy and the best of GPT-4 (250 and 150 times bigger) by 4.8%.\nThey further perform on par with the state-of-the-art task-specific language\nmodel. We also conduct an exploratory case study on LLMs' capability on the\nmental health reasoning tasks, illustrating the promising capability of certain\nmodels such as GPT-4. We summarize our findings into a set of action guidelines\nfor potential methods to enhance LLMs' capability for mental health tasks.\nMeanwhile, we also emphasize the important limitations before achieving\ndeployability in real-world mental health settings, such as known racial and\ngender bias. We highlight the important ethical risks accompanying this line of\nresearch.\n","authors":["Xuhai Xu","Bingshen Yao","Yuanzhe Dong","Saadia Gabriel","Hong Yu","James Hendler","Marzyeh Ghassemi","Anind K. Dey","Dakuo Wang"],"pdf_url":"https://arxiv.org/pdf/2307.14385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07711v2","updated":"2023-08-16T05:58:16Z","published":"2023-08-15T11:45:34Z","title":"SPM: Structured Pretraining and Matching Architectures for Relevance\n Modeling in Meituan Search","summary":" In e-commerce search, relevance between query and documents is an essential\nrequirement for satisfying user experience. Different from traditional\ne-commerce platforms that offer products, users search on life service\nplatforms such as Meituan mainly for product providers, which usually have\nabundant structured information, e.g. name, address, category, thousands of\nproducts. Modeling search relevance with these rich structured contents is\nchallenging due to the following issues: (1) there is language distribution\ndiscrepancy among different fields of structured document, making it difficult\nto directly adopt off-the-shelf pretrained language model based methods like\nBERT. (2) different fields usually have different importance and their length\nvary greatly, making it difficult to extract document information helpful for\nrelevance matching.\n To tackle these issues, in this paper we propose a novel two-stage\npretraining and matching architecture for relevance matching with rich\nstructured documents. At pretraining stage, we propose an effective pretraining\nmethod that employs both query and multiple fields of document as inputs,\nincluding an effective information compression method for lengthy fields. At\nrelevance matching stage, a novel matching method is proposed by leveraging\ndomain knowledge in search query to generate more effective document\nrepresentations for relevance scoring. 
Extensive offline experiments and online\nA/B tests on millions of users verify that the proposed architectures\neffectively improve the performance of relevance modeling. The model has\nalready been deployed online, serving the search traffic of Meituan for over a\nyear.\n","authors":["Wen Zan","Yaopeng Han","Xiaotian Jiang","Yao Xiao","Yang Yang","Dayao Chen","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07711v2.pdf","comment":"Accepted by CIKM '23"},{"id":"http://arxiv.org/abs/2308.08156v1","updated":"2023-08-16T05:58:12Z","published":"2023-08-16T05:58:12Z","title":"Sarcasm Detection in a Disaster Context","summary":" During natural disasters, people often use social media platforms such as\nTwitter to ask for help, to provide information about the disaster situation,\nor to express contempt about the unfolding event or public policies and\nguidelines. This contempt is in some cases expressed as sarcasm or irony.\nUnderstanding this form of speech in a disaster-centric context is essential to\nimproving natural language understanding of disaster-related tweets. In this\npaper, we introduce HurricaneSARC, a dataset of 15,000 tweets annotated for\nintended sarcasm, and provide a comprehensive investigation of sarcasm\ndetection using pre-trained language models. Our best model is able to obtain\nas much as 0.70 F1 on our dataset. We also demonstrate that the performance on\nHurricaneSARC can be improved by leveraging intermediate task transfer\nlearning. We release our data and code at\nhttps://github.com/tsosea2/HurricaneSarc.\n","authors":["Tiberiu Sosea","Junyi Jessy Li","Cornelia Caragea"],"pdf_url":"https://arxiv.org/pdf/2308.08156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08155v1","updated":"2023-08-16T05:57:52Z","published":"2023-08-16T05:57:52Z","title":"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation\n Framework","summary":" This technical report presents AutoGen, a new framework that enables\ndevelopment of LLM applications using multiple agents that can converse with\neach other to solve tasks. AutoGen agents are customizable, conversable, and\nseamlessly allow human participation. They can operate in various modes that\nemploy combinations of LLMs, human inputs, and tools. AutoGen's design offers\nmultiple advantages: a) it gracefully navigates the strong but imperfect\ngeneration and reasoning abilities of these LLMs; b) it leverages human\nunderstanding and intelligence, while providing valuable automation through\nconversations between agents; c) it simplifies and unifies the implementation\nof complex LLM workflows as automated agent chats. We provide many diverse\nexamples of how developers can easily use AutoGen to effectively solve tasks or\nbuild applications, ranging from coding, mathematics, operations research,\nentertainment, online decision-making, question answering, etc.\n","authors":["Qingyun Wu","Gagan Bansal","Jieyu Zhang","Yiran Wu","Shaokun Zhang","Erkang Zhu","Beibin Li","Li Jiang","Xiaoyun Zhang","Chi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08155v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2308.08153v1","updated":"2023-08-16T05:48:50Z","published":"2023-08-16T05:48:50Z","title":"Fast Training of NMT Model with Data Sorting","summary":" The Transformer model has revolutionized Natural Language Processing tasks\nsuch as Neural Machine Translation, and many efforts have been made to study\nthe Transformer architecture, which increased its efficiency and accuracy. 
One\npotential area for improvement is to address the computation of empty tokens\nthat the Transformer computes only to discard them later, leading to an\nunnecessary computational burden. To tackle this, we propose an algorithm that\nsorts translation sentence pairs based on their length before batching,\nminimizing the waste of computing power. Since the amount of sorting could\nviolate the independent and identically distributed (i.i.d) data assumption, we\nsort the data partially. In experiments, we apply the proposed method to\nEnglish-Korean and English-Luganda language pairs for machine translation and\nshow that there are gains in computational time while maintaining the\nperformance. Our method is independent of architectures, so it can be easily\nintegrated into any training process with flexible data lengths.\n","authors":["Daniela N. Rim","Kimera Richard","Heeyoul Choi"],"pdf_url":"https://arxiv.org/pdf/2308.08153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08147v1","updated":"2023-08-16T04:56:55Z","published":"2023-08-16T04:56:55Z","title":"MDDial: A Multi-turn Differential Diagnosis Dialogue Dataset with\n Reliability Evaluation","summary":" Dialogue systems for Automatic Differential Diagnosis (ADD) have a wide range\nof real-life applications. These dialogue systems are promising for providing\neasy access and reducing medical costs. Building end-to-end ADD dialogue\nsystems requires dialogue training datasets. However, to the best of our\nknowledge, there is no publicly available ADD dialogue dataset in English\n(although non-English datasets exist). Driven by this, we introduce MDDial, the\nfirst differential diagnosis dialogue dataset in English, which can aid in\nbuilding and evaluating end-to-end ADD dialogue systems. Additionally, earlier\nstudies present the accuracy of diagnosis and symptoms either individually or\nas a combined weighted score. This method overlooks the connection between the\nsymptoms and the diagnosis. We introduce a unified score for the ADD system\nthat takes into account the interplay between symptoms and diagnosis. This\nscore also indicates the system's reliability. To this end, we train two\nmoderate-sized language models on MDDial. Our experiments suggest that while\nthese language models can perform well on many natural language understanding\ntasks, including dialogue tasks in the general domain, they struggle to relate\nrelevant symptoms and diseases and thus perform poorly on MDDial. MDDial\nwill be released publicly to aid the study of ADD dialogue research.\n","authors":["Srija Macherla","Man Luo","Mihir Parmar","Chitta Baral"],"pdf_url":"https://arxiv.org/pdf/2308.08147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08125v1","updated":"2023-08-16T03:31:30Z","published":"2023-08-16T03:31:30Z","title":"Radio2Text: Streaming Speech Recognition Using mmWave Radio Signals","summary":" Millimeter wave (mmWave) based speech recognition provides more possibility\nfor audio-related applications, such as conference speech transcription and\neavesdropping. However, considering the practicality in real scenarios, latency\nand recognizable vocabulary size are two critical factors that cannot be\noverlooked. In this paper, we propose Radio2Text, the first mmWave-based system\nfor streaming automatic speech recognition (ASR) with a vocabulary size\nexceeding 13,000 words. 
Radio2Text is based on a tailored streaming Transformer\nthat is capable of effectively learning representations of speech-related\nfeatures, paving the way for streaming ASR with a large vocabulary. To\nalleviate the deficiency of streaming networks unable to access entire future\ninputs, we propose the Guidance Initialization that facilitates the transfer of\nfeature knowledge related to the global context from the non-streaming\nTransformer to the tailored streaming Transformer through weight inheritance.\nFurther, we propose a cross-modal structure based on knowledge distillation\n(KD), named cross-modal KD, to mitigate the negative effect of low quality\nmmWave signals on recognition performance. In the cross-modal KD, the audio\nstreaming Transformer provides feature and response guidance that inherit\nfruitful and accurate speech information to supervise the training of the\ntailored radio streaming Transformer. The experimental results show that our\nRadio2Text can achieve a character error rate of 5.7% and a word error rate of\n9.4% for the recognition of a vocabulary consisting of over 13,000 words.\n","authors":["Running Zhao","Jiangtao Yu","Hang Zhao","Edith C. H. Ngai"],"pdf_url":"https://arxiv.org/pdf/2308.08125v1.pdf","comment":"Accepted by Proceedings of the ACM on Interactive, Mobile, Wearable\n and Ubiquitous Technologies (ACM IMWUT/UbiComp 2023)"},{"id":"http://arxiv.org/abs/2308.08090v1","updated":"2023-08-16T01:46:01Z","published":"2023-08-16T01:46:01Z","title":"Separate the Wheat from the Chaff: Model Deficiency Unlearning via\n Parameter-Efficient Module Operation","summary":" Large language models (LLMs) have been widely used in various applications\nbut are known to suffer from issues related to untruthfulness and toxicity.\nWhile parameter-efficient modules (PEMs) have demonstrated their effectiveness\nin equipping models with new skills, leveraging PEMs for deficiency unlearning\nremains underexplored. In this work, we propose a PEMs operation approach,\nnamely Extraction-before-Subtraction (Ext-Sub), to enhance the truthfulness and\ndetoxification of LLMs through the integration of ``expert'' PEM and\n``anti-expert'' PEM. Remarkably, even anti-expert PEM possess valuable\ncapabilities due to their proficiency in generating fabricated content, which\nnecessitates language modeling and logical narrative competence. Rather than\nmerely negating the parameters, our approach involves extracting and\neliminating solely the deficiency capability within anti-expert PEM while\npreserving the general capabilities. To evaluate the effectiveness of our\napproach in terms of truthfulness and detoxification, we conduct extensive\nexperiments on LLMs, encompassing additional abilities such as language\nmodeling and mathematical reasoning. 
Our empirical results demonstrate that our\napproach effectively improves truthfulness and detoxification, while largely\npreserving the fundamental abilities of LLMs.\n","authors":["Xinshuo Hu","Dongfang Li","Zihao Zheng","Zhenyu Liu","Baotian Hu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11095v3","updated":"2023-08-16T00:57:34Z","published":"2023-05-18T16:32:58Z","title":"Prompting the Hidden Talent of Web-Scale Speech Models for Zero-Shot\n Task Generalization","summary":" We investigate the emergent abilities of the recently proposed web-scale\nspeech model Whisper, by adapting it to unseen tasks with prompt engineering.\nWe selected three tasks: audio-visual speech recognition (AVSR), code-switched\nspeech recognition (CS-ASR), and speech translation (ST) on unseen language\npairs. We design task-specific prompts, by either leveraging another\nlarge-scale model, or simply manipulating the special tokens in the default\nprompts. Experiments show that compared to the default prompts, our proposed\nprompts improve performance by 10% to 45% on the three zero-shot tasks, and\neven outperform SotA supervised models on some datasets. In addition, our\nexperiments reveal many interesting properties of Whisper, including its\nrobustness to prompts, bias on accents, and the multilingual understanding in\nits latent space. Code is available at\nhttps://github.com/jasonppy/PromptingWhisper\n","authors":["Puyuan Peng","Brian Yan","Shinji Watanabe","David Harwath"],"pdf_url":"https://arxiv.org/pdf/2305.11095v3.pdf","comment":"Interspeech 2023"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.08545v1","updated":"2023-08-16T17:59:13Z","published":"2023-08-16T17:59:13Z","title":"TeCH: Text-guided Reconstruction of Lifelike Clothed Humans","summary":" Despite recent research advancements in reconstructing clothed humans from a\nsingle image, accurately restoring the \"unseen regions\" with high-level details\nremains an unsolved challenge that lacks attention. Existing methods often\ngenerate overly smooth back-side surfaces with a blurry texture. But how to\neffectively capture all visual attributes of an individual from a single image,\nwhich are sufficient to reconstruct unseen areas (e.g., the back view)?\nMotivated by the power of foundation models, TeCH reconstructs the 3D human by\nleveraging 1) descriptive text prompts (e.g., garments, colors, hairstyles)\nwhich are automatically generated via a garment parsing model and Visual\nQuestion Answering (VQA), 2) a personalized fine-tuned Text-to-Image diffusion\nmodel (T2I) which learns the \"indescribable\" appearance. To represent\nhigh-resolution 3D clothed humans at an affordable cost, we propose a hybrid 3D\nrepresentation based on DMTet, which consists of an explicit body shape grid\nand an implicit distance field. Guided by the descriptive prompts +\npersonalized T2I diffusion model, the geometry and texture of the 3D humans are\noptimized through multi-view Score Distillation Sampling (SDS) and\nreconstruction losses based on the original observation. TeCH produces\nhigh-fidelity 3D clothed humans with consistent & delicate texture, and\ndetailed full-body geometry. Quantitative and qualitative experiments\ndemonstrate that TeCH outperforms the state-of-the-art methods in terms of\nreconstruction accuracy and rendering quality. 
The code will be publicly\navailable for research purposes at https://huangyangyi.github.io/tech\n","authors":["Yangyi Huang","Hongwei Yi","Yuliang Xiu","Tingting Liao","Jiaxiang Tang","Deng Cai","Justus Thies"],"pdf_url":"https://arxiv.org/pdf/2308.08545v1.pdf","comment":"Project: https://huangyangyi.github.io/tech"},{"id":"http://arxiv.org/abs/2303.12791v2","updated":"2023-08-16T17:58:35Z","published":"2023-03-22T17:59:12Z","title":"SHERF: Generalizable Human NeRF from a Single Image","summary":" Existing Human NeRF methods for reconstructing 3D humans typically rely on\nmultiple 2D images from multi-view cameras or monocular videos captured from\nfixed camera views. However, in real-world scenarios, human images are often\ncaptured from random camera angles, presenting challenges for high-quality 3D\nhuman reconstruction. In this paper, we propose SHERF, the first generalizable\nHuman NeRF model for recovering animatable 3D humans from a single input image.\nSHERF extracts and encodes 3D human representations in canonical space,\nenabling rendering and animation from free views and poses. To achieve\nhigh-fidelity novel view and pose synthesis, the encoded 3D human\nrepresentations should capture both global appearance and local fine-grained\ntextures. To this end, we propose a bank of 3D-aware hierarchical features,\nincluding global, point-level, and pixel-aligned features, to facilitate\ninformative encoding. Global features enhance the information extracted from\nthe single input image and complement the information missing from the partial\n2D observation. Point-level features provide strong clues of 3D human\nstructure, while pixel-aligned features preserve more fine-grained details. To\neffectively integrate the 3D-aware hierarchical feature bank, we design a\nfeature fusion transformer. Extensive experiments on THuman, RenderPeople,\nZJU_MoCap, and HuMMan datasets demonstrate that SHERF achieves state-of-the-art\nperformance, with better generalizability for novel view and pose synthesis.\n","authors":["Shoukang Hu","Fangzhou Hong","Liang Pan","Haiyi Mei","Lei Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2303.12791v2.pdf","comment":"Accepted by ICCV2023. Project webpage:\n https://skhu101.github.io/SHERF/"},{"id":"http://arxiv.org/abs/2308.08544v1","updated":"2023-08-16T17:58:34Z","published":"2023-08-16T17:58:34Z","title":"MeViS: A Large-scale Benchmark for Video Segmentation with Motion\n Expressions","summary":" This paper strives for motion expressions guided video segmentation, which\nfocuses on segmenting objects in video content based on a sentence describing\nthe motion of the objects. Existing referring video object datasets typically\nfocus on salient objects and use language expressions that contain excessive\nstatic attributes that could potentially enable the target object to be\nidentified in a single frame. These datasets downplay the importance of motion\nin video content for language-guided video object segmentation. To investigate\nthe feasibility of using motion expressions to ground and segment objects in\nvideos, we propose a large-scale dataset called MeViS, which contains numerous\nmotion expressions to indicate target objects in complex environments. We\nbenchmarked 5 existing referring video object segmentation (RVOS) methods and\nconducted a comprehensive comparison on the MeViS dataset. The results show\nthat current RVOS methods cannot effectively address motion expression-guided\nvideo segmentation. 
We further analyze the challenges and propose a baseline\napproach for the proposed MeViS dataset. The goal of our benchmark is to\nprovide a platform that enables the development of effective language-guided\nvideo segmentation algorithms that leverage motion expressions as a primary cue\nfor object segmentation in complex video scenes. The proposed MeViS dataset has\nbeen released at https://henghuiding.github.io/MeViS.\n","authors":["Henghui Ding","Chang Liu","Shuting He","Xudong Jiang","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2308.08544v1.pdf","comment":"ICCV 2023, Project Page: https://henghuiding.github.io/MeViS/"},{"id":"http://arxiv.org/abs/2308.08543v1","updated":"2023-08-16T17:58:28Z","published":"2023-08-16T17:58:28Z","title":"InsightMapper: A Closer Look at Inner-instance Information for\n Vectorized High-Definition Mapping","summary":" Vectorized high-definition (HD) maps contain detailed information about\nsurrounding road elements, which are crucial for various downstream tasks in\nmodern autonomous driving vehicles, such as vehicle planning and control.\nRecent works have attempted to directly detect the vectorized HD map as a point\nset prediction task, resulting in significant improvements in detection\nperformance. However, these approaches fail to analyze and exploit the\ninner-instance correlations between predicted points, impeding further\nadvancements. To address these challenges, we investigate the utilization of\ninner-$\\textbf{INS}$tance information for vectorized h$\\textbf{IGH}$-definition\nmapping through $\\textbf{T}$ransformers and introduce InsightMapper. This paper\npresents three novel designs within InsightMapper that leverage inner-instance\ninformation in distinct ways, including hybrid query generation, inner-instance\nquery fusion, and inner-instance feature aggregation. Comparative experiments\nare conducted on the NuScenes dataset, showcasing the superiority of our\nproposed method. InsightMapper surpasses previous state-of-the-art (SOTA)\nmethods by 5.78 mAP and 5.12 TOPO, which assess topology correctness.\nSimultaneously, InsightMapper maintains high efficiency during both training\nand inference phases, resulting in remarkable comprehensive performance. The\nproject page for this work is available at\nhttps://tonyxuqaq.github.io/projects/InsightMapper .\n","authors":["Zhenhua Xu","Kenneth K. Y. Wong","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.08543v1.pdf","comment":"Code and demo will be available at\n https://tonyxuqaq.github.io/projects/InsightMapper"},{"id":"http://arxiv.org/abs/2211.10946v2","updated":"2023-08-16T17:54:55Z","published":"2022-11-20T11:02:50Z","title":"Normalizing Flows for Human Pose Anomaly Detection","summary":" Video anomaly detection is an ill-posed problem because it relies on many\nparameters such as appearance, pose, camera angle, background, and more. We\ndistill the problem to anomaly detection of human pose, thus decreasing the\nrisk of nuisance parameters such as appearance affecting the result. Focusing\non pose alone also has the side benefit of reducing bias against distinct\nminority groups. Our model works directly on human pose graph sequences and is\nexceptionally lightweight (~1K parameters), capable of running on any machine\nable to run the pose estimation with negligible additional resources. 
We\nleverage the highly compact pose representation in a normalizing flows\nframework, which we extend to tackle the unique characteristics of\nspatio-temporal pose data and show its advantages in this use case. The\nalgorithm is quite general and can handle training data of only normal examples\nas well as a supervised setting that consists of labeled normal and abnormal\nexamples. We report state-of-the-art results on two anomaly detection\nbenchmarks - the unsupervised ShanghaiTech dataset and the recent supervised\nUBnormal dataset.\n","authors":["Or Hirschorn","Shai Avidan"],"pdf_url":"https://arxiv.org/pdf/2211.10946v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09375v3","updated":"2023-08-16T17:54:39Z","published":"2023-03-16T15:04:10Z","title":"DINAR: Diffusion Inpainting of Neural Textures for One-Shot Human\n Avatars","summary":" We present DINAR, an approach for creating realistic rigged full-body avatars\nfrom single RGB images. Similarly to previous works, our method uses neural\ntextures combined with the SMPL-X body model to achieve photo-realistic quality\nof avatars while keeping them easy to animate and fast to infer. To restore the\ntexture, we use a latent diffusion model and show how such a model can be\ntrained in the neural texture space. The use of the diffusion model allows us\nto realistically reconstruct large unseen regions such as the back of a person\ngiven the frontal view. The models in our pipeline are trained using 2D images\nand videos only. In the experiments, our approach achieves state-of-the-art\nrendering quality and good generalization to new poses and viewpoints. In\nparticular, the approach improves the state of the art on the SnapshotPeople\npublic benchmark.\n","authors":["David Svitov","Dmitrii Gudkov","Renat Bashirov","Victor Lempitsky"],"pdf_url":"https://arxiv.org/pdf/2303.09375v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08530v1","updated":"2023-08-16T17:40:18Z","published":"2023-08-16T17:40:18Z","title":"Ref-DVGO: Reflection-Aware Direct Voxel Grid Optimization for an\n Improved Quality-Efficiency Trade-Off in Reflective Scene Reconstruction","summary":" Neural Radiance Fields (NeRFs) have revolutionized the field of novel view\nsynthesis, demonstrating remarkable performance. However, the modeling and\nrendering of reflective objects remain challenging problems. Recent methods\nhave shown significant improvements over the baselines in handling reflective\nscenes, albeit at the expense of efficiency. In this work, we aim to strike a\nbalance between efficiency and quality. To this end, we investigate an\nimplicit-explicit approach based on conventional volume rendering to enhance\nthe reconstruction quality and accelerate the training and rendering processes.\nWe adopt an efficient density-based grid representation and reparameterize the\nreflected radiance in our pipeline. Our proposed reflection-aware approach\nachieves a competitive quality-efficiency trade-off compared to competing\nmethods. Based on our experimental results, we propose and discuss hypotheses\nregarding the factors influencing the results of density-based methods for\nreconstructing reflective objects. 
The source code is available at:\nhttps://github.com/gkouros/ref-dvgo\n","authors":["Georgios Kouros","Minye Wu","Sushruth Nagesh","Shubham Shrivastava","Punarjay Chakravarty","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2308.08530v1.pdf","comment":"5 pages, 4 figures, 3 tables, ICCV TRICKY 2023 Workshop"},{"id":"http://arxiv.org/abs/2308.02716v2","updated":"2023-08-16T17:39:15Z","published":"2023-08-04T21:38:29Z","title":"EndoDepthL: Lightweight Endoscopic Monocular Depth Estimation with\n CNN-Transformer","summary":" In this study, we address the key challenges concerning the accuracy and\neffectiveness of depth estimation for endoscopic imaging, with a particular\nemphasis on real-time inference and the impact of light reflections. We propose\na novel lightweight solution named EndoDepthL that integrates Convolutional\nNeural Networks (CNN) and Transformers to predict multi-scale depth maps. Our\napproach includes optimizing the network architecture, incorporating\nmulti-scale dilated convolution, and a multi-channel attention mechanism. We\nalso introduce a statistical confidence boundary mask to minimize the impact of\nreflective areas. To better evaluate the performance of monocular depth\nestimation in endoscopic imaging, we propose a novel complexity evaluation\nmetric that considers network parameter size, floating-point operations, and\ninference frames per second. We comprehensively evaluate our proposed method\nand compare it with existing baseline solutions. The results demonstrate that\nEndoDepthL ensures depth estimation accuracy with a lightweight structure.\n","authors":["Yangke Li"],"pdf_url":"https://arxiv.org/pdf/2308.02716v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08529v1","updated":"2023-08-16T17:39:15Z","published":"2023-08-16T17:39:15Z","title":"Diagnosing Human-object Interaction Detectors","summary":" Although we have witnessed significant progress in human-object interaction\n(HOI) detection with increasingly high mAP (mean Average Precision), a single\nmAP score is too concise to obtain an informative summary of a model's\nperformance and to understand why one approach is better than another. In this\npaper, we introduce a diagnosis toolbox for analyzing the error sources of the\nexisting HOI detection models. We first conduct holistic investigations in the\npipeline of HOI detection, consisting of human-object pair detection and then\ninteraction classification. We define a set of errors and the oracles to fix\neach of them. By measuring the mAP improvement obtained from fixing an error\nusing its oracle, we can have a detailed analysis of the significance of\ndifferent errors. We then delve into the human-object detection and interaction\nclassification, respectively, and check the model's behavior. For the first\ndetection task, we investigate both recall and precision, measuring the\ncoverage of ground-truth human-object pairs as well as the noisiness level in\nthe detections. For the second classification task, we compute mAP for\ninteraction classification only, without considering the detection scores. We\nalso measure the performance of the models in differentiating human-object\npairs with and without actual interactions using the AP (Average Precision)\nscore. 
Our toolbox is applicable to different methods across different\ndatasets and is available at https://github.com/neu-vi/Diag-HOI.\n","authors":["Fangrui Zhu","Yiming Xie","Weidi Xie","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.08529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08525v1","updated":"2023-08-16T17:26:47Z","published":"2023-08-16T17:26:47Z","title":"Likelihood-Based Text-to-Image Evaluation with Patch-Level Perceptual\n and Semantic Credit Assignment","summary":" Text-to-image synthesis has made encouraging progress and attracted lots of\npublic attention recently. However, popular evaluation metrics in this area,\nlike the Inception Score and Fréchet Inception Distance, incur several issues.\nFirst of all, they cannot explicitly assess the perceptual quality of generated\nimages and poorly reflect the semantic alignment of each text-image pair. Also,\nthey are inefficient and need to sample thousands of images to stabilise their\nevaluation results. In this paper, we propose to evaluate text-to-image\ngeneration performance by directly estimating the likelihood of the generated\nimages using a pre-trained likelihood-based text-to-image generative model,\ni.e., a higher likelihood indicates better perceptual quality and better\ntext-image alignment. To prevent the likelihood from being dominated by the\nnon-crucial parts of the generated image, we propose several new designs to\ndevelop a credit assignment strategy based on the semantic and perceptual\nsignificance of the image patches. In the experiments, we evaluate the proposed\nmetric on multiple popular text-to-image generation models and datasets in\nassessing both the perceptual quality and the text-image alignment. Moreover,\nit can successfully assess the generation ability of these models with as few\nas a hundred samples, making it very efficient in practice.\n","authors":["Qi Chen","Chaorui Deng","Zixiong Huang","Bowen Zhang","Mingkui Tan","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.08525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08520v1","updated":"2023-08-16T17:18:30Z","published":"2023-08-16T17:18:30Z","title":"Painter: Teaching Auto-regressive Language Models to Draw Sketches","summary":" Large language models (LLMs) have made tremendous progress in natural\nlanguage understanding, and they have also been successfully adopted in other\ndomains such as computer vision, robotics, reinforcement learning, etc. In this\nwork, we apply LLMs to image generation tasks by directly generating the\nvirtual brush strokes to paint an image. We present Painter, an LLM that can\nconvert user prompts in text description format to sketches by generating the\ncorresponding brush strokes in an auto-regressive way. We construct Painter\nbased on an off-the-shelf LLM that is pre-trained on a large text corpus, by\nfine-tuning it on the new task while preserving language understanding\ncapabilities. We create a dataset of diverse multi-object sketches paired with\ntextual prompts that covers several object types and tasks. Painter can\ngenerate sketches from text descriptions, remove objects from a canvas, and\ndetect and classify objects in sketches. 
Although this is a first,\npioneering attempt at using LLMs for auto-regressive image generation, the\nresults are very encouraging.\n","authors":["Reza Pourreza","Apratim Bhattacharyya","Sunny Panchal","Mingu Lee","Pulkit Madan","Roland Memisevic"],"pdf_url":"https://arxiv.org/pdf/2308.08520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08518v1","updated":"2023-08-16T17:13:45Z","published":"2023-08-16T17:13:45Z","title":"Exploiting Point-Wise Attention in 6D Object Pose Estimation Based on\n Bidirectional Prediction","summary":" Traditional geometric registration based estimation methods only exploit the\nCAD model implicitly, which leads to their dependence on observation quality\nand their deficiency under occlusion. To address this problem, the paper\nproposes a bidirectional correspondence prediction network with a point-wise\nattention-aware mechanism. This network not only requires the model points to\npredict the correspondence but also explicitly models the geometric\nsimilarities between observations and the model prior. Our key insight is that\nthe correlations between each model point and scene point provide essential\ninformation for learning point-pair matches. To further tackle the correlation\nnoise brought by feature distribution divergence, we design a simple but\neffective pseudo-siamese network to improve feature homogeneity. Experimental\nresults on the public datasets of LineMOD, YCB-Video, and Occ-LineMOD show that\nthe proposed method achieves better performance than other state-of-the-art\nmethods under the same evaluation criteria. Its robustness in estimating poses\nis greatly improved, especially in an environment with severe occlusions.\n","authors":["Yuhao Yang","Jun Wu","Guangjian Zhang","Rong Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.08518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.12196v2","updated":"2023-08-16T17:09:41Z","published":"2022-04-26T10:00:28Z","title":"Adaptive Split-Fusion Transformer","summary":" Neural networks for visual content understanding have recently evolved from\nconvolutional ones (CNNs) to transformers. The prior (CNN) relies on\nsmall-windowed kernels to capture the regional clues, demonstrating solid local\nexpressiveness. On the contrary, the latter (transformer) establishes\nlong-range global connections between localities for holistic learning.\nInspired by this complementary nature, there is a growing interest in designing\nhybrid models to best utilize each technique. Current hybrids merely replace\nconvolutions as simple approximations of linear projection or juxtapose a\nconvolution branch with attention, without concerning the importance of\nlocal/global modeling. To tackle this, we propose a new hybrid named Adaptive\nSplit-Fusion Transformer (ASF-former) to treat convolutional and attention\nbranches differently with adaptive weights. Specifically, an ASF-former encoder\nsplits feature channels equally in half to fit dual-path inputs. Then, the\noutputs of the dual paths are fused with weighting scalars calculated from\nvisual cues. We also design the convolutional path compactly for efficiency\nconcerns. Extensive experiments on standard benchmarks, such as ImageNet-1K,\nCIFAR-10, and CIFAR-100, show that our ASF-former outperforms its CNN and\ntransformer counterparts, as well as hybrid pilots, in terms of accuracy (83.9%\non ImageNet-1K), under similar conditions (12.9G MACs/56.7M Params, without\nlarge-scale pre-training). 
The code is available at:\nhttps://github.com/szx503045266/ASF-former.\n","authors":["Zixuan Su","Hao Zhang","Jingjing Chen","Lei Pang","Chong-Wah Ngo","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2204.12196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08511v1","updated":"2023-08-16T17:07:40Z","published":"2023-08-16T17:07:40Z","title":"Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse\n Problems","summary":" Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial\ntechnologies in the field of medical imaging. Score-based models have proven to\nbe effective in addressing different inverse problems encountered in CT and\nMRI, such as sparse-view CT and fast MRI reconstruction. However, these models\nface challenges in achieving accurate three dimensional (3D) volumetric\nreconstruction. The existing score-based models primarily focus on\nreconstructing two dimensional (2D) data distribution, leading to\ninconsistencies between adjacent slices in the reconstructed 3D volumetric\nimages. To overcome this limitation, we propose a novel two-and-a-half order\nscore-based model (TOSM). During the training phase, our TOSM learns data\ndistributions in 2D space, which reduces the complexity of training compared to\ndirectly working on 3D volumes. However, in the reconstruction phase, the TOSM\nupdates the data distribution in 3D space, utilizing complementary scores along\nthree directions (sagittal, coronal, and transaxial) to achieve a more precise\nreconstruction. The development of TOSM is built on robust theoretical\nprinciples, ensuring its reliability and efficacy. Through extensive\nexperimentation on large-scale sparse-view CT and fast MRI datasets, our method\ndemonstrates remarkable advancements and attains state-of-the-art results in\nsolving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively\naddresses the inter-slice inconsistency issue, resulting in high-quality 3D\nvolumetric reconstruction.\n","authors":["Zirong Li","Yanyang Wang","Jianjia Zhang","Weiwen Wu","Hengyong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08504v1","updated":"2023-08-16T16:58:25Z","published":"2023-08-16T16:58:25Z","title":"ResBuilder: Automated Learning of Depth with Residual Structures","summary":" In this work, we develop a neural architecture search algorithm, termed\nResbuilder, that develops ResNet architectures from scratch that achieve high\naccuracy at moderate computational cost. It can also be used to modify existing\narchitectures and has the capability to remove and insert ResNet blocks, in\nthis way searching for suitable architectures in the space of ResNet\narchitectures. In our experiments on different image classification datasets,\nResbuilder achieves close to state-of-the-art performance while saving\ncomputational cost compared to off-the-shelf ResNets. Noteworthy, we once tune\nthe parameters on CIFAR10 which yields a suitable default choice for all other\ndatasets. 
We demonstrate that this property generalizes even to industrial\napplications by applying our method with default parameters on a proprietary\nfraud detection dataset.\n","authors":["Julian Burghoff","Matthias Rottmann","Jill von Conta","Sebastian Schoenen","Andreas Witte","Hanno Gottschalk"],"pdf_url":"https://arxiv.org/pdf/2308.08504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08495v1","updated":"2023-08-16T16:49:50Z","published":"2023-08-16T16:49:50Z","title":"Self-Supervised Online Camera Calibration for Automated Driving and\n Parking Applications","summary":" Camera-based perception systems play a central role in modern autonomous\nvehicles. These camera based perception algorithms require an accurate\ncalibration to map the real world distances to image pixels. In practice,\ncalibration is a laborious procedure requiring specialised data collection and\ncareful tuning. This process must be repeated whenever the parameters of the\ncamera change, which can be a frequent occurrence in autonomous vehicles. Hence\nthere is a need to calibrate at regular intervals to ensure the camera is\naccurate. Proposed is a deep learning framework to learn intrinsic and\nextrinsic calibration of the camera in real time. The framework is\nself-supervised and doesn't require any labelling or supervision to learn the\ncalibration parameters. The framework learns calibration without the need for\nany physical targets or to drive the car on special planar surfaces.\n","authors":["Ciarán Hogan","Ganesh Sistu","Ciarán Eising"],"pdf_url":"https://arxiv.org/pdf/2308.08495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08479v1","updated":"2023-08-16T16:37:02Z","published":"2023-08-16T16:37:02Z","title":"DeDoDe: Detect, Don't Describe -- Describe, Don't Detect for Local\n Feature Matching","summary":" Keypoint detection is a pivotal step in 3D reconstruction, whereby sets of\n(up to) K points are detected in each view of a scene. Crucially, the detected\npoints need to be consistent between views, i.e., correspond to the same 3D\npoint in the scene. One of the main challenges with keypoint detection is the\nformulation of the learning objective. Previous learning-based methods\ntypically jointly learn descriptors with keypoints, and treat the keypoint\ndetection as a binary classification task on mutual nearest neighbours.\nHowever, basing keypoint detection on descriptor nearest neighbours is a proxy\ntask, which is not guaranteed to produce 3D-consistent keypoints. Furthermore,\nthis ties the keypoints to a specific descriptor, complicating downstream\nusage. In this work, we instead learn keypoints directly from 3D consistency.\nTo this end, we train the detector to detect tracks from large-scale SfM. As\nthese points are often overly sparse, we derive a semi-supervised two-view\ndetection objective to expand this set to a desired number of detections. To\ntrain a descriptor, we maximize the mutual nearest neighbour objective over the\nkeypoints with a separate network. Results show that our approach, DeDoDe,\nachieves significant gains on multiple geometry benchmarks. 
Code is provided at\nhttps://github.com/Parskatt/DeDoDe.\n","authors":["Johan Edstedt","Georg Bökman","Mårten Wadenbäck","Michael Felsberg"],"pdf_url":"https://arxiv.org/pdf/2308.08479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08476v1","updated":"2023-08-16T16:31:36Z","published":"2023-08-16T16:31:36Z","title":"Classification Committee for Active Deep Object Detection","summary":" In object detection, the cost of labeling is very high because it requires\nnot only confirming the categories of multiple objects in an image but also\naccurately determining the bounding box of each object. Thus, integrating\nactive learning into object detection is of great practical significance. In\nthis paper, we propose a classification-committee-based active deep object\ndetection method, introducing a discrepancy mechanism among multiple\nclassifiers for sample selection when training object detectors. The model\ncontains a main detector and a classification committee. The main detector\ndenotes the target object detector trained from a labeled pool composed of the\nselected informative images. The role of the classification committee is to\nselect the most informative images according to their uncertainty values from\nthe view of classification, which is expected to focus more on the discrepancy\nand representativeness of instances. Specifically, the committee computes the\nuncertainty for a specified instance within the image by measuring the\ndiscrepancy of the outputs of the committee, which is pre-trained via the\nproposed Maximum Classifiers Discrepancy Group Loss (MCDGL). The most\ninformative images are finally determined by selecting the ones with many\nhigh-uncertainty instances. Besides, to mitigate the impact of interference\ninstances, we design a Focus on Positive Instances Loss (FPIL) to give the\ncommittee the ability to automatically focus on the representative instances\nas well as precisely encode their discrepancies for the same instance.\nExperiments are conducted on the Pascal VOC and COCO datasets with several\npopular object detectors, and the results show that our method outperforms\nstate-of-the-art active learning methods, which verifies the effectiveness of\nthe proposed method.\n","authors":["Lei Zhao","Bo Li","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2308.08476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08465v1","updated":"2023-08-16T16:09:23Z","published":"2023-08-16T16:09:23Z","title":"Hierarchical Uncertainty Estimation for Medical Image Segmentation\n Networks","summary":" Learning a medical image segmentation model is an inherently ambiguous task,\nas uncertainties exist in both images (noise) and manual annotations (human\nerrors and bias) used for model training. To build a trustworthy image\nsegmentation model, it is important to not just evaluate its performance but\nalso estimate the uncertainty of the model prediction. Most state-of-the-art\nimage segmentation networks adopt a hierarchical encoder architecture,\nextracting image features at multiple resolution levels from fine to coarse. In\nthis work, we leverage this hierarchical image representation and propose a\nsimple yet effective method for estimating uncertainties at multiple levels.\nThe multi-level uncertainties are modelled via the skip-connection module and\nthen sampled to generate an uncertainty map for the predicted image\nsegmentation. 
We demonstrate that a deep learning segmentation network such as\nU-net, when implemented with such hierarchical uncertainty estimation module,\ncan achieve a high segmentation performance, while at the same time provide\nmeaningful uncertainty maps that can be used for out-of-distribution detection.\n","authors":["Xinyu Bai","Wenjia Bai"],"pdf_url":"https://arxiv.org/pdf/2308.08465v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.08463v1","updated":"2023-08-16T16:08:22Z","published":"2023-08-16T16:08:22Z","title":"Learning to Distill Global Representation for Sparse-View CT","summary":" Sparse-view computed tomography (CT) -- using a small number of projections\nfor tomographic reconstruction -- enables much lower radiation dose to patients\nand accelerated data acquisition. The reconstructed images, however, suffer\nfrom strong artifacts, greatly limiting their diagnostic value. Current trends\nfor sparse-view CT turn to the raw data for better information recovery. The\nresultant dual-domain methods, nonetheless, suffer from secondary artifacts,\nespecially in ultra-sparse view scenarios, and their generalization to other\nscanners/protocols is greatly limited. A crucial question arises: have the\nimage post-processing methods reached the limit? Our answer is not yet. In this\npaper, we stick to image post-processing methods due to great flexibility and\npropose global representation (GloRe) distillation framework for sparse-view\nCT, termed GloReDi. First, we propose to learn GloRe with Fourier convolution,\nso each element in GloRe has an image-wide receptive field. Second, unlike\nmethods that only use the full-view images for supervision, we propose to\ndistill GloRe from intermediate-view reconstructed images that are readily\navailable but not explored in previous literature. The success of GloRe\ndistillation is attributed to two key components: representation directional\ndistillation to align the GloRe directions, and band-pass-specific contrastive\ndistillation to gain clinically important details. Extensive experiments\ndemonstrate the superiority of the proposed GloReDi over the state-of-the-art\nmethods, including dual-domain ones. The source code is available at\nhttps://github.com/longzilicart/GloReDi.\n","authors":["Zilong Li","Chenglong Ma","Jie Chen","Junping Zhang","Hongming shan"],"pdf_url":"https://arxiv.org/pdf/2308.08463v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2304.01752v2","updated":"2023-08-16T15:54:54Z","published":"2023-04-04T12:42:29Z","title":"Black Box Few-Shot Adaptation for Vision-Language models","summary":" Vision-Language (V-L) models trained with contrastive learning to align the\nvisual and language modalities have been shown to be strong few-shot learners.\nSoft prompt learning is the method of choice for few-shot downstream adaption\naiming to bridge the modality gap caused by the distribution shift induced by\nthe new domain. While parameter-efficient, prompt learning still requires\naccess to the model weights and can be computationally infeasible for large\nmodels with billions of parameters. 
To address these shortcomings, in this\nwork, we describe a black-box method for V-L few-shot adaptation that (a)\noperates on pre-computed image and text features and hence works without access\nto the model's weights, (b) it is orders of magnitude faster at training time,\n(c) it is amenable to both supervised and unsupervised training, and (d) it can\nbe even used to align image and text features computed from uni-modal models.\nTo achieve this, we propose Linear Feature Alignment (LFA), a simple linear\napproach for V-L re-alignment in the target domain. LFA is initialized from a\nclosed-form solution to a least-squares problem and then it is iteratively\nupdated by minimizing a re-ranking loss. Despite its simplicity, our approach\ncan even surpass soft-prompt learning methods as shown by extensive experiments\non 11 image and 2 video datasets.\n","authors":["Yassine Ouali","Adrian Bulat","Brais Martinez","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2304.01752v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08443v1","updated":"2023-08-16T15:51:05Z","published":"2023-08-16T15:51:05Z","title":"High-Fidelity Lake Extraction via Two-Stage Prompt Enhancement:\n Establishing a Novel Baseline and Benchmark","summary":" The extraction of lakes from remote sensing images is a complex challenge due\nto the varied lake shapes and data noise. Current methods rely on multispectral\nimage datasets, making it challenging to learn lake features accurately from\npixel arrangements. This, in turn, affects model learning and the creation of\naccurate segmentation masks. This paper introduces a unified prompt-based\ndataset construction approach that provides approximate lake locations using\npoint, box, and mask prompts. We also propose a two-stage prompt enhancement\nframework, LEPrompter, which involves prompt-based and prompt-free stages\nduring training. The prompt-based stage employs a prompt encoder to extract\nprior information, integrating prompt tokens and image embeddings through self-\nand cross-attention in the prompt decoder. Prompts are deactivated once the\nmodel is trained to ensure independence during inference, enabling automated\nlake extraction. Evaluations on Surface Water and Qinghai-Tibet Plateau Lake\ndatasets show consistent performance improvements compared to the previous\nstate-of-the-art method. LEPrompter achieves mIoU scores of 91.48% and 97.43%\non the respective datasets without introducing additional parameters or GFLOPs.\nSupplementary materials provide the source code, pre-trained models, and\ndetailed user studies.\n","authors":["Ben Chen","Xuechao Zou","Kai Li","Yu Zhang","Junliang Xing","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2308.08443v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2009.07140v2","updated":"2023-08-16T15:40:23Z","published":"2020-09-15T14:51:10Z","title":"HGCN-GJS: Hierarchical Graph Convolutional Network with Groupwise Joint\n Sampling for Trajectory Prediction","summary":" Accurate pedestrian trajectory prediction is of great importance for\ndownstream tasks such as autonomous driving and mobile robot navigation. Fully\ninvestigating the social interactions within the crowd is crucial for accurate\npedestrian trajectory prediction. However, most existing methods do not capture\ngroup level interactions well, focusing only on pairwise interactions and\nneglecting group-wise interactions. 
In this work, we propose a hierarchical\ngraph convolutional network, HGCN-GJS, for trajectory prediction which well\nleverages group level interactions within the crowd. Furthermore, we introduce\na novel joint sampling scheme for modeling the joint distribution of multiple\npedestrians in the future trajectories. Based on the group information, this\nscheme associates the trajectory of one person with the trajectory of other\npeople in the group, but maintains the independence of the trajectories of\noutsiders. We demonstrate the performance of our network on several trajectory\nprediction datasets, achieving state-of-the-art results on all datasets\nconsidered.\n","authors":["Yuying Chen","Congcong Liu","Xiaodong Mei","Bertram E. Shi","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2009.07140v2.pdf","comment":"8 pages, 8 figures, accepted by IROS 2022"},{"id":"http://arxiv.org/abs/2308.08431v1","updated":"2023-08-16T15:23:14Z","published":"2023-08-16T15:23:14Z","title":"Integrating Visual and Semantic Similarity Using Hierarchies for Image\n Retrieval","summary":" Most of the research in content-based image retrieval (CBIR) focus on\ndeveloping robust feature representations that can effectively retrieve\ninstances from a database of images that are visually similar to a query.\nHowever, the retrieved images sometimes contain results that are not\nsemantically related to the query. To address this, we propose a method for\nCBIR that captures both visual and semantic similarity using a visual\nhierarchy. The hierarchy is constructed by merging classes with overlapping\nfeatures in the latent space of a deep neural network trained for\nclassification, assuming that overlapping classes share high visual and\nsemantic similarities. Finally, the constructed hierarchy is integrated into\nthe distance calculation metric for similarity search. Experiments on standard\ndatasets: CUB-200-2011 and CIFAR100, and a real-life use case using diatom\nmicroscopy images show that our method achieves superior performance compared\nto the existing methods on image retrieval.\n","authors":["Aishwarya Venkataramanan","Martin Laviale","Cédric Pradalier"],"pdf_url":"https://arxiv.org/pdf/2308.08431v1.pdf","comment":"Accepted in ICVS 2023"},{"id":"http://arxiv.org/abs/2308.08428v1","updated":"2023-08-16T15:19:52Z","published":"2023-08-16T15:19:52Z","title":"ALIP: Adaptive Language-Image Pre-training with Synthetic Caption","summary":" Contrastive Language-Image Pre-training (CLIP) has significantly boosted the\nperformance of various vision-language tasks by scaling up the dataset with\nimage-text pairs collected from the web. However, the presence of intrinsic\nnoise and unmatched image-text pairs in web data can potentially affect the\nperformance of representation learning. To address this issue, we first utilize\nthe OFA model to generate synthetic captions that focus on the image content.\nThe generated captions contain complementary information that is beneficial for\npre-training. Then, we propose an Adaptive Language-Image Pre-training (ALIP),\na bi-path model that integrates supervision from both raw text and synthetic\ncaption. As the core components of ALIP, the Language Consistency Gate (LCG)\nand Description Consistency Gate (DCG) dynamically adjust the weights of\nsamples and image-text/caption pairs during the training process. Meanwhile,\nthe adaptive contrastive loss can effectively reduce the impact of noise data\nand enhances the efficiency of pre-training data. 
We validate ALIP with\nexperiments on different scales of models and pre-training datasets.\nExperiments results show that ALIP achieves state-of-the-art performance on\nmultiple downstream tasks including zero-shot image-text retrieval and linear\nprobe. To facilitate future research, the code and pre-trained models are\nreleased at https://github.com/deepglint/ALIP.\n","authors":["Kaicheng Yang","Jiankang Deng","Xiang An","Jiawei Li","Ziyong Feng","Jia Guo","Jing Yang","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.08428v1.pdf","comment":"15pages, 10figures, ICCV2023"},{"id":"http://arxiv.org/abs/2307.12502v2","updated":"2023-08-16T15:19:49Z","published":"2023-07-24T03:27:41Z","title":"Cross Contrasting Feature Perturbation for Domain Generalization","summary":" Domain generalization (DG) aims to learn a robust model from source domains\nthat generalize well on unseen target domains. Recent studies focus on\ngenerating novel domain samples or features to diversify distributions\ncomplementary to source domains. Yet, these approaches can hardly deal with the\nrestriction that the samples synthesized from various domains can cause\nsemantic distortion. In this paper, we propose an online one-stage Cross\nContrasting Feature Perturbation (CCFP) framework to simulate domain shift by\ngenerating perturbed features in the latent space while regularizing the model\nprediction against domain shift. Different from the previous fixed synthesizing\nstrategy, we design modules with learnable feature perturbations and semantic\nconsistency constraints. In contrast to prior work, our method does not use any\ngenerative-based models or domain labels. We conduct extensive experiments on a\nstandard DomainBed benchmark with a strict evaluation protocol for a fair\ncomparison. Comprehensive experiments show that our method outperforms the\nprevious state-of-the-art, and quantitative analyses illustrate that our\napproach can alleviate the domain shift problem in out-of-distribution (OOD)\nscenarios.\n","authors":["Chenming Li","Daoan Zhang","Wenjian Huang","Jianguo Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12502v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09703v3","updated":"2023-08-16T15:16:43Z","published":"2022-11-17T17:38:55Z","title":"EfficientTrain: Exploring Generalized Curriculum Learning for Training\n Visual Backbones","summary":" The superior performance of modern deep networks usually comes with a costly\ntraining procedure. This paper presents a new curriculum learning approach for\nthe efficient training of visual backbones (e.g., vision Transformers). Our\nwork is inspired by the inherent learning dynamics of deep networks: we\nexperimentally show that at an earlier training stage, the model mainly learns\nto recognize some 'easier-to-learn' discriminative patterns within each\nexample, e.g., the lower-frequency components of images and the original\ninformation before data augmentation. Driven by this phenomenon, we propose a\ncurriculum where the model always leverages all the training data at each\nepoch, while the curriculum starts with only exposing the 'easier-to-learn'\npatterns of each example, and introduces gradually more difficult patterns. 
To\nimplement this idea, we 1) introduce a cropping operation in the Fourier\nspectrum of the inputs, which enables the model to learn from only the\nlower-frequency components efficiently, 2) demonstrate that exposing the\nfeatures of original images amounts to adopting weaker data augmentation, and\n3) integrate 1) and 2) and design a curriculum learning schedule with a\ngreedy-search algorithm. The resulting approach, EfficientTrain, is simple,\ngeneral, yet surprisingly effective. As an off-the-shelf method, it reduces the\nwall-time training cost of a wide variety of popular models (e.g., ResNet,\nConvNeXt, DeiT, PVT, Swin, and CSWin) by >1.5x on ImageNet-1K/22K without\nsacrificing accuracy. It is also effective for self-supervised learning (e.g.,\nMAE). Code is available at https://github.com/LeapLabTHU/EfficientTrain.\n","authors":["Yulin Wang","Yang Yue","Rui Lu","Tianjiao Liu","Zhao Zhong","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2211.09703v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08414v1","updated":"2023-08-16T15:00:50Z","published":"2023-08-16T15:00:50Z","title":"Tem-adapter: Adapting Image-Text Pretraining for Video Question Answer","summary":" Video-language pre-trained models have shown remarkable success in guiding\nvideo question-answering (VideoQA) tasks. However, due to the length of video\nsequences, training large-scale video-based models incurs considerably higher\ncosts than training image-based ones. This motivates us to leverage the\nknowledge from image-based pretraining, despite the obvious gaps between image\nand video domains. To bridge these gaps, in this paper, we propose Tem-Adapter,\nwhich enables the learning of temporal dynamics and complex semantics by a\nvisual Temporal Aligner and a textual Semantic Aligner. Unlike conventional\npretrained knowledge adaptation methods that only concentrate on the downstream\ntask objective, the Temporal Aligner introduces an extra language-guided\nautoregressive task aimed at facilitating the learning of temporal\ndependencies, with the objective of predicting future states based on\nhistorical clues and language guidance that describes event progression.\nBesides, to reduce the semantic gap and adapt the textual representation for\nbetter event description, we introduce a Semantic Aligner that first designs a\ntemplate to fuse question and answer pairs as event descriptions and then\nlearns a Transformer decoder with the whole video sequence as guidance for\nrefinement. We evaluate Tem-Adapter and different pre-train transferring\nmethods on two VideoQA benchmarks, and the significant performance improvement\ndemonstrates the effectiveness of our method.\n","authors":["Guangyi Chen","Xiao Liu","Guangrun Wang","Kun Zhang","Philip H. S. Torr","Xiao-Ping Zhang","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2308.08414v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2303.09472v3","updated":"2023-08-16T14:36:41Z","published":"2023-03-16T16:47:14Z","title":"DiffIR: Efficient Diffusion Model for Image Restoration","summary":" Diffusion model (DM) has achieved SOTA performance by modeling the image\nsynthesis process into a sequential application of a denoising network.\nHowever, different from image synthesis, image restoration (IR) has a strong\nconstraint to generate results in accordance with ground-truth. Thus, for IR,\ntraditional DMs running massive iterations on a large model to estimate whole\nimages or feature maps is inefficient. 
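The "cropping operation in the Fourier spectrum" from the EfficientTrain abstract above can be read as: transform the image to the frequency domain, keep only a centred low-frequency window, and transform back, so early epochs only see lower-frequency content. A small NumPy sketch of such a low-pass operation follows; EfficientTrain actually crops the spectrum to a smaller image, whereas the masked version here keeps the original resolution for simplicity, and the cutoff schedule is an illustrative assumption.

```python
import numpy as np

def low_frequency_crop(image: np.ndarray, keep: int) -> np.ndarray:
    """Keep only a centred (2*keep x 2*keep) window of the 2D spectrum.

    `image` is a (H, W) array; small `keep` yields the blurrier,
    'easier-to-learn' version of the input.
    """
    h, w = image.shape
    spectrum = np.fft.fftshift(np.fft.fft2(image))
    mask = np.zeros_like(spectrum)
    cy, cx = h // 2, w // 2
    mask[cy - keep:cy + keep, cx - keep:cx + keep] = 1.0
    filtered = np.fft.ifft2(np.fft.ifftshift(spectrum * mask))
    return filtered.real

if __name__ == "__main__":
    img = np.random.rand(224, 224)
    for keep in (16, 64, 112):   # toy curriculum from coarse to full detail
        out = low_frequency_crop(img, keep)
        print(keep, out.shape, float(np.abs(out - img).mean()))
```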
To address this issue, we propose an\nefficient DM for IR (DiffIR), which consists of a compact IR prior extraction\nnetwork (CPEN), dynamic IR transformer (DIRformer), and denoising network.\nSpecifically, DiffIR has two training stages: pretraining and training DM. In\npretraining, we input ground-truth images into CPEN$_{S1}$ to capture a compact\nIR prior representation (IPR) to guide DIRformer. In the second stage, we train\nthe DM to directly estimate the same IPR as pretrained CPEN$_{S1}$ only using\nLQ images. We observe that since the IPR is only a compact vector, DiffIR can\nuse fewer iterations than traditional DM to obtain accurate estimations and\ngenerate more stable and realistic results. Since the iterations are few, our\nDiffIR can adopt a joint optimization of CPEN$_{S2}$, DIRformer, and denoising\nnetwork, which can further reduce the estimation error influence. We conduct\nextensive experiments on several IR tasks and achieve SOTA performance while\nconsuming lower computational costs. Code is available at\n\url{https://github.com/Zj-BinXia/DiffIR}.\n","authors":["Bin Xia","Yulun Zhang","Shiyin Wang","Yitong Wang","Xinglong Wu","Yapeng Tian","Wenming Yang","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.09472v3.pdf","comment":"This paper is accepted by ICCV2023. Codes and models are available at\n https://github.com/Zj-BinXia/DiffIR"},{"id":"http://arxiv.org/abs/2308.08396v1","updated":"2023-08-16T14:28:36Z","published":"2023-08-16T14:28:36Z","title":"Prediction of post-radiotherapy recurrence volumes in head and neck\n squamous cell carcinoma using 3D U-Net segmentation","summary":" Locoregional recurrences (LRR) are still a frequent site of treatment failure\nfor head and neck squamous cell carcinoma (HNSCC) patients.\n Identification of high risk subvolumes based on pretreatment imaging is key\nto biologically targeted radiation therapy. We investigated the extent to which\na Convolutional neural network (CNN) is able to predict LRR volumes based on\npre-treatment 18F-fluorodeoxyglucose positron emission tomography\n(FDG-PET)/computed tomography (CT) scans in HNSCC patients and thus the\npotential to identify biological high risk volumes using CNNs.\n For 37 patients who had undergone primary radiotherapy for oropharyngeal\nsquamous cell carcinoma, five oncologists contoured the relapse volumes on\nrecurrence CT scans. Datasets of pre-treatment FDG-PET/CT, gross tumour volume\n(GTV) and contoured relapse for each of the patients were randomly divided into\ntraining (n=23), validation (n=7) and test (n=7) datasets. We compared a CNN\ntrained from scratch, a pre-trained CNN, a SUVmax threshold approach, and using\nthe GTV directly.\n The SUVmax threshold method included 5 out of the 7 relapse origin points\nwithin a volume of median 4.6 cubic centimetres (cc). Both the GTV contour and\nbest CNN segmentations included the relapse origin 6 out of 7 times with median\nvolumes of 28 and 18 cc respectively.\n The CNN included the same or greater number of relapse volume POs, with\nsignificantly smaller relapse volumes. 
Our novel findings indicate that CNNs\nmay predict LRR, yet further work on dataset development is required to attain\nclinically useful prediction accuracy.\n","authors":["Denis Kutnár","Ivan R Vogelius","Katrin Elisabet Håkansson","Jens Petersen","Jeppe Friborg","Lena Specht","Mogens Bernsdorf","Anita Gothelf","Claus Kristensen","Abraham George Smith"],"pdf_url":"https://arxiv.org/pdf/2308.08396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08393v1","updated":"2023-08-16T14:25:30Z","published":"2023-08-16T14:25:30Z","title":"SIGMA: Scale-Invariant Global Sparse Shape Matching","summary":" We propose a novel mixed-integer programming (MIP) formulation for generating\nprecise sparse correspondences for highly non-rigid shapes. To this end, we\nintroduce a projected Laplace-Beltrami operator (PLBO) which combines intrinsic\nand extrinsic geometric information to measure the deformation quality induced\nby predicted correspondences. We integrate the PLBO, together with an\norientation-aware regulariser, into a novel MIP formulation that can be solved\nto global optimality for many practical problems. In contrast to previous\nmethods, our approach is provably invariant to rigid transformations and global\nscaling, initialisation-free, has optimality guarantees, and scales to high\nresolution meshes with (empirically observed) linear time. We show\nstate-of-the-art results for sparse non-rigid matching on several challenging\n3D datasets, including data with inconsistent meshing, as well as applications\nin mesh-to-point-cloud matching.\n","authors":["Maolin Gao","Paul Roetzer","Marvin Eisenberger","Zorah Lähner","Michael Moeller","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2308.08393v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2303.09219v2","updated":"2023-08-16T14:12:42Z","published":"2023-03-16T10:48:59Z","title":"MixCycle: Mixup Assisted Semi-Supervised 3D Single Object Tracking with\n Cycle Consistency","summary":" 3D single object tracking (SOT) is an indispensable part of automated\ndriving. Existing approaches rely heavily on large, densely labeled datasets.\nHowever, annotating point clouds is both costly and time-consuming. Inspired by\nthe great success of cycle tracking in unsupervised 2D SOT, we introduce the\nfirst semi-supervised approach to 3D SOT. Specifically, we introduce two\ncycle-consistency strategies for supervision: 1) Self tracking cycles, which\nleverage labels to help the model converge better in the early stages of\ntraining; 2) forward-backward cycles, which strengthen the tracker's robustness\nto motion variations and the template noise caused by the template update\nstrategy. Furthermore, we propose a data augmentation strategy named SOTMixup\nto improve the tracker's robustness to point cloud diversity. SOTMixup\ngenerates training samples by sampling points in two point clouds with a mixing\nrate and assigns a reasonable loss weight for training according to the mixing\nrate. The resulting MixCycle approach generalizes to appearance matching-based\ntrackers. On the KITTI benchmark, based on the P2B tracker, MixCycle trained\nwith $\\textbf{10\\%}$ labels outperforms P2B trained with $\\textbf{100\\%}$\nlabels, and achieves a $\\textbf{28.4\\%}$ precision improvement when using\n$\\textbf{1\\%}$ labels. 
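SOTMixup, as summarised in the MixCycle abstract above, mixes two point clouds with a sampled mixing rate and weights the training loss by that rate. The NumPy sketch below is one plausible reading of that augmentation; the Beta sampling distribution and the way the two clouds are paired are assumptions, not the paper's exact recipe.

```python
import numpy as np

rng = np.random.default_rng(0)

def sot_mixup(cloud_a: np.ndarray, cloud_b: np.ndarray, n_points: int,
              alpha: float = 1.0):
    """Mix two (N, 3) point clouds into one training sample.

    A mixing rate lam decides how many points are drawn from each cloud;
    the same rate is returned as the loss weight for cloud_a's target
    (1 - lam weights cloud_b's target).
    """
    lam = rng.beta(alpha, alpha)
    n_a = int(round(lam * n_points))
    n_b = n_points - n_a
    idx_a = rng.choice(len(cloud_a), size=n_a, replace=n_a > len(cloud_a))
    idx_b = rng.choice(len(cloud_b), size=n_b, replace=n_b > len(cloud_b))
    mixed = np.concatenate([cloud_a[idx_a], cloud_b[idx_b]], axis=0)
    return mixed, lam

if __name__ == "__main__":
    a = rng.normal(size=(1024, 3))
    b = rng.normal(loc=2.0, size=(1024, 3))
    sample, lam = sot_mixup(a, b, n_points=512)
    print(sample.shape, round(lam, 3))
    # loss = lam * loss_on_target_a + (1 - lam) * loss_on_target_b
```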
Our code will be released at\n\\url{https://github.com/Mumuqiao/MixCycle}.\n","authors":["Qiao Wu","Jiaqi Yang","Kun Sun","Chu'ai Zhang","Yanning Zhang","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2303.09219v2.pdf","comment":"Accepted by ICCV23"},{"id":"http://arxiv.org/abs/2308.08380v1","updated":"2023-08-16T14:09:39Z","published":"2023-08-16T14:09:39Z","title":"Robust Autonomous Vehicle Pursuit without Expert Steering Labels","summary":" In this work, we present a learning method for lateral and longitudinal\nmotion control of an ego-vehicle for vehicle pursuit. The car being controlled\ndoes not have a pre-defined route, rather it reactively adapts to follow a\ntarget vehicle while maintaining a safety distance. To train our model, we do\nnot rely on steering labels recorded from an expert driver but effectively\nleverage a classical controller as an offline label generation tool. In\naddition, we account for the errors in the predicted control values, which can\nlead to a loss of tracking and catastrophic crashes of the controlled vehicle.\nTo this end, we propose an effective data augmentation approach, which allows\nto train a network capable of handling different views of the target vehicle.\nDuring the pursuit, the target vehicle is firstly localized using a\nConvolutional Neural Network. The network takes a single RGB image along with\ncars' velocities and estimates the target vehicle's pose with respect to the\nego-vehicle. This information is then fed to a Multi-Layer Perceptron, which\nregresses the control commands for the ego-vehicle, namely throttle and\nsteering angle. We extensively validate our approach using the CARLA simulator\non a wide range of terrains. Our method demonstrates real-time performance and\nrobustness to different scenarios including unseen trajectories and high route\ncompletion. The project page containing code and multimedia can be publicly\naccessed here: https://changyaozhou.github.io/Autonomous-Vehicle-Pursuit/.\n","authors":["Jiaxin Pan","Changyao Zhou","Mariia Gladkova","Qadeer Khan","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2308.08380v1.pdf","comment":"9 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.08376v1","updated":"2023-08-16T13:59:43Z","published":"2023-08-16T13:59:43Z","title":"Automated Semiconductor Defect Inspection in Scanning Electron\n Microscope Images: a Systematic Review","summary":" A growing need exists for efficient and accurate methods for detecting\ndefects in semiconductor materials and devices. These defects can have a\ndetrimental impact on the efficiency of the manufacturing process, because they\ncause critical failures and wafer-yield limitations. As nodes and patterns get\nsmaller, even high-resolution imaging techniques such as Scanning Electron\nMicroscopy (SEM) produce noisy images due to operating close to sensitivity\nlevels and due to varying physical properties of different underlayers or\nresist materials. This inherent noise is one of the main challenges for defect\ninspection. One promising approach is the use of machine learning algorithms,\nwhich can be trained to accurately classify and locate defects in semiconductor\nsamples. Recently, convolutional neural networks have proved to be particularly\nuseful in this regard. This systematic review provides a comprehensive overview\nof the state of automated semiconductor defect inspection on SEM images,\nincluding the most recent innovations and developments. 
38 publications were\nselected on this topic, indexed in IEEE Xplore and SPIE databases. For each of\nthese, the application, methodology, dataset, results, limitations and future\nwork were summarized. A comprehensive overview and analysis of their methods is\nprovided. Finally, promising avenues for future work in the field of SEM-based\ndefect inspection are suggested.\n","authors":["Thibault Lechien","Enrique Dehaerne","Bappaditya Dey","Victor Blanco","Stefan De Gendt","Wannes Meert"],"pdf_url":"https://arxiv.org/pdf/2308.08376v1.pdf","comment":"16 pages, 12 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.08370v1","updated":"2023-08-16T13:48:02Z","published":"2023-08-16T13:48:02Z","title":"Agglomerative Transformer for Human-Object Interaction Detection","summary":" We propose an agglomerative Transformer (AGER) that enables Transformer-based\nhuman-object interaction (HOI) detectors to flexibly exploit extra\ninstance-level cues in a single-stage and end-to-end manner for the first time.\nAGER acquires instance tokens by dynamically clustering patch tokens and\naligning cluster centers to instances with textual guidance, thus enjoying two\nbenefits: 1) Integrality: each instance token is encouraged to contain all\ndiscriminative feature regions of an instance, which demonstrates a significant\nimprovement in the extraction of different instance-level cues and subsequently\nleads to a new state-of-the-art performance of HOI detection with 36.75 mAP on\nHICO-Det. 2) Efficiency: the dynamical clustering mechanism allows AGER to\ngenerate instance tokens jointly with the feature learning of the Transformer\nencoder, eliminating the need of an additional object detector or instance\ndecoder in prior methods, thus allowing the extraction of desirable extra cues\nfor HOI detection in a single-stage and end-to-end pipeline. Concretely, AGER\nreduces GFLOPs by 8.5% and improves FPS by 36%, even compared to a vanilla\nDETR-like pipeline without extra cue extraction.\n","authors":["Danyang Tu","Wei Sun","Guangtao Zhai","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2308.08370v1.pdf","comment":"Accepted by ICCV'23"},{"id":"http://arxiv.org/abs/2308.08367v1","updated":"2023-08-16T13:41:29Z","published":"2023-08-16T13:41:29Z","title":"Diff-CAPTCHA: An Image-based CAPTCHA with Security Enhanced by Denoising\n Diffusion Model","summary":" To enhance the security of text CAPTCHAs, various methods have been employed,\nsuch as adding the interference lines on the text, randomly distorting the\ncharacters, and overlapping multiple characters. These methods partly increase\nthe difficulty of automated segmentation and recognition attacks. However,\nfacing the rapid development of the end-to-end breaking algorithms, their\nsecurity has been greatly weakened. The diffusion model is a novel image\ngeneration model that can generate the text images with deep fusion of\ncharacters and background images. In this paper, an image-click CAPTCHA scheme\ncalled Diff-CAPTCHA is proposed based on denoising diffusion models. 
The\nbackground image and characters of the CAPTCHA are treated as a whole to guide\nthe generation process of a diffusion model, thus weakening the character\nfeatures available for machine learning, enhancing the diversity of character\nfeatures in the CAPTCHA, and increasing the difficulty of breaking algorithms.\nTo evaluate the security of Diff-CAPTCHA, this paper develops several attack\nmethods, including end-to-end attacks based on Faster R-CNN and two-stage\nattacks, and Diff-CAPTCHA is compared with three baseline schemes, including\ncommercial CAPTCHA scheme and security-enhanced CAPTCHA scheme based on style\ntransfer. The experimental results show that diffusion models can effectively\nenhance CAPTCHA security while maintaining good usability in human testing.\n","authors":["Ran Jiang","Sanfeng Zhang","Linfeng Liu","Yanbing Peng"],"pdf_url":"https://arxiv.org/pdf/2308.08367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08365v1","updated":"2023-08-16T13:40:01Z","published":"2023-08-16T13:40:01Z","title":"DeepContrast: Deep Tissue Contrast Enhancement using Synthetic Data\n Degradations and OOD Model Predictions","summary":" Microscopy images are crucial for life science research, allowing detailed\ninspection and characterization of cellular and tissue-level structures and\nfunctions. However, microscopy data are unavoidably affected by image\ndegradations, such as noise, blur, or others. Many such degradations also\ncontribute to a loss of image contrast, which becomes especially pronounced in\ndeeper regions of thick samples. Today, best performing methods to increase the\nquality of images are based on Deep Learning approaches, which typically\nrequire ground truth (GT) data during training. Our inability to counteract\nblurring and contrast loss when imaging deep into samples prevents the\nacquisition of such clean GT data. The fact that the forward process of\nblurring and contrast loss deep into tissue can be modeled, allowed us to\npropose a new method that can circumvent the problem of unobtainable GT data.\nTo this end, we first synthetically degraded the quality of microscopy images\neven further by using an approximate forward model for deep tissue image\ndegradations. Then we trained a neural network that learned the inverse of this\ndegradation function from our generated pairs of raw and degraded images. We\ndemonstrated that networks trained in this way can be used out-of-distribution\n(OOD) to improve the quality of less severely degraded images, e.g. the raw\ndata imaged in a microscope. Since the absolute level of degradation in such\nmicroscopy images can be stronger than the additional degradation introduced by\nour forward model, we also explored the effect of iterative predictions. Here,\nwe observed that in each iteration the measured image contrast kept improving\nwhile detailed structures in the images got increasingly removed. 
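The DeepContrast training data described above comes from degrading already-degraded raw microscopy images even further with an approximate forward model (blur plus contrast loss), then learning the inverse mapping from the resulting (further degraded, raw) pairs. The helper below sketches such a forward model with SciPy; the blur width, contrast decay, and noise level are illustrative assumptions, not the model fitted in the paper.

```python
import numpy as np
from scipy.ndimage import gaussian_filter

rng = np.random.default_rng(0)

def degrade(image: np.ndarray, sigma: float = 2.0, contrast: float = 0.6,
            noise_std: float = 0.01) -> np.ndarray:
    """Approximate forward model: blur, squeeze contrast, add noise."""
    blurred = gaussian_filter(image, sigma=sigma)
    mean = blurred.mean()
    low_contrast = mean + contrast * (blurred - mean)
    return low_contrast + rng.normal(scale=noise_std, size=image.shape)

def make_training_pairs(raw_stack: np.ndarray):
    """Raw images become the *targets*; their further-degraded versions
    become the *inputs*, so no clean ground truth is ever required."""
    inputs = np.stack([degrade(img) for img in raw_stack])
    targets = raw_stack
    return inputs, targets

if __name__ == "__main__":
    raw = rng.random((4, 128, 128))     # stand-in for raw microscope data
    x, y = make_training_pairs(raw)
    print(x.shape, y.shape)             # (4, 128, 128) (4, 128, 128)
```

A network trained on such pairs can then be applied out-of-distribution to the raw data itself, which is the OOD prediction step the abstract refers to.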
Therefore,\ndependent on the desired downstream analysis, a balance between contrast\nimprovement and retention of image details has to be found.\n","authors":["Nuno Pimpão Martins","Yannis Kalaidzidis","Marino Zerial","Florian Jug"],"pdf_url":"https://arxiv.org/pdf/2308.08365v1.pdf","comment":"8 pages, 7 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.08361v1","updated":"2023-08-16T13:35:09Z","published":"2023-08-16T13:35:09Z","title":"KernelWarehouse: Towards Parameter-Efficient Dynamic Convolution","summary":" Dynamic convolution learns a linear mixture of $n$ static kernels weighted\nwith their sample-dependent attentions, demonstrating superior performance\ncompared to normal convolution. However, existing designs are\nparameter-inefficient: they increase the number of convolutional parameters by\n$n$ times. This and the optimization difficulty lead to no research progress in\ndynamic convolution that can allow us to use a significant large value of $n$\n(e.g., $n>100$ instead of typical setting $n<10$) to push forward the\nperformance boundary. In this paper, we propose $KernelWarehouse$, a more\ngeneral form of dynamic convolution, which can strike a favorable trade-off\nbetween parameter efficiency and representation power. Its key idea is to\nredefine the basic concepts of \"$kernels$\" and \"$assembling$ $kernels$\" in\ndynamic convolution from the perspective of reducing kernel dimension and\nincreasing kernel number significantly. In principle, KernelWarehouse enhances\nconvolutional parameter dependencies within the same layer and across\nsuccessive layers via tactful kernel partition and warehouse sharing, yielding\na high degree of freedom to fit a desired parameter budget. We validate our\nmethod on ImageNet and MS-COCO datasets with different ConvNet architectures,\nand show that it attains state-of-the-art results. For instance, the\nResNet18|ResNet50|MobileNetV2|ConvNeXt-Tiny model trained with KernelWarehouse\non ImageNet reaches 76.05%|81.05%|75.52%|82.51% top-1 accuracy. Thanks to its\nflexible design, KernelWarehouse can even reduce the model size of a ConvNet\nwhile improving the accuracy, e.g., our ResNet18 model with 36.45%|65.10%\nparameter reduction to the baseline shows 2.89%|2.29% absolute improvement to\ntop-1 accuracy.\n","authors":["Chao Li","Anbang Yao"],"pdf_url":"https://arxiv.org/pdf/2308.08361v1.pdf","comment":"This research work was completed and submitted in early May 2023.\n Code and pre-trained models are available at\n https://github.com/OSVAI/KernelWarehouse"},{"id":"http://arxiv.org/abs/2308.08359v1","updated":"2023-08-16T13:32:03Z","published":"2023-08-16T13:32:03Z","title":"Membrane Potential Batch Normalization for Spiking Neural Networks","summary":" As one of the energy-efficient alternatives of conventional neural networks\n(CNNs), spiking neural networks (SNNs) have gained more and more interest\nrecently. To train the deep models, some effective batch normalization (BN)\ntechniques are proposed in SNNs. All these BNs are suggested to be used after\nthe convolution layer as usually doing in CNNs. However, the spiking neuron is\nmuch more complex with the spatio-temporal dynamics. The regulated data flow\nafter the BN layer will be disturbed again by the membrane potential updating\noperation before the firing function, i.e., the nonlinear activation.\nTherefore, we advocate adding another BN layer before the firing function to\nnormalize the membrane potential again, called MPBN. 
To eliminate the induced\ntime cost of MPBN, we also propose a training-inference-decoupled\nre-parameterization technique to fold the trained MPBN into the firing\nthreshold. With the re-parameterization technique, the MPBN will not introduce\nany extra time burden in the inference. Furthermore, the MPBN can also adopt\nthe element-wise form, while these BNs after the convolution layer can only\nuse the channel-wise form. Experimental results show that the proposed MPBN\nperforms well on both popular non-spiking static and neuromorphic datasets. Our\ncode is open-sourced at \href{https://github.com/yfguo91/MPBN}{MPBN}.\n","authors":["Yufei Guo","Yuhan Zhang","Yuanpei Chen","Weihang Peng","Xiaode Liu","Liwen Zhang","Xuhui Huang","Zhe Ma"],"pdf_url":"https://arxiv.org/pdf/2308.08359v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.15029v2","updated":"2023-08-16T13:22:18Z","published":"2023-07-27T17:37:56Z","title":"Adaptive Segmentation Network for Scene Text Detection","summary":" Inspired by deep convolution segmentation algorithms, scene text detectors\nbreak the performance ceiling of datasets steadily. However, these methods\noften encounter threshold selection bottlenecks and have poor performance on\ntext instances with extreme aspect ratios. In this paper, we propose to\nautomatically learn the discriminative segmentation threshold, which\ndistinguishes text pixels from background pixels for segmentation-based scene\ntext detectors and then further reduces the time-consuming manual parameter\nadjustment. Besides, we design a Global-information Enhanced Feature Pyramid\nNetwork (GE-FPN) for capturing text instances with macro size and extreme\naspect ratios. Following the GE-FPN, we introduce a cascade optimization\nstructure to further refine the text instances. Finally, together with the\nproposed threshold learning strategy and text detection structure, we design an\nAdaptive Segmentation Network (ASNet) for scene text detection. Extensive\nexperiments are carried out to demonstrate that the proposed ASNet can achieve\nthe state-of-the-art performance on four text detection benchmarks, i.e., ICDAR\n2015, MSRA-TD500, ICDAR 2017 MLT and CTW1500. The ablation experiments also\nverify the effectiveness of our contributions.\n","authors":["Guiqin Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.15029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15128v2","updated":"2023-08-16T13:15:03Z","published":"2023-07-27T18:04:45Z","title":"End-to-end Remote Sensing Change Detection of Unregistered Bi-temporal\n Images for Natural Disasters","summary":" Change detection based on remote sensing images has been a prominent area of\ninterest in the field of remote sensing. Deep networks have demonstrated\nsignificant success in detecting changes in bi-temporal remote sensing images\nand have found applications in various fields. Given the degradation of natural\nenvironments and the frequent occurrence of natural disasters, accurately and\nswiftly identifying damaged buildings in disaster-stricken areas through remote\nsensing images holds immense significance. This paper aims to investigate\nchange detection specifically for natural disasters. Considering that existing\npublic datasets used in change detection research are registered, which does\nnot align with the practical scenario where bi-temporal images are not matched,\nthis paper introduces an unregistered end-to-end change detection synthetic\ndataset called xBD-E2ECD. 
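Folding a trained membrane-potential BN into the firing threshold, as the MPBN abstract above describes, follows from the firing condition: with per-channel BN parameters (gamma, beta, mu, var), "BN(u) >= theta" can be rewritten as "u >= theta_new" whenever gamma > 0. The sketch below works through that algebra numerically; it illustrates the re-parameterization idea only and is not the released MPBN code.

```python
import numpy as np

rng = np.random.default_rng(0)

# Per-channel BN statistics and affine parameters learned during training.
C = 8
gamma = rng.uniform(0.5, 1.5, C)     # assume gamma > 0, as in the derivation
beta  = rng.normal(0.0, 0.1, C)
mu    = rng.normal(0.0, 0.2, C)
var   = rng.uniform(0.5, 2.0, C)
eps, theta = 1e-5, 1.0               # original firing threshold

# Training-time firing rule:  BN(u) >= theta
def fires_with_bn(u):
    return gamma * (u - mu) / np.sqrt(var + eps) + beta >= theta

# Inference-time rule after folding BN into a per-channel threshold:
#   u >= mu + (theta - beta) * sqrt(var + eps) / gamma
theta_folded = mu + (theta - beta) * np.sqrt(var + eps) / gamma

def fires_folded(u):
    return u >= theta_folded

u = rng.normal(0.0, 1.0, (1000, C))   # random membrane potentials
mismatches = int((fires_with_bn(u) != fires_folded(u)).sum())
print("mismatching spike decisions after folding:", mismatches)  # expect 0
```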
Furthermore, we propose an end-to-end change\ndetection network named E2ECDNet, which takes an unregistered bi-temporal image\npair as input and simultaneously generates the flow field prediction result and\nthe change detection prediction result. It is worth noting that our E2ECDNet\nalso supports change detection for registered image pairs, as registration can\nbe seen as a special case of non-registration. Additionally, this paper\nredefines the criteria for correctly predicting a positive case and introduces\nneighborhood-based change detection evaluation metrics. The experimental\nresults have demonstrated significant improvements.\n","authors":["Guiqin Zhao","Lianlei Shan","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.15128v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08345v1","updated":"2023-08-16T13:10:32Z","published":"2023-08-16T13:10:32Z","title":"GAEI-UNet: Global Attention and Elastic Interaction U-Net for Vessel\n Image Segmentation","summary":" Vessel image segmentation plays a pivotal role in medical diagnostics, aiding\nin the early detection and treatment of vascular diseases. While segmentation\nbased on deep learning has shown promising results, effectively segmenting\nsmall structures and maintaining connectivity between them remains challenging.\nTo address these limitations, we propose GAEI-UNet, a novel model that combines\nglobal attention and elastic interaction-based techniques. GAEI-UNet leverages\nglobal spatial and channel context information to enhance high-level semantic\nunderstanding within the U-Net architecture, enabling precise segmentation of\nsmall vessels. Additionally, we adopt an elastic interaction-based loss\nfunction to improve connectivity among these fine structures. By capturing the\nforces generated by misalignment between target and predicted shapes, our model\neffectively learns to preserve the correct topology of vessel networks.\nEvaluation on retinal vessel dataset -- DRIVE demonstrates the superior\nperformance of GAEI-UNet in terms of SE and connectivity of small structures,\nwithout significantly increasing computational complexity. This research aims\nto advance the field of vessel image segmentation, providing more accurate and\nreliable diagnostic tools for the medical community. The implementation code is\navailable on Code.\n","authors":["Ruiqiang Xiao","Zhuoyue Wan","Yang Xiang"],"pdf_url":"https://arxiv.org/pdf/2308.08345v1.pdf","comment":"BIBM 2023 Under Review"},{"id":"http://arxiv.org/abs/2307.12761v2","updated":"2023-08-16T13:03:33Z","published":"2023-07-24T13:05:36Z","title":"LiDAR Meta Depth Completion","summary":" Depth estimation is one of the essential tasks to be addressed when creating\nmobile autonomous systems. While monocular depth estimation methods have\nimproved in recent times, depth completion provides more accurate and reliable\ndepth maps by additionally using sparse depth information from other sensors\nsuch as LiDAR. However, current methods are specifically trained for a single\nLiDAR sensor. As the scanning pattern differs between sensors, every new sensor\nwould require re-training a specialized depth completion model, which is\ncomputationally inefficient and not flexible. Therefore, we propose to\ndynamically adapt the depth completion model to the used sensor type enabling\nLiDAR adaptive depth completion. 
Specifically, we propose a meta depth\ncompletion network that uses data patterns derived from the data to learn a\ntask network to alter weights of the main depth completion network to solve a\ngiven depth completion task effectively. The method demonstrates a strong\ncapability to work on multiple LiDAR scanning patterns and can also generalize\nto scanning patterns that are unseen during training. While using a single\nmodel, our method yields significantly better results than a non-adaptive\nbaseline trained on different LiDAR patterns. It outperforms LiDAR-specific\nexpert models for very sparse cases. These advantages allow flexible deployment\nof a single depth completion model on different sensors, which could also prove\nvaluable to process the input of nascent LiDAR technology with adaptive instead\nof fixed scanning patterns.\n","authors":["Wolfgang Boettcher","Lukas Hoyer","Ozan Unal","Ke Li","Dengxin Dai"],"pdf_url":"https://arxiv.org/pdf/2307.12761v2.pdf","comment":"Accepted at IROS 2023, v2 has updated author list and fixed a figure\n caption"},{"id":"http://arxiv.org/abs/2308.08339v1","updated":"2023-08-16T13:01:13Z","published":"2023-08-16T13:01:13Z","title":"Denoising Diffusion Probabilistic Model for Retinal Image Generation and\n Segmentation","summary":" Experts use retinal images and vessel trees to detect and diagnose various\neye, blood circulation, and brain-related diseases. However, manual\nsegmentation of retinal images is a time-consuming process that requires high\nexpertise and is difficult due to privacy issues. Many methods have been\nproposed to segment images, but the need for large retinal image datasets\nlimits the performance of these methods. Several methods synthesize deep\nlearning models based on Generative Adversarial Networks (GAN) to generate\nlimited sample varieties. This paper proposes a novel Denoising Diffusion\nProbabilistic Model (DDPM) that outperformed GANs in image synthesis. We\ndeveloped a Retinal Trees (ReTree) dataset consisting of retinal images,\ncorresponding vessel trees, and a segmentation network based on DDPM trained\nwith images from the ReTree dataset. In the first stage, we develop a two-stage\nDDPM that generates vessel trees from random numbers belonging to a standard\nnormal distribution. Later, the model is guided to generate fundus images from\ngiven vessel trees and random distribution. The proposed dataset has been\nevaluated quantitatively and qualitatively. Quantitative evaluation metrics\ninclude Frechet Inception Distance (FID) score, Jaccard similarity coefficient,\nCohen's kappa, Matthew's Correlation Coefficient (MCC), precision, recall,\nF1-score, and accuracy. We trained the vessel segmentation model with synthetic\ndata to validate our dataset's efficiency and tested it on authentic data. Our\ndeveloped dataset and source code is available at\nhttps://github.com/AAleka/retree.\n","authors":["Alnur Alimanov","Md Baharul Islam"],"pdf_url":"https://arxiv.org/pdf/2308.08339v1.pdf","comment":"International Conference on Computational Photography 2023 (ICCP\n 2023)"},{"id":"http://arxiv.org/abs/2308.08333v1","updated":"2023-08-16T12:46:52Z","published":"2023-08-16T12:46:52Z","title":"Improving Depth Gradient Continuity in Transformers: A Comparative Study\n on Monocular Depth Estimation with CNN","summary":" Monocular depth estimation is an ongoing challenge in computer vision. Recent\nprogress with Transformer models has demonstrated notable advantages over\nconventional CNNs in this area. 
However, there's still a gap in understanding\nhow these models prioritize different regions in 2D images and how these\nregions affect depth estimation performance. To explore the differences between\nTransformers and CNNs, we employ a sparse pixel approach to contrastively\nanalyze the distinctions between the two. Our findings suggest that while\nTransformers excel in handling global context and intricate textures, they lag\nbehind CNNs in preserving depth gradient continuity. To further enhance the\nperformance of Transformer models in monocular depth estimation, we propose the\nDepth Gradient Refinement (DGR) module that refines depth estimation through\nhigh-order differentiation, feature fusion, and recalibration. Additionally, we\nleverage optimal transport theory, treating depth maps as spatial probability\ndistributions, and employ the optimal transport distance as a loss function to\noptimize our model. Experimental results demonstrate that models integrated\nwith the plug-and-play Depth Gradient Refinement (DGR) module and the proposed\nloss function enhance performance without increasing complexity and\ncomputational costs. This research not only offers fresh insights into the\ndistinctions between Transformers and CNNs in depth estimation but also paves\nthe way for novel depth estimation methodologies.\n","authors":["Jiawei Yao","Tong Wu","Xiaofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08327v1","updated":"2023-08-16T12:40:47Z","published":"2023-08-16T12:40:47Z","title":"AdaBrowse: Adaptive Video Browser for Efficient Continuous Sign Language\n Recognition","summary":" Raw videos have been proven to own considerable feature redundancy where in\nmany cases only a portion of frames can already meet the requirements for\naccurate recognition. In this paper, we are interested in whether such\nredundancy can be effectively leveraged to facilitate efficient inference in\ncontinuous sign language recognition (CSLR). We propose a novel adaptive model\n(AdaBrowse) to dynamically select a most informative subsequence from input\nvideo sequences by modelling this problem as a sequential decision task. In\nspecific, we first utilize a lightweight network to quickly scan input videos\nto extract coarse features. Then these features are fed into a policy network\nto intelligently select a subsequence to process. The corresponding subsequence\nis finally inferred by a normal CSLR model for sentence prediction. As only a\nportion of frames are processed in this procedure, the total computations can\nbe considerably saved. Besides temporal redundancy, we are also interested in\nwhether the inherent spatial redundancy can be seamlessly integrated together\nto achieve further efficiency, i.e., dynamically selecting a lowest input\nresolution for each sample, whose model is referred to as AdaBrowse+. Extensive\nexperimental results on four large-scale CSLR datasets, i.e., PHOENIX14,\nPHOENIX14-T, CSL-Daily and CSL, demonstrate the effectiveness of AdaBrowse and\nAdaBrowse+ by achieving comparable accuracy with state-of-the-art methods with\n1.44$\\times$ throughput and 2.12$\\times$ fewer FLOPs. Comparisons with other\ncommonly-used 2D CNNs and adaptive efficient methods verify the effectiveness\nof AdaBrowse. 
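Treating two depth maps as spatial probability distributions and comparing them with an optimal transport distance, as in the monocular-depth abstract above, can be sketched with entropic (Sinkhorn) regularisation, using the squared distance between pixel locations as the ground cost. The toy example below runs on tiny maps; the paper does not specify this exact solver, so the Sinkhorn iteration and its hyperparameters are assumptions.

```python
import numpy as np

def sinkhorn_ot(p, q, cost, eps=0.1, n_iters=300):
    """Entropy-regularised OT distance between two histograms p and q."""
    K = np.exp(-cost / eps)
    u = np.ones_like(p)
    for _ in range(n_iters):
        v = q / (K.T @ u)
        u = p / (K @ v)
    plan = u[:, None] * K * v[None, :]
    return float(np.sum(plan * cost))

def depth_ot_loss(pred: np.ndarray, target: np.ndarray) -> float:
    """Normalise depth maps into distributions and compare them with OT."""
    h, w = pred.shape
    ys, xs = np.meshgrid(np.arange(h), np.arange(w), indexing="ij")
    coords = np.stack([ys.ravel(), xs.ravel()], axis=1).astype(float)
    coords /= max(h, w)                                   # scale to [0, 1]
    cost = ((coords[:, None, :] - coords[None, :, :]) ** 2).sum(-1)
    p = pred.ravel() / pred.sum()
    q = target.ravel() / target.sum()
    return sinkhorn_ot(p, q, cost)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    pred = rng.uniform(1.0, 5.0, (8, 8))      # toy predicted depth
    target = np.roll(pred, shift=2, axis=1)   # shifted copy as the 'target'
    print("OT loss (shifted map):  ", round(depth_ot_loss(pred, target), 4))
    print("OT loss (identical map):", round(depth_ot_loss(pred, pred), 6))
```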
Code is available at\n\\url{https://github.com/hulianyuyy/AdaBrowse}.\n","authors":["Lianyu Hu","Liqing Gao","Zekang Liu","Chi-Man Pun","Wei Feng"],"pdf_url":"https://arxiv.org/pdf/2308.08327v1.pdf","comment":"ACMMM2023"},{"id":"http://arxiv.org/abs/2308.08325v1","updated":"2023-08-16T12:39:39Z","published":"2023-08-16T12:39:39Z","title":"Visually-Aware Context Modeling for News Image Captioning","summary":" The goal of News Image Captioning is to generate an image caption according\nto the content of both a news article and an image. To leverage the visual\ninformation effectively, it is important to exploit the connection between the\ncontext in the articles/captions and the images. Psychological studies indicate\nthat human faces in images draw higher attention priorities. On top of that,\nhumans often play a central role in news stories, as also proven by the\nface-name co-occurrence pattern we discover in existing News Image Captioning\ndatasets. Therefore, we design a face-naming module for faces in images and\nnames in captions/articles to learn a better name embedding. Apart from names,\nwhich can be directly linked to an image area (faces), news image captions\nmostly contain context information that can only be found in the article.\nHumans typically address this by searching for relevant information from the\narticle based on the image. To emulate this thought process, we design a\nretrieval strategy using CLIP to retrieve sentences that are semantically close\nto the image. We conduct extensive experiments to demonstrate the efficacy of\nour framework. Without using additional paired data, we establish the new\nstate-of-the-art performance on two News Image Captioning datasets, exceeding\nthe previous state-of-the-art by 5 CIDEr points. We will release code upon\nacceptance.\n","authors":["Tingyu Qu","Tinne Tuytelaars","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.08325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08321v1","updated":"2023-08-16T12:30:17Z","published":"2023-08-16T12:30:17Z","title":"Stable and Causal Inference for Discriminative Self-supervised Deep\n Visual Representations","summary":" In recent years, discriminative self-supervised methods have made significant\nstrides in advancing various visual tasks. The central idea of learning a data\nencoder that is robust to data distortions/augmentations is straightforward yet\nhighly effective. Although many studies have demonstrated the empirical success\nof various learning methods, the resulting learned representations can exhibit\ninstability and hinder downstream performance. In this study, we analyze\ndiscriminative self-supervised methods from a causal perspective to explain\nthese unstable behaviors and propose solutions to overcome them. Our approach\ndraws inspiration from prior works that empirically demonstrate the ability of\ndiscriminative self-supervised methods to demix ground truth causal sources to\nsome extent. Unlike previous work on causality-empowered representation\nlearning, we do not apply our solutions during the training process but rather\nduring the inference process to improve time efficiency. 
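The retrieval strategy in the News Image Captioning abstract above (use CLIP to pull article sentences that are semantically close to the image) can be sketched with the public CLIP model: encode the image and every sentence, normalise, and keep the top-k sentences by cosine similarity. The snippet assumes the openai `clip` package and a placeholder image path; it is not the paper's released code.

```python
import clip
import torch
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def retrieve_sentences(image_path, sentences, k=3):
    """Return the k article sentences most similar to the image under CLIP."""
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    tokens = clip.tokenize(sentences, truncate=True).to(device)
    with torch.no_grad():
        img_emb = model.encode_image(image)
        txt_emb = model.encode_text(tokens)
    img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
    txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)
    sims = (img_emb @ txt_emb.T).squeeze(0)
    top = sims.topk(min(k, len(sentences))).indices.tolist()
    return [sentences[i] for i in top]

if __name__ == "__main__":
    article = [
        "The president met the delegation on Tuesday.",
        "A crowd gathered outside the parliament building.",
        "Stock markets closed slightly higher.",
    ]
    # "news_photo.jpg" is a placeholder path used only for illustration.
    print(retrieve_sentences("news_photo.jpg", article, k=2))
```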
Through experiments on\nboth controlled image datasets and realistic image datasets, we show that our\nproposed solutions, which involve tempering a linear transformation with\ncontrolled synthetic data, are effective in addressing these issues.\n","authors":["Yuewei Yang","Hai Li","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2308.08321v1.pdf","comment":"ICCV 2023 accepted paper"},{"id":"http://arxiv.org/abs/2308.08316v1","updated":"2023-08-16T12:22:29Z","published":"2023-08-16T12:22:29Z","title":"Dual-Stream Diffusion Net for Text-to-Video Generation","summary":" With the emerging diffusion models, recently, text-to-video generation has\naroused increasing attention. But an important bottleneck therein is that\ngenerative videos often tend to carry some flickers and artifacts. In this\nwork, we propose a dual-stream diffusion net (DSDN) to improve the consistency\nof content variations in generating videos. In particular, the designed two\ndiffusion streams, video content and motion branches, could not only run\nseparately in their private spaces for producing personalized video variations\nas well as content, but also be well-aligned between the content and motion\ndomains through leveraging our designed cross-transformer interaction module,\nwhich would benefit the smoothness of generated videos. Besides, we also\nintroduce a motion decomposer and combiner to facilitate the operation on video\nmotion. Qualitative and quantitative experiments demonstrate that our method\ncould produce amazing continuous videos with fewer flickers.\n","authors":["Binhui Liu","Xin Liu","Anbo Dai","Zhiyong Zeng","Zhen Cui","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08316v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.08313v1","updated":"2023-08-16T12:18:27Z","published":"2023-08-16T12:18:27Z","title":"ECPC-IDS: A benchmark endometrial cancer PET/CT image dataset for\n evaluation of semantic segmentation and detection of hypermetabolic regions","summary":" Endometrial cancer is one of the most common tumors in the female\nreproductive system and is the third most common gynecological malignancy that\ncauses death after ovarian and cervical cancer. Early diagnosis can\nsignificantly improve the 5-year survival rate of patients. With the\ndevelopment of artificial intelligence, computer-assisted diagnosis plays an\nincreasingly important role in improving the accuracy and objectivity of\ndiagnosis, as well as reducing the workload of doctors. However, the absence of\npublicly available endometrial cancer image datasets restricts the application\nof computer-assisted diagnostic techniques. In this paper, a publicly available\nEndometrial Cancer PET/CT Image Dataset for Evaluation of Semantic Segmentation\nand Detection of Hypermetabolic Regions (ECPC-IDS) is published. Specifically,\nthe segmentation section includes PET and CT images, with a total of 7159\nimages in multiple formats. In order to prove the effectiveness of segmentation\nmethods on ECPC-IDS, five classical deep learning semantic segmentation methods\nare selected to test the image segmentation task. The object detection section\nalso includes PET and CT images, with a total of 3579 images and XML files with\nannotation information. Six deep learning methods are selected for experiments\non the detection task. This study conducts extensive experiments using deep\nlearning-based semantic segmentation and object detection methods to\ndemonstrate the differences between various methods on ECPC-IDS. 
As far as we\nknow, this is the first publicly available dataset of endometrial cancer with a\nlarge number of multiple images, including a large amount of information\nrequired for image and target detection. ECPC-IDS can aid researchers in\nexploring new algorithms to enhance computer-assisted technology, benefiting\nboth clinical doctors and patients greatly.\n","authors":["Dechao Tang","Xuanyi Li","Tianming Du","Deguo Ma","Zhiyu Ma","Hongzan Sun","Marcin Grzegorzek","Huiyan Jiang","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2308.08313v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2303.08646v2","updated":"2023-08-16T12:15:29Z","published":"2023-03-15T14:23:07Z","title":"HFGD: High-level Feature Guided Decoder for Semantic Segmentation","summary":" Existing pyramid-based upsamplers (e.g. SemanticFPN), although efficient,\nusually produce less accurate results compared to dilation-based models when\nusing the same backbone. This is partially caused by the contaminated\nhigh-level features since they are fused and fine-tuned with noisy low-level\nfeatures on limited data. To address this issue, we propose to use powerful\npretrained high-level features as guidance (HFG) when learning to upsample the\nfine-grained low-level features. Specifically, the class tokens are trained\nalong with only the high-level features from the backbone. These class tokens\nare reused by the upsampler for classification, guiding the upsampler features\nto more discriminative backbone features. One key design of the HFG is to\nprotect the high-level features from being contaminated with proper\nstop-gradient operations so that the backbone does not update according to the\ngradient from the upsampler. To push the upper limit of HFG, we introduce a\ncontext augmentation encoder (CAE) that can efficiently and effectively\noperate on low-resolution high-level features, resulting in improved\nrepresentation and thus better guidance. We evaluate the proposed method on\nthree benchmarks: Pascal Context, COCOStuff164k, and Cityscapes. Our method\nachieves state-of-the-art results among methods that do not use extra training\ndata, demonstrating its effectiveness and generalization ability. The complete\ncode will be released.\n","authors":["Ye Huang","Di Kang","Shenghua Gao","Wen Li","Lixin Duan"],"pdf_url":"https://arxiv.org/pdf/2303.08646v2.pdf","comment":"Revised version, refactored presentation and added more experiments"},{"id":"http://arxiv.org/abs/2307.08693v2","updated":"2023-08-16T12:12:45Z","published":"2023-07-17T17:53:36Z","title":"SEMI-DiffusionInst: A Diffusion Model Based Approach for Semiconductor\n Defect Classification and Segmentation","summary":" With continuous progression of Moore's Law, integrated circuit (IC) device\ncomplexity is also increasing. Scanning Electron Microscope (SEM) image based\nextensive defect inspection and accurate metrology extraction are two main\nchallenges in advanced node (2 nm and beyond) technology. Deep learning (DL)\nalgorithm based computer vision approaches gained popularity in semiconductor\ndefect inspection over last few years. In this research work, a new\nsemiconductor defect inspection framework \"SEMI-DiffusionInst\" is investigated\nand compared to previous frameworks. To the best of the authors' knowledge,\nthis work is the first demonstration to accurately detect and precisely segment\nsemiconductor defect patterns by using a diffusion model. 
Different feature\nextractor networks as backbones and data sampling strategies are investigated\ntowards achieving a balanced trade-off between precision and computing\nefficiency. Our proposed approach outperforms previous work on overall mAP and\nperforms comparatively better or as per for almost all defect classes (per\nclass APs). The bounding box and segmentation mAPs achieved by the proposed\nSEMI-DiffusionInst model are improved by 3.83% and 2.10%, respectively. Among\nindividual defect types, precision on line collapse and thin bridge defects are\nimproved approximately 15\\% on detection task for both defect types. It has\nalso been shown that by tuning inference hyperparameters, inference time can be\nimproved significantly without compromising model precision. Finally, certain\nlimitations and future work strategy to overcome them are discussed.\n","authors":["Vic De Ridder","Bappaditya Dey","Sandip Halder","Bartel Van Waeyenberge"],"pdf_url":"https://arxiv.org/pdf/2307.08693v2.pdf","comment":"6 pages, 5 figures, To be published by IEEE in the proceedings of the\n 2023 ELMAR conference"},{"id":"http://arxiv.org/abs/2308.08303v1","updated":"2023-08-16T12:07:02Z","published":"2023-08-16T12:07:02Z","title":"Leveraging Next-Active Objects for Context-Aware Anticipation in\n Egocentric Videos","summary":" Objects are crucial for understanding human-object interactions. By\nidentifying the relevant objects, one can also predict potential future\ninteractions or actions that may occur with these objects. In this paper, we\nstudy the problem of Short-Term Object interaction anticipation (STA) and\npropose NAOGAT (Next-Active-Object Guided Anticipation Transformer), a\nmulti-modal end-to-end transformer network, that attends to objects in observed\nframes in order to anticipate the next-active-object (NAO) and, eventually, to\nguide the model to predict context-aware future actions. The task is\nchallenging since it requires anticipating future action along with the object\nwith which the action occurs and the time after which the interaction will\nbegin, a.k.a. the time to contact (TTC). Compared to existing video modeling\narchitectures for action anticipation, NAOGAT captures the relationship between\nobjects and the global scene context in order to predict detections for the\nnext active object and anticipate relevant future actions given these\ndetections, leveraging the objects' dynamics to improve accuracy. One of the\nkey strengths of our approach, in fact, is its ability to exploit the motion\ndynamics of objects within a given clip, which is often ignored by other\nmodels, and separately decoding the object-centric and motion-centric\ninformation. Through our experiments, we show that our model outperforms\nexisting methods on two separate datasets, Ego4D and EpicKitchens-100 (\"Unseen\nSet\"), as measured by several additional metrics, such as time to contact, and\nnext-active-object localization. The code will be available upon acceptance.\n","authors":["Sanket Thakur","Cigdem Beyan","Pietro Morerio","Vittorio Murino","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2308.08303v1.pdf","comment":"Accepted in WACV'24"},{"id":"http://arxiv.org/abs/2308.08288v1","updated":"2023-08-16T11:20:23Z","published":"2023-08-16T11:20:23Z","title":"Improving Audio-Visual Segmentation with Bidirectional Generation","summary":" The aim of audio-visual segmentation (AVS) is to precisely differentiate\naudible objects within videos down to the pixel level. 
Traditional approaches\noften tackle this challenge by combining information from various modalities,\nwhere the contribution of each modality is implicitly or explicitly modeled.\nNevertheless, the interconnections between different modalities tend to be\noverlooked in audio-visual modeling. In this paper, inspired by the human\nability to mentally simulate the sound of an object and its visual appearance,\nwe introduce a bidirectional generation framework. This framework establishes\nrobust correlations between an object's visual characteristics and its\nassociated sound, thereby enhancing the performance of AVS. To achieve this, we\nemploy a visual-to-audio projection component that reconstructs audio features\nfrom object segmentation masks and minimizes reconstruction errors. Moreover,\nrecognizing that many sounds are linked to object movements, we introduce an\nimplicit volumetric motion estimation module to handle temporal dynamics that\nmay be challenging to capture using conventional optical flow methods. To\nshowcase the effectiveness of our approach, we conduct comprehensive\nexperiments and analyses on the widely recognized AVSBench benchmark. As a\nresult, we establish a new state-of-the-art performance level in the AVS\nbenchmark, particularly excelling in the challenging MS3 subset which involves\nsegmenting multiple sound sources. To facilitate reproducibility, we plan to\nrelease both the source code and the pre-trained model.\n","authors":["Dawei Hao","Yuxin Mao","Bowen He","Xiaodong Han","Yuchao Dai","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.08288v1.pdf","comment":"Dawei Hao and Yuxin Mao contribute equally to this paper. Yiran\n Zhong is the corresponding author. The code will be released at\n https://github.com/OpenNLPLab/AVS-bidirectional"},{"id":"http://arxiv.org/abs/2305.13873v2","updated":"2023-08-16T11:16:15Z","published":"2023-05-23T09:48:16Z","title":"Unsafe Diffusion: On the Generation of Unsafe Images and Hateful Memes\n From Text-To-Image Models","summary":" State-of-the-art Text-to-Image models like Stable Diffusion and DALLE$\\cdot$2\nare revolutionizing how people generate visual content. At the same time,\nsociety has serious concerns about how adversaries can exploit such models to\ngenerate unsafe images. In this work, we focus on demystifying the generation\nof unsafe images and hateful memes from Text-to-Image models. We first\nconstruct a typology of unsafe images consisting of five categories (sexually\nexplicit, violent, disturbing, hateful, and political). Then, we assess the\nproportion of unsafe images generated by four advanced Text-to-Image models\nusing four prompt datasets. We find that these models can generate a\nsubstantial percentage of unsafe images; across four models and four prompt\ndatasets, 14.56% of all generated images are unsafe. When comparing the four\nmodels, we find different risk levels, with Stable Diffusion being the most\nprone to generating unsafe content (18.92% of all generated images are unsafe).\nGiven Stable Diffusion's tendency to generate more unsafe content, we evaluate\nits potential to generate hateful meme variants if exploited by an adversary to\nattack a specific individual or community. We employ three image editing\nmethods, DreamBooth, Textual Inversion, and SDEdit, which are supported by\nStable Diffusion. 
Our evaluation result shows that 24% of the generated images\nusing DreamBooth are hateful meme variants that present the features of the\noriginal hateful meme and the target individual/community; these generated\nimages are comparable to hateful meme variants collected from the real world.\nOverall, our results demonstrate that the danger of large-scale generation of\nunsafe images is imminent. We discuss several mitigating measures, such as\ncurating training data, regulating prompts, and implementing safety filters,\nand encourage better safeguard tools to be developed to prevent unsafe\ngeneration.\n","authors":["Yiting Qu","Xinyue Shen","Xinlei He","Michael Backes","Savvas Zannettou","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13873v2.pdf","comment":"To Appear in the ACM Conference on Computer and Communications\n Security, November 26, 2023"},{"id":"http://arxiv.org/abs/2302.06608v3","updated":"2023-08-16T11:12:42Z","published":"2023-02-13T18:59:52Z","title":"3D-aware Blending with Generative NeRFs","summary":" Image blending aims to combine multiple images seamlessly. It remains\nchallenging for existing 2D-based methods, especially when input images are\nmisaligned due to differences in 3D camera poses and object shapes. To tackle\nthese issues, we propose a 3D-aware blending method using generative Neural\nRadiance Fields (NeRF), including two key components: 3D-aware alignment and\n3D-aware blending. For 3D-aware alignment, we first estimate the camera pose of\nthe reference image with respect to generative NeRFs and then perform 3D local\nalignment for each part. To further leverage 3D information of the generative\nNeRF, we propose 3D-aware blending that directly blends images on the NeRF's\nlatent representation space, rather than raw pixel space. Collectively, our\nmethod outperforms existing 2D baselines, as validated by extensive\nquantitative and qualitative evaluations with FFHQ and AFHQ-Cat.\n","authors":["Hyunsu Kim","Gayoung Lee","Yunjey Choi","Jin-Hwa Kim","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.06608v3.pdf","comment":"ICCV 2023, Project page: https://blandocs.github.io/blendnerf"},{"id":"http://arxiv.org/abs/2308.08283v1","updated":"2023-08-16T10:51:27Z","published":"2023-08-16T10:51:27Z","title":"CARE: A Large Scale CT Image Dataset and Clinical Applicable Benchmark\n Model for Rectal Cancer Segmentation","summary":" Rectal cancer segmentation of CT image plays a crucial role in timely\nclinical diagnosis, radiotherapy treatment, and follow-up. Although current\nsegmentation methods have shown promise in delineating cancerous tissues, they\nstill encounter challenges in achieving high segmentation precision. These\nobstacles arise from the intricate anatomical structures of the rectum and the\ndifficulties in performing differential diagnosis of rectal cancer.\nAdditionally, a major obstacle is the lack of a large-scale, finely annotated\nCT image dataset for rectal cancer segmentation. To address these issues, this\nwork introduces a novel large scale rectal cancer CT image dataset CARE with\npixel-level annotations for both normal and cancerous rectum, which serves as a\nvaluable resource for algorithm research and clinical application development.\nMoreover, we propose a novel medical cancer lesion segmentation benchmark model\nnamed U-SAM. The model is specifically designed to tackle the challenges posed\nby the intricate anatomical structures of abdominal organs by incorporating\nprompt information. 
U-SAM contains three key components: promptable information\n(e.g., points) to aid in target area localization, a convolution module for\ncapturing low-level lesion details, and skip-connections to preserve and\nrecover spatial information during the encoding-decoding process. To evaluate\nthe effectiveness of U-SAM, we systematically compare its performance with\nseveral popular segmentation methods on the CARE dataset. The generalization of\nthe model is further verified on the WORD dataset. Extensive experiments\ndemonstrate that the proposed U-SAM outperforms state-of-the-art methods on\nthese two datasets. These experiments can serve as the baseline for future\nresearch and clinical application development.\n","authors":["Hantao Zhang","Weidong Guo","Chenyang Qiu","Shouhong Wan","Bingbing Zou","Wanqin Wang","Peiquan Jin"],"pdf_url":"https://arxiv.org/pdf/2308.08283v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2212.05370v2","updated":"2023-08-16T10:45:20Z","published":"2022-12-10T21:57:11Z","title":"Source-free Depth for Object Pop-out","summary":" Depth cues are known to be useful for visual perception. However, direct\nmeasurement of depth is often impracticable. Fortunately, though, modern\nlearning-based methods offer promising depth maps by inference in the wild. In\nthis work, we adapt such depth inference models for object segmentation using\nthe objects' \"pop-out\" prior in 3D. The \"pop-out\" is a simple composition prior\nthat assumes objects reside on the background surface. Such compositional prior\nallows us to reason about objects in the 3D space. More specifically, we adapt\nthe inferred depth maps such that objects can be localized using only 3D\ninformation. Such separation, however, requires knowledge about contact surface\nwhich we learn using the weak supervision of the segmentation mask. Our\nintermediate representation of contact surface, and thereby reasoning about\nobjects purely in 3D, allows us to better transfer the depth knowledge into\nsemantics. The proposed adaptation method uses only the depth model without\nneeding the source data used for training, making the learning process\nefficient and practical. Our experiments on eight datasets of two challenging\ntasks, namely camouflaged object detection and salient object detection,\nconsistently demonstrate the benefit of our method in terms of both performance\nand generalizability.\n","authors":["Zongwei Wu","Danda Pani Paudel","Deng-Ping Fan","Jingjing Wang","Shuo Wang","Cédric Demonceaux","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2212.05370v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2305.10732v2","updated":"2023-08-16T10:39:12Z","published":"2023-05-18T06:04:24Z","title":"BlindHarmony: \"Blind\" Harmonization for MR Images via Flow model","summary":" In MRI, images of the same contrast (e.g., T$_1$) from the same subject can\nexhibit noticeable differences when acquired using different hardware,\nsequences, or scan parameters. These differences in images create a domain gap\nthat needs to be bridged by a step called image harmonization, to process the\nimages successfully using conventional or deep learning-based image analysis\n(e.g., segmentation). Several methods, including deep learning-based\napproaches, have been proposed to achieve image harmonization. However, they\noften require datasets from multiple domains for deep learning training and may\nstill be unsuccessful when applied to images from unseen domains. 
To address\nthis limitation, we propose a novel concept called `Blind Harmonization', which\nutilizes only target domain data for training but still has the capability to\nharmonize images from unseen domains. For the implementation of blind\nharmonization, we developed BlindHarmony using an unconditional flow model\ntrained on target domain data. The harmonized image is optimized to have a\ncorrelation with the input source domain image while ensuring that the latent\nvector of the flow model is close to the center of the Gaussian distribution.\nBlindHarmony was evaluated on both simulated and real datasets and compared to\nconventional methods. BlindHarmony demonstrated noticeable performance on both\ndatasets, highlighting its potential for future use in clinical settings. The\nsource code is available at: https://github.com/SNU-LIST/BlindHarmony\n","authors":["Hwihun Jeong","Heejoon Byun","Dong Un Kang","Jongho Lee"],"pdf_url":"https://arxiv.org/pdf/2305.10732v2.pdf","comment":"ICCV 2023 accepted. 9 pages and 5 Figures for manuscipt,\n supplementary included"},{"id":"http://arxiv.org/abs/2308.08276v1","updated":"2023-08-16T10:33:24Z","published":"2023-08-16T10:33:24Z","title":"Computer vision-enriched discrete choice models, with an application to\n residential location choice","summary":" Visual imagery is indispensable to many multi-attribute decision situations.\nExamples of such decision situations in travel behaviour research include\nresidential location choices, vehicle choices, tourist destination choices, and\nvarious safety-related choices. However, current discrete choice models cannot\nhandle image data and thus cannot incorporate information embedded in images\ninto their representations of choice behaviour. This gap between discrete\nchoice models' capabilities and the real-world behaviour it seeks to model\nleads to incomplete and, possibly, misleading outcomes. To solve this gap, this\nstudy proposes \"Computer Vision-enriched Discrete Choice Models\" (CV-DCMs).\nCV-DCMs can handle choice tasks involving numeric attributes and images by\nintegrating computer vision and traditional discrete choice models. Moreover,\nbecause CV-DCMs are grounded in random utility maximisation principles, they\nmaintain the solid behavioural foundation of traditional discrete choice\nmodels. We demonstrate the proposed CV-DCM by applying it to data obtained\nthrough a novel stated choice experiment involving residential location\nchoices. In this experiment, respondents faced choice tasks with trade-offs\nbetween commute time, monthly housing cost and street-level conditions,\npresented using images. As such, this research contributes to the growing body\nof literature in the travel behaviour field that seeks to integrate discrete\nchoice modelling and machine learning.\n","authors":["Sander van Cranenburgh","Francisco Garrido-Valenzuela"],"pdf_url":"https://arxiv.org/pdf/2308.08276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08271v1","updated":"2023-08-16T10:19:16Z","published":"2023-08-16T10:19:16Z","title":"Detecting Olives with Synthetic or Real Data? Olive the Above","summary":" Modern robotics has enabled the advancement in yield estimation for precision\nagriculture. However, when applied to the olive industry, the high variation of\nolive colors and their similarity to the background leaf canopy presents a\nchallenge. Labeling several thousands of very dense olive grove images for\nsegmentation is a labor-intensive task. 
This paper presents a novel approach to\ndetecting olives without the need to manually label data. In this work, we\npresent the world's first olive detection dataset comprised of synthetic and\nreal olive tree images. This is accomplished by generating an auto-labeled\nphotorealistic 3D model of an olive tree. Its geometry is then simplified for\nlightweight rendering purposes. In addition, experiments are conducted with a\nmix of synthetically generated and real images, yielding an improvement of up\nto 66% compared to when only using a small sample of real data. When access to\nreal, human-labeled data is limited, a combination of mostly synthetic data and\na small amount of real data can enhance olive detection.\n","authors":["Yianni Karabatis","Xiaomin Lin","Nitin J. Sanket","Michail G. Lagoudakis","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2308.08271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08269v1","updated":"2023-08-16T10:16:50Z","published":"2023-08-16T10:16:50Z","title":"OnUVS: Online Feature Decoupling Framework for High-Fidelity Ultrasound\n Video Synthesis","summary":" Ultrasound (US) imaging is indispensable in clinical practice. To diagnose\ncertain diseases, sonographers must observe corresponding dynamic anatomic\nstructures to gather comprehensive information. However, the limited\navailability of specific US video cases causes teaching difficulties in\nidentifying corresponding diseases, which potentially impacts the detection\nrate of such cases. The synthesis of US videos may represent a promising\nsolution to this issue. Nevertheless, it is challenging to accurately animate\nthe intricate motion of dynamic anatomic structures while preserving image\nfidelity. To address this, we present a novel online feature-decoupling\nframework called OnUVS for high-fidelity US video synthesis. Our highlights can\nbe summarized by four aspects. First, we introduced anatomic information into\nkeypoint learning through a weakly-supervised training strategy, resulting in\nimproved preservation of anatomical integrity and motion while minimizing the\nlabeling burden. Second, to better preserve the integrity and textural\ninformation of US images, we implemented a dual-decoder that decouples the\ncontent and textural features in the generator. Third, we adopted a\nmultiple-feature discriminator to extract a comprehensive range of visual cues,\nthereby enhancing the sharpness and fine details of the generated videos.\nFourth, we constrained the motion trajectories of keypoints during online\nlearning to enhance the fluidity of generated videos. Our validation and user\nstudies on in-house echocardiographic and pelvic floor US videos showed that\nOnUVS synthesizes US videos with high fidelity.\n","authors":["Han Zhou","Dong Ni","Ao Chang","Xinrui Zhou","Rusi Chen","Yanlin Chen","Lian Liu","Jiamin Liang","Yuhao Huang","Tong Han","Zhe Liu","Deng-Ping Fan","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08269v1.pdf","comment":"14 pages, 13 figures and 6 tables"},{"id":"http://arxiv.org/abs/2102.03973v7","updated":"2023-08-16T10:06:53Z","published":"2021-02-08T02:51:34Z","title":"STS-GAN: Can We Synthesize Solid Texture with High Fidelity from\n Arbitrary 2D Exemplar?","summary":" Solid texture synthesis (STS), an effective way to extend a 2D exemplar to a\n3D solid volume, exhibits advantages in computational photography. 
However,\nexisting methods generally fail to accurately learn arbitrary textures, which\nmay result in the failure to synthesize solid textures with high fidelity. In\nthis paper, we propose a novel generative adversarial nets-based framework\n(STS-GAN) to extend the given 2D exemplar to arbitrary 3D solid textures. In\nSTS-GAN, multi-scale 2D texture discriminators evaluate the similarity between\nthe given 2D exemplar and slices from the generated 3D texture, promoting the\n3D texture generator synthesizing realistic solid textures. Finally,\nexperiments demonstrate that the proposed method can generate high-fidelity\nsolid textures with similar visual characteristics to the 2D exemplar.\n","authors":["Xin Zhao","Jifeng Guo","Lin Wang","Fanqi Li","Jiahao Li","Junteng Zheng","Bo Yang"],"pdf_url":"https://arxiv.org/pdf/2102.03973v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02510v2","updated":"2023-08-16T09:59:40Z","published":"2023-07-27T12:54:16Z","title":"Seeing through the Brain: Image Reconstruction of Visual Perception from\n Human Brain Signals","summary":" Seeing is believing, however, the underlying mechanism of how human visual\nperceptions are intertwined with our cognitions is still a mystery. Thanks to\nthe recent advances in both neuroscience and artificial intelligence, we have\nbeen able to record the visually evoked brain activities and mimic the visual\nperception ability through computational approaches. In this paper, we pay\nattention to visual stimuli reconstruction by reconstructing the observed\nimages based on portably accessible brain signals, i.e., electroencephalography\n(EEG) data. Since EEG signals are dynamic in the time-series format and are\nnotorious to be noisy, processing and extracting useful information requires\nmore dedicated efforts; In this paper, we propose a comprehensive pipeline,\nnamed NeuroImagen, for reconstructing visual stimuli images from EEG signals.\nSpecifically, we incorporate a novel multi-level perceptual information\ndecoding to draw multi-grained outputs from the given EEG data. A latent\ndiffusion model will then leverage the extracted information to reconstruct the\nhigh-resolution visual stimuli images. The experimental results have\nillustrated the effectiveness of image reconstruction and superior quantitative\nperformance of our proposed method.\n","authors":["Yu-Ting Lan","Kan Ren","Yansen Wang","Wei-Long Zheng","Dongsheng Li","Bao-Liang Lu","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.02510v2.pdf","comment":"A preprint version of an ongoing work"},{"id":"http://arxiv.org/abs/2308.08258v1","updated":"2023-08-16T09:50:35Z","published":"2023-08-16T09:50:35Z","title":"SceNeRFlow: Time-Consistent Reconstruction of General Dynamic Scenes","summary":" Existing methods for the 4D reconstruction of general, non-rigidly deforming\nobjects focus on novel-view synthesis and neglect correspondences. However,\ntime consistency enables advanced downstream tasks like 3D editing, motion\nanalysis, or virtual-asset creation. We propose SceNeRFlow to reconstruct a\ngeneral, non-rigid scene in a time-consistent manner. Our dynamic-NeRF method\ntakes multi-view RGB videos and background images from static cameras with\nknown camera parameters as input. It then reconstructs the deformations of an\nestimated canonical model of the geometry and appearance in an online fashion.\nSince this canonical model is time-invariant, we obtain correspondences even\nfor long-term, long-range motions. 
We employ neural scene representations to\nparametrize the components of our method. Like prior dynamic-NeRF methods, we\nuse a backwards deformation model. We find non-trivial adaptations of this\nmodel necessary to handle larger motions: We decompose the deformations into a\nstrongly regularized coarse component and a weakly regularized fine component,\nwhere the coarse component also extends the deformation field into the space\nsurrounding the object, which enables tracking over time. We show\nexperimentally that, unlike prior work that only handles small motion, our\nmethod enables the reconstruction of studio-scale motions.\n","authors":["Edith Tretschk","Vladislav Golyanik","Michael Zollhoefer","Aljaz Bozic","Christoph Lassner","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2308.08258v1.pdf","comment":"Project page: https://vcai.mpi-inf.mpg.de/projects/scenerflow/"},{"id":"http://arxiv.org/abs/2308.08256v1","updated":"2023-08-16T09:47:52Z","published":"2023-08-16T09:47:52Z","title":"MultiMediate'23: Engagement Estimation and Bodily Behaviour Recognition\n in Social Interactions","summary":" Automatic analysis of human behaviour is a fundamental prerequisite for the\ncreation of machines that can effectively interact with- and support humans in\nsocial interactions. In MultiMediate'23, we address two key human social\nbehaviour analysis tasks for the first time in a controlled challenge:\nengagement estimation and bodily behaviour recognition in social interactions.\nThis paper describes the MultiMediate'23 challenge and presents novel sets of\nannotations for both tasks. For engagement estimation we collected novel\nannotations on the NOvice eXpert Interaction (NOXI) database. For bodily\nbehaviour recognition, we annotated test recordings of the MPIIGroupInteraction\ncorpus with the BBSI annotation scheme. In addition, we present baseline\nresults for both challenge tasks.\n","authors":["Philipp Müller","Michal Balazia","Tobias Baur","Michael Dietz","Alexander Heimerl","Dominik Schiller","Mohammed Guermal","Dominike Thomas","François Brémond","Jan Alexandersson","Elisabeth André","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2308.08256v1.pdf","comment":"ACM MultiMedia'23"},{"id":"http://arxiv.org/abs/2308.07009v2","updated":"2023-08-16T09:47:08Z","published":"2023-08-14T08:52:41Z","title":"ACTIVE: Towards Highly Transferable 3D Physical Camouflage for Universal\n and Robust Vehicle Evasion","summary":" Adversarial camouflage has garnered attention for its ability to attack\nobject detectors from any viewpoint by covering the entire object's surface.\nHowever, universality and robustness in existing methods often fall short as\nthe transferability aspect is often overlooked, thus restricting their\napplication only to a specific target with limited performance. To address\nthese challenges, we present Adversarial Camouflage for Transferable and\nIntensive Vehicle Evasion (ACTIVE), a state-of-the-art physical camouflage\nattack framework designed to generate universal and robust adversarial\ncamouflage capable of concealing any 3D vehicle from detectors. Our framework\nincorporates innovative techniques to enhance universality and robustness,\nincluding a refined texture rendering that enables common texture application\nto different vehicles without being constrained to a specific texture map, a\nnovel stealth loss that renders the vehicle undetectable, and a smooth and\ncamouflage loss to enhance the naturalness of the adversarial camouflage. 
Our\nextensive experiments on 15 different models show that ACTIVE consistently\noutperforms existing works on various public detectors, including the latest\nYOLOv7. Notably, our universality evaluations reveal promising transferability\nto other vehicle classes, tasks (segmentation models), and the real world, not\njust other vehicles.\n","authors":["Naufal Suryanto","Yongsu Kim","Harashta Tatimma Larasati","Hyoeun Kang","Thi-Thu-Huong Le","Yoonyoung Hong","Hunmin Yang","Se-Yoon Oh","Howon Kim"],"pdf_url":"https://arxiv.org/pdf/2308.07009v2.pdf","comment":"Accepted for ICCV 2023. Main Paper with Supplementary Material.\n Project Page: https://islab-ai.github.io/active-iccv2023/"},{"id":"http://arxiv.org/abs/2308.08242v1","updated":"2023-08-16T09:16:05Z","published":"2023-08-16T09:16:05Z","title":"Contrastive Learning for Lane Detection via cross-similarity","summary":" Detecting road lanes is challenging due to intricate markings vulnerable to\nunfavorable conditions. Lane markings have strong shape priors, but their\nvisibility is easily compromised. Factors like lighting, weather, vehicles,\npedestrians, and aging colors challenge the detection. A large amount of data\nis required to train a lane detection approach that can withstand natural\nvariations caused by low visibility. This is because there are numerous lane\nshapes and natural variations that exist. Our solution, Contrastive Learning\nfor Lane Detection via cross-similarity (CLLD), is a self-supervised learning\nmethod that tackles this challenge by enhancing lane detection models\nresilience to real-world conditions that cause lane low visibility. CLLD is a\nnovel multitask contrastive learning that trains lane detection approaches to\ndetect lane markings even in low visible situations by integrating local\nfeature contrastive learning (CL) with our new proposed operation\ncross-similarity. Local feature CL focuses on extracting features for small\nimage parts, which is necessary to localize lane segments, while\ncross-similarity captures global features to detect obscured lane segments\nusing their surrounding. We enhance cross-similarity by randomly masking parts\nof input images for augmentation. Evaluated on benchmark datasets, CLLD\noutperforms state-of-the-art contrastive learning, especially in\nvisibility-impairing conditions like shadows. Compared to supervised learning,\nCLLD excels in scenarios like shadows and crowded scenes.\n","authors":["Ali Zoljodi","Sadegh Abadijou","Mina Alibeigi","Masoud Daneshtalab"],"pdf_url":"https://arxiv.org/pdf/2308.08242v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.08231v1","updated":"2023-08-16T09:06:32Z","published":"2023-08-16T09:06:32Z","title":"DDF-HO: Hand-Held Object Reconstruction via Conditional Directed\n Distance Field","summary":" Reconstructing hand-held objects from a single RGB image is an important and\nchallenging problem. Existing works utilizing Signed Distance Fields (SDF)\nreveal limitations in comprehensively capturing the complex hand-object\ninteractions, since SDF is only reliable within the proximity of the target,\nand hence, infeasible to simultaneously encode local hand and object cues. To\naddress this issue, we propose DDF-HO, a novel approach leveraging Directed\nDistance Field (DDF) as the shape representation. 
Unlike SDF, DDF maps a ray in\n3D space, consisting of an origin and a direction, to corresponding DDF values,\nincluding a binary visibility signal determining whether the ray intersects the\nobjects and a distance value measuring the distance from origin to target in\nthe given direction. We randomly sample multiple rays and collect local to\nglobal geometric features for them by introducing a novel 2D ray-based feature\naggregation scheme and a 3D intersection-aware hand pose embedding, combining\n2D-3D features to model hand-object interactions. Extensive experiments on\nsynthetic and real-world datasets demonstrate that DDF-HO consistently\noutperforms all baseline methods by a large margin, especially under Chamfer\nDistance, with about 80% leap forward. Codes and trained models will be\nreleased soon.\n","authors":["Chenyangguang Zhang","Yan Di","Ruida Zhang","Guangyao Zhai","Fabian Manhardt","Federico Tombari","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2308.08231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08227v1","updated":"2023-08-16T08:58:25Z","published":"2023-08-16T08:58:25Z","title":"Inherent Redundancy in Spiking Neural Networks","summary":" Spiking Neural Networks (SNNs) are well known as a promising energy-efficient\nalternative to conventional artificial neural networks. Subject to the\npreconceived impression that SNNs are sparse firing, the analysis and\noptimization of inherent redundancy in SNNs have been largely overlooked, thus\nthe potential advantages of spike-based neuromorphic computing in accuracy and\nenergy efficiency are interfered. In this work, we pose and focus on three key\nquestions regarding the inherent redundancy in SNNs. We argue that the\nredundancy is induced by the spatio-temporal invariance of SNNs, which enhances\nthe efficiency of parameter utilization but also invites lots of noise spikes.\nFurther, we analyze the effect of spatio-temporal invariance on the\nspatio-temporal dynamics and spike firing of SNNs. Then, motivated by these\nanalyses, we propose an Advance Spatial Attention (ASA) module to harness SNNs'\nredundancy, which can adaptively optimize their membrane potential distribution\nby a pair of individual spatial attention sub-modules. In this way, noise spike\nfeatures are accurately regulated. Experimental results demonstrate that the\nproposed method can significantly drop the spike firing with better performance\nthan state-of-the-art SNN baselines. Our code is available in\n\\url{https://github.com/BICLab/ASA-SNN}.\n","authors":["Man Yao","Jiakui Hu","Guangshe Zhao","Yaoyuan Wang","Ziyang Zhang","Bo Xu","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2308.08227v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2210.04214v2","updated":"2023-08-16T08:55:29Z","published":"2022-10-09T09:59:46Z","title":"VM-NeRF: Tackling Sparsity in NeRF with View Morphing","summary":" NeRF aims to learn a continuous neural scene representation by using a finite\nset of input images taken from various viewpoints. A well-known limitation of\nNeRF methods is their reliance on data: the fewer the viewpoints, the higher\nthe likelihood of overfitting. This paper addresses this issue by introducing a\nnovel method to generate geometrically consistent image transitions between\nviewpoints using View Morphing. Our VM-NeRF approach requires no prior\nknowledge about the scene structure, as View Morphing is based on the\nfundamental principles of projective geometry. 
VM-NeRF tightly integrates this\ngeometric view generation process during the training procedure of standard\nNeRF approaches. Notably, our method significantly improves novel view\nsynthesis, particularly when only a few views are available. Experimental\nevaluation reveals consistent improvement over current methods that handle\nsparse viewpoints in NeRF models. We report an increase in PSNR of up to 1.8dB\nand 1.0dB when training uses eight and four views, respectively. Source code:\n\\url{https://github.com/mbortolon97/VM-NeRF}\n","authors":["Matteo Bortolon","Alessio Del Bue","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2210.04214v2.pdf","comment":"ICIAP 2023"},{"id":"http://arxiv.org/abs/2308.08224v1","updated":"2023-08-16T08:52:49Z","published":"2023-08-16T08:52:49Z","title":"How To Overcome Confirmation Bias in Semi-Supervised Image\n Classification By Active Learning","summary":" Do we need active learning? The rise of strong deep semi-supervised methods\nraises doubt about the usability of active learning in limited labeled data\nsettings. This is caused by results showing that combining semi-supervised\nlearning (SSL) methods with a random selection for labeling can outperform\nexisting active learning (AL) techniques. However, these results are obtained\nfrom experiments on well-established benchmark datasets that can overestimate\nthe external validity. However, the literature lacks sufficient research on the\nperformance of active semi-supervised learning methods in realistic data\nscenarios, leaving a notable gap in our understanding. Therefore we present\nthree data challenges common in real-world applications: between-class\nimbalance, within-class imbalance, and between-class similarity. These\nchallenges can hurt SSL performance due to confirmation bias. We conduct\nexperiments with SSL and AL on simulated data challenges and find that random\nsampling does not mitigate confirmation bias and, in some cases, leads to worse\nperformance than supervised learning. In contrast, we demonstrate that AL can\novercome confirmation bias in SSL in these realistic settings. Our results\nprovide insights into the potential of combining active and semi-supervised\nlearning in the presence of common real-world challenges, which is a promising\ndirection for robust methods when learning with limited labeled data in\nreal-world applications.\n","authors":["Sandra Gilhuber","Rasmus Hvingelby","Mang Ling Ada Fok","Thomas Seidl"],"pdf_url":"https://arxiv.org/pdf/2308.08224v1.pdf","comment":"Accepted @ ECML PKDD 2023. This is the author's version of the work.\n The definitive Version of Record will be published in the Proceedings of ECML\n PKDD 2023"},{"id":"http://arxiv.org/abs/2306.17558v2","updated":"2023-08-16T08:46:52Z","published":"2023-06-30T11:21:40Z","title":"Towards the extraction of robust sign embeddings for low resource sign\n language recognition","summary":" Isolated Sign Language Recognition (SLR) has mostly been applied on datasets\ncontaining signs executed slowly and clearly by a limited group of signers. In\nreal-world scenarios, however, we are met with challenging visual conditions,\ncoarticulated signing, small datasets, and the need for signer independent\nmodels. To tackle this difficult problem, we require a robust feature extractor\nto process the sign language videos. One could expect human pose estimators to\nbe ideal candidates. 
However, due to a domain mismatch with their training sets\nand challenging poses in sign language, they lack robustness on sign language\ndata and image-based models often still outperform keypoint-based models.\nFurthermore, whereas the common practice of transfer learning with image-based\nmodels yields even higher accuracy, keypoint-based models are typically trained\nfrom scratch on every SLR dataset. These factors limit their usefulness for\nSLR. From the existing literature, it is also not clear which, if any, pose\nestimator performs best for SLR. We compare the three most popular pose\nestimators for SLR: OpenPose, MMPose and MediaPipe. We show that through\nkeypoint normalization, missing keypoint imputation, and learning a pose\nembedding, we can obtain significantly better results and enable transfer\nlearning. We show that keypoint-based embeddings contain cross-lingual\nfeatures: they can transfer between sign languages and achieve competitive\nperformance even when fine-tuning only the classifier layer of an SLR model on\na target sign language. We furthermore achieve better performance using\nfine-tuned transferred embeddings than models trained only on the target sign\nlanguage. The embeddings can also be learned in a multilingual fashion. The\napplication of these embeddings could prove particularly useful for low\nresource sign languages in the future.\n","authors":["Mathieu De Coster","Ellen Rushe","Ruth Holmes","Anthony Ventresque","Joni Dambre"],"pdf_url":"https://arxiv.org/pdf/2306.17558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08220v1","updated":"2023-08-16T08:46:51Z","published":"2023-08-16T08:46:51Z","title":"Low-Light Image Enhancement with Illumination-Aware Gamma Correction and\n Complete Image Modelling Network","summary":" This paper presents a novel network structure with illumination-aware gamma\ncorrection and complete image modelling to solve the low-light image\nenhancement problem. Low-light environments usually lead to less informative\nlarge-scale dark areas, directly learning deep representations from low-light\nimages is insensitive to recovering normal illumination. We propose to\nintegrate the effectiveness of gamma correction with the strong modelling\ncapacities of deep networks, which enables the correction factor gamma to be\nlearned in a coarse to elaborate manner via adaptively perceiving the deviated\nillumination. Because exponential operation introduces high computational\ncomplexity, we propose to use Taylor Series to approximate gamma correction,\naccelerating the training and inference speed. Dark areas usually occupy large\nscales in low-light images, common local modelling structures, e.g., CNN,\nSwinIR, are thus insufficient to recover accurate illumination across whole\nlow-light images. 
We propose a novel Transformer block to completely simulate\nthe dependencies of all pixels across images via a local-to-global hierarchical\nattention mechanism, so that dark areas could be inferred by borrowing the\ninformation from far informative regions in a highly effective manner.\nExtensive experiments on several benchmark datasets demonstrate that our\napproach outperforms state-of-the-art methods.\n","authors":["Yinglong Wang","Zhen Liu","Jianzhuang Liu","Songcen Xu","Shuaicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.08220v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08213v1","updated":"2023-08-16T08:30:44Z","published":"2023-08-16T08:30:44Z","title":"MEDOE: A Multi-Expert Decoder and Output Ensemble Framework for\n Long-tailed Semantic Segmentation","summary":" Long-tailed distribution of semantic categories, which has been often ignored\nin conventional methods, causes unsatisfactory performance in semantic\nsegmentation on tail categories. In this paper, we focus on the problem of\nlong-tailed semantic segmentation. Although some long-tailed recognition\nmethods (e.g., re-sampling/re-weighting) have been proposed in other problems,\nthey can probably compromise crucial contextual information and are thus hardly\nadaptable to the problem of long-tailed semantic segmentation. To address this\nissue, we propose MEDOE, a novel framework for long-tailed semantic\nsegmentation via contextual information ensemble-and-grouping. The proposed\ntwo-stage framework comprises a multi-expert decoder (MED) and a multi-expert\noutput ensemble (MOE). Specifically, the MED includes several \"experts\". Based\non the pixel frequency distribution, each expert takes the dataset masked\naccording to the specific categories as input and generates contextual\ninformation self-adaptively for classification; the MOE adopts learnable\ndecision weights for the ensemble of the experts' outputs. As a model-agnostic\nframework, our MEDOE can be flexibly and efficiently coupled with various\npopular deep neural networks (e.g., DeepLabv3+, OCRNet, and PSPNet) to improve\ntheir performance in long-tailed semantic segmentation. Experimental results\nshow that the proposed framework outperforms the current methods on both\nCityscapes and ADE20K datasets by up to 1.78% in mIoU and 5.89% in mAcc.\n","authors":["Junao Shen","Long Chen","Kun Kuang","Fei Wu","Tian Feng","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08213v1.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.08210v1","updated":"2023-08-16T08:28:01Z","published":"2023-08-16T08:28:01Z","title":"Neural Spherical Harmonics for structurally coherent continuous\n representation of diffusion MRI signal","summary":" We present a novel way to model diffusion magnetic resonance imaging (dMRI)\ndatasets, that benefits from the structural coherence of the human brain while\nonly using data from a single subject. Current methods model the dMRI signal in\nindividual voxels, disregarding the intervoxel coherence that is present. We\nuse a neural network to parameterize a spherical harmonics series (NeSH) to\nrepresent the dMRI signal of a single subject from the Human Connectome Project\ndataset, continuous in both the angular and spatial domain. The reconstructed\ndMRI signal using this method shows a more structurally coherent representation\nof the data. 
Noise in gradient images is removed and the fiber orientation\ndistribution functions show a smooth change in direction along a fiber tract.\nWe showcase how the reconstruction can be used to calculate mean diffusivity,\nfractional anisotropy, and total apparent fiber density. These results can be\nachieved with a single model architecture, tuning only one hyperparameter. In\nthis paper we also demonstrate how upsampling in both the angular and spatial\ndomain yields reconstructions that are on par or better than existing methods.\n","authors":["Tom Hendriks","Anna Villanova","Maxime Chamberland"],"pdf_url":"https://arxiv.org/pdf/2308.08210v1.pdf","comment":"12 pages, 6 figures, accepted for cdMRI workshop at MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.08206v1","updated":"2023-08-16T08:13:38Z","published":"2023-08-16T08:13:38Z","title":"Explainable Multi-View Deep Networks Methodology for Experimental\n Physics","summary":" Physical experiments often involve multiple imaging representations, such as\nX-ray scans and microscopic images. Deep learning models have been widely used\nfor supervised analysis in these experiments. Combining different image\nrepresentations is frequently required to analyze and make a decision properly.\nConsequently, multi-view data has emerged - datasets where each sample is\ndescribed by views from different angles, sources, or modalities. These\nproblems are addressed with the concept of multi-view learning. Understanding\nthe decision-making process of deep learning models is essential for reliable\nand credible analysis. Hence, many explainability methods have been devised\nrecently. Nonetheless, there is a lack of proper explainability in multi-view\nmodels, which are challenging to explain due to their architectures. In this\npaper, we suggest different multi-view architectures for the vision domain,\neach suited to another problem, and we also present a methodology for\nexplaining these models. To demonstrate the effectiveness of our methodology,\nwe focus on the domain of High Energy Density Physics (HEDP) experiments, where\nmultiple imaging representations are used to assess the quality of foam\nsamples. We apply our methodology to classify the foam samples quality using\nthe suggested multi-view architectures. Through experimental results, we\nshowcase the improvement of accurate architecture choice on both accuracy - 78%\nto 84% and AUC - 83% to 93% and present a trade-off between performance and\nexplainability. Specifically, we demonstrate that our approach enables the\nexplanation of individual one-view models, providing insights into the\ndecision-making process of each view. This understanding enhances the\ninterpretability of the overall multi-view model. The sources of this work are\navailable at:\nhttps://github.com/Scientific-Computing-Lab-NRCN/Multi-View-Explainability.\n","authors":["Nadav Schneider","Muriel Tzdaka","Galit Sturm","Guy Lazovski","Galit Bar","Gilad Oren","Raz Gvishi","Gal Oren"],"pdf_url":"https://arxiv.org/pdf/2308.08206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10813v2","updated":"2023-08-16T08:02:02Z","published":"2023-06-19T10:03:11Z","title":"Instruct-NeuralTalker: Editing Audio-Driven Talking Radiance Fields with\n Instructions","summary":" Recent neural talking radiance field methods have shown great success in\nphotorealistic audio-driven talking face synthesis. 
In this paper, we propose a\nnovel interactive framework that utilizes human instructions to edit such\nimplicit neural representations to achieve real-time personalized talking face\ngeneration. Given a short speech video, we first build an efficient talking\nradiance field, and then apply the latest conditional diffusion model for image\nediting based on the given instructions and guiding implicit representation\noptimization towards the editing target. To ensure audio-lip synchronization\nduring the editing process, we propose an iterative dataset updating strategy\nand utilize a lip-edge loss to constrain changes in the lip region. We also\nintroduce a lightweight refinement network for complementing image details and\nachieving controllable detail generation in the final rendered image. Our\nmethod also enables real-time rendering at up to 30FPS on consumer hardware.\nMultiple metrics and user verification show that our approach provides a\nsignificant improvement in rendering quality compared to state-of-the-art\nmethods.\n","authors":["Yuqi Sun","Ruian He","Weimin Tan","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2306.10813v2.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.08197v1","updated":"2023-08-16T07:57:35Z","published":"2023-08-16T07:57:35Z","title":"Self-Reference Deep Adaptive Curve Estimation for Low-Light Image\n Enhancement","summary":" In this paper, we propose a 2-stage low-light image enhancement method called\nSelf-Reference Deep Adaptive Curve Estimation (Self-DACE). In the first stage,\nwe present an intuitive, lightweight, fast, and unsupervised luminance\nenhancement algorithm. The algorithm is based on a novel low-light enhancement\ncurve that can be used to locally boost image brightness. We also propose a new\nloss function with a simplified physical model designed to preserve natural\nimages' color, structure, and fidelity. We use a vanilla CNN to map each pixel\nthrough deep Adaptive Adjustment Curves (AAC) while preserving the local image\nstructure. Secondly, we introduce the corresponding denoising scheme to remove\nthe latent noise in the darkness. We approximately model the noise in the dark\nand deploy a Denoising-Net to estimate and remove the noise after the first\nstage. Exhaustive qualitative and quantitative analysis shows that our method\noutperforms existing state-of-the-art algorithms on multiple real-world\ndatasets.\n","authors":["Jianyu Wen","Chenhao Wu","Tong Zhang","Yixuan Yu","Piotr Swierczynski"],"pdf_url":"https://arxiv.org/pdf/2308.08197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08192v1","updated":"2023-08-16T07:44:34Z","published":"2023-08-16T07:44:34Z","title":"Automatic Vision-Based Parking Slot Detection and Occupancy\n Classification","summary":" Parking guidance information (PGI) systems are used to provide information to\ndrivers about the nearest parking lots and the number of vacant parking slots.\nRecently, vision-based solutions started to appear as a cost-effective\nalternative to standard PGI systems based on hardware sensors mounted on each\nparking slot. Vision-based systems provide information about parking occupancy\nbased on images taken by a camera that is recording a parking lot. However,\nsuch systems are challenging to develop due to various possible viewpoints,\nweather conditions, and object occlusions. Most notably, they require manual\nlabeling of parking slot locations in the input image which is sensitive to\ncamera angle change, replacement, or maintenance. 
In this paper, the algorithm\nthat performs Automatic Parking Slot Detection and Occupancy Classification\n(APSD-OC) solely on input images is proposed. Automatic parking slot detection\nis based on vehicle detections in a series of parking lot images upon which\nclustering is applied in bird's eye view to detect parking slots. Once the\nparking slots positions are determined in the input image, each detected\nparking slot is classified as occupied or vacant using a specifically trained\nResNet34 deep classifier. The proposed approach is extensively evaluated on\nwell-known publicly available datasets (PKLot and CNRPark+EXT), showing high\nefficiency in parking slot detection and robustness to the presence of illegal\nparking or passing vehicles. Trained classifier achieves high accuracy in\nparking slot occupancy classification.\n","authors":["Ratko Grbić","Brando Koch"],"pdf_url":"https://arxiv.org/pdf/2308.08192v1.pdf","comment":"39 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2308.08182v1","updated":"2023-08-16T07:21:25Z","published":"2023-08-16T07:21:25Z","title":"Unsupervised Domain Adaptive Detection with Network Stability Analysis","summary":" Domain adaptive detection aims to improve the generality of a detector,\nlearned from the labeled source domain, on the unlabeled target domain. In this\nwork, drawing inspiration from the concept of stability from the control theory\nthat a robust system requires to remain consistent both externally and\ninternally regardless of disturbances, we propose a novel framework that\nachieves unsupervised domain adaptive detection through stability analysis. In\nspecific, we treat discrepancies between images and regions from different\ndomains as disturbances, and introduce a novel simple but effective Network\nStability Analysis (NSA) framework that considers various disturbances for\ndomain adaptation. Particularly, we explore three types of perturbations\nincluding heavy and light image-level disturbances and instancelevel\ndisturbance. For each type, NSA performs external consistency analysis on the\noutputs from raw and perturbed images and/or internal consistency analysis on\ntheir features, using teacher-student models. By integrating NSA into Faster\nR-CNN, we immediately achieve state-of-the-art results. In particular, we set a\nnew record of 52.7% mAP on Cityscapes-to-FoggyCityscapes, showing the potential\nof NSA for domain adaptive detection. It is worth noticing, our NSA is designed\nfor general purpose, and thus applicable to one-stage detection model (e.g.,\nFCOS) besides the adopted one, as shown by experiments.\nhttps://github.com/tiankongzhang/NSA.\n","authors":["Wenzhang Zhou","Heng Fan","Tiejian Luo","Libo Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01103v2","updated":"2023-08-16T07:12:41Z","published":"2022-12-02T11:31:49Z","title":"3D-TOGO: Towards Text-Guided Cross-Category 3D Object Generation","summary":" Text-guided 3D object generation aims to generate 3D objects described by\nuser-defined captions, which paves a flexible way to visualize what we\nimagined. 
Although some works have been devoted to solving this challenging\ntask, these works either utilize some explicit 3D representations (e.g., mesh),\nwhich lack texture and require post-processing for rendering photo-realistic\nviews; or require individual time-consuming optimization for every single case.\nHere, we make the first attempt to achieve generic text-guided cross-category\n3D object generation via a new 3D-TOGO model, which integrates a text-to-views\ngeneration module and a views-to-3D generation module. The text-to-views\ngeneration module is designed to generate different views of the target 3D\nobject given an input caption. Prior-guidance, caption-guidance and view\ncontrastive learning are proposed for achieving better view-consistency and\ncaption similarity. Meanwhile, a pixelNeRF model is adopted for the views-to-3D\ngeneration module to obtain the implicit 3D neural representation from the\npreviously-generated views. Our 3D-TOGO model generates 3D objects in the form\nof the neural radiance field with good texture and requires no time-cost\noptimization for every single caption. Besides, 3D-TOGO can control the\ncategory, color and shape of generated 3D objects with the input caption.\nExtensive experiments on the largest 3D object dataset (i.e., ABO) are\nconducted to verify that 3D-TOGO can better generate high-quality 3D objects\naccording to the input captions across 98 different categories, in terms of\nPSNR, SSIM, LPIPS and CLIP-score, compared with text-NeRF and Dreamfields.\n","authors":["Zutao Jiang","Guansong Lu","Xiaodan Liang","Jihua Zhu","Wei Zhang","Xiaojun Chang","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2212.01103v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08172v1","updated":"2023-08-16T07:02:02Z","published":"2023-08-16T07:02:02Z","title":"AATCT-IDS: A Benchmark Abdominal Adipose Tissue CT Image Dataset for\n Image Denoising, Semantic Segmentation, and Radiomics Evaluation","summary":" Methods: In this study, a benchmark \\emph{Abdominal Adipose Tissue CT Image\nDataset} (AATTCT-IDS) containing 300 subjects is prepared and published.\nAATTCT-IDS makes public 13,732 raw CT slices, and the researchers individually\nannotate the subcutaneous and visceral adipose tissue regions of 3,213 of those\nslices that have the same slice distance to validate denoising methods, train\nsemantic segmentation models, and study radiomics. For different tasks, this\npaper compares and analyzes the performance of various methods on AATTCT-IDS by\ncombining the visualization results and evaluation data. Thus, we verify the\nresearch potential of this data set in the above three types of tasks.\n Results: In the comparative study of image denoising, algorithms using a\nsmoothing strategy suppress mixed noise at the expense of image details and\nobtain better evaluation data. Methods such as BM3D preserve the original image\nstructure better, although the evaluation data are slightly lower. The results\nshow significant differences among them. In the comparative study of semantic\nsegmentation of abdominal adipose tissue, the segmentation results of adipose\ntissue by each model show different structural characteristics. Among them,\nBiSeNet obtains segmentation results only slightly inferior to U-Net with the\nshortest training time and effectively separates small and isolated adipose\ntissue. 
In addition, the radiomics study based on AATTCT-IDS reveals three\nadipose distributions in the subject population.\n Conclusion: AATTCT-IDS contains the ground truth of adipose tissue regions in\nabdominal CT slices. This open-source dataset can attract researchers to\nexplore the multi-dimensional characteristics of abdominal adipose tissue and\nthus help physicians and patients in clinical practice. AATCT-IDS is freely\npublished for non-commercial purpose at:\n\\url{https://figshare.com/articles/dataset/AATTCT-IDS/23807256}.\n","authors":["Zhiyu Ma","Chen Li","Tianming Du","Le Zhang","Dechao Tang","Deguo Ma","Shanchuan Huang","Yan Liu","Yihao Sun","Zhihao Chen","Jin Yuan","Qianqing Nie","Marcin Grzegorzek","Hongzan Sun"],"pdf_url":"https://arxiv.org/pdf/2308.08172v1.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.07163v2","updated":"2023-08-16T06:52:21Z","published":"2023-08-14T14:18:11Z","title":"HyperSparse Neural Networks: Shifting Exploration to Exploitation\n through Adaptive Regularization","summary":" Sparse neural networks are a key factor in developing resource-efficient\nmachine learning applications. We propose the novel and powerful sparse\nlearning method Adaptive Regularized Training (ART) to compress dense into\nsparse networks. Instead of the commonly used binary mask during training to\nreduce the number of model weights, we inherently shrink weights close to zero\nin an iterative manner with increasing weight regularization. Our method\ncompresses the pre-trained model knowledge into the weights of highest\nmagnitude. Therefore, we introduce a novel regularization loss named\nHyperSparse that exploits the highest weights while conserving the ability of\nweight exploration. Extensive experiments on CIFAR and TinyImageNet show that\nour method leads to notable performance gains compared to other sparsification\nmethods, especially in extremely high sparsity regimes up to 99.8 percent model\nsparsity. Additional investigations provide new insights into the patterns that\nare encoded in weights with high magnitudes.\n","authors":["Patrick Glandorf","Timo Kaiser","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2308.07163v2.pdf","comment":"ICCV'23 Workshops"},{"id":"http://arxiv.org/abs/2303.10385v2","updated":"2023-08-16T06:50:24Z","published":"2023-03-18T10:44:39Z","title":"Social Occlusion Inference with Vectorized Representation for Autonomous\n Driving","summary":" Autonomous vehicles must be capable of handling the occlusion of the\nenvironment to ensure safe and efficient driving. In urban environment,\nocclusion often arises due to other vehicles obscuring the perception of the\nego vehicle. Since the occlusion condition can impact the trajectories of\nvehicles, the behavior of other vehicles is helpful in making inferences about\nthe occlusion as a remedy for perceptual deficiencies. This paper introduces a\nnovel social occlusion inference approach that learns a mapping from agent\ntrajectories and scene context to an occupancy grid map (OGM) representing the\nview of ego vehicle. Specially, vectorized features are encoded through the\npolyline encoder to aggregate features of vectors into features of polylines. A\ntransformer module is then utilized to model the high-order interactions of\npolylines. Importantly, occlusion queries are proposed to fuse polyline\nfeatures and generate the OGM without the input of visual modality. 
To verify\nthe performance of vectorized representation, we design a baseline based on a\nfully transformer encoder-decoder architecture mapping the OGM with occlusion\nand historical trajectories information to the ground truth OGM. We evaluate\nour approach on an unsignalized intersection in the INTERACTION dataset, which\noutperforms the state-of-the-art results.\n","authors":["Bochao Huang"," Pin"],"pdf_url":"https://arxiv.org/pdf/2303.10385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08162v1","updated":"2023-08-16T06:09:51Z","published":"2023-08-16T06:09:51Z","title":"Interpretability Benchmark for Evaluating Spatial Misalignment of\n Prototypical Parts Explanations","summary":" Prototypical parts-based networks are becoming increasingly popular due to\ntheir faithful self-explanations. However, their similarity maps are calculated\nin the penultimate network layer. Therefore, the receptive field of the\nprototype activation region often depends on parts of the image outside this\nregion, which can lead to misleading interpretations. We name this undesired\nbehavior a spatial explanation misalignment and introduce an interpretability\nbenchmark with a set of dedicated metrics for quantifying this phenomenon. In\naddition, we propose a method for misalignment compensation and apply it to\nexisting state-of-the-art models. We show the expressiveness of our benchmark\nand the effectiveness of the proposed compensation methodology through\nextensive empirical studies.\n","authors":["Mikołaj Sacha","Bartosz Jura","Dawid Rymarczyk","Łukasz Struski","Jacek Tabor","Bartosz Zieliński"],"pdf_url":"https://arxiv.org/pdf/2308.08162v1.pdf","comment":"Under review. Code will be release upon acceptance"},{"id":"http://arxiv.org/abs/2308.08157v1","updated":"2023-08-16T05:59:33Z","published":"2023-08-16T05:59:33Z","title":"Learning to Generate Semantic Layouts for Higher Text-Image\n Correspondence in Text-to-Image Synthesis","summary":" Existing text-to-image generation approaches have set high standards for\nphotorealism and text-image correspondence, largely benefiting from web-scale\ntext-image datasets, which can include up to 5~billion pairs. However,\ntext-to-image generation models trained on domain-specific datasets, such as\nurban scenes, medical images, and faces, still suffer from low text-image\ncorrespondence due to the lack of text-image pairs. Additionally, collecting\nbillions of text-image pairs for a specific domain can be time-consuming and\ncostly. Thus, ensuring high text-image correspondence without relying on\nweb-scale text-image datasets remains a challenging task. In this paper, we\npresent a novel approach for enhancing text-image correspondence by leveraging\navailable semantic layouts. Specifically, we propose a Gaussian-categorical\ndiffusion process that simultaneously generates both images and corresponding\nlayout pairs. Our experiments reveal that we can guide text-to-image generation\nmodels to be aware of the semantics of different image regions, by training the\nmodel to generate semantic labels for each pixel. We demonstrate that our\napproach achieves higher text-image correspondence compared to existing\ntext-to-image generation approaches in the Multi-Modal CelebA-HQ and the\nCityscapes dataset, where text-image pairs are scarce. 
Codes are available in\nthis https://pmh9960.github.io/research/GCDP\n","authors":["Minho Park","Jooyeol Yun","Seunghwan Choi","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2308.08157v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08154v1","updated":"2023-08-16T05:57:09Z","published":"2023-08-16T05:57:09Z","title":"Conditional Perceptual Quality Preserving Image Compression","summary":" We propose conditional perceptual quality, an extension of the perceptual\nquality defined in \\citet{blau2018perception}, by conditioning it on user\ndefined information. Specifically, we extend the original perceptual quality\n$d(p_{X},p_{\\hat{X}})$ to the conditional perceptual quality\n$d(p_{X|Y},p_{\\hat{X}|Y})$, where $X$ is the original image, $\\hat{X}$ is the\nreconstructed, $Y$ is side information defined by user and $d(.,.)$ is\ndivergence. We show that conditional perceptual quality has similar theoretical\nproperties as rate-distortion-perception trade-off \\citep{blau2019rethinking}.\nBased on these theoretical results, we propose an optimal framework for\nconditional perceptual quality preserving compression. Experimental results\nshow that our codec successfully maintains high perceptual quality and semantic\nquality at all bitrate. Besides, by providing a lowerbound of common randomness\nrequired, we settle the previous arguments on whether randomness should be\nincorporated into generator for (conditional) perceptual quality compression.\nThe source code is provided in supplementary material.\n","authors":["Tongda Xu","Qian Zhang","Yanghao Li","Dailan He","Zhe Wang","Yuanyuan Wang","Hongwei Qin","Yan Wang","Jingjing Liu","Ya-Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12027v2","updated":"2023-08-16T05:34:20Z","published":"2023-07-22T09:19:26Z","title":"On the Effectiveness of Spectral Discriminators for Perceptual Quality\n Improvement","summary":" Several recent studies advocate the use of spectral discriminators, which\nevaluate the Fourier spectra of images for generative modeling. However, the\neffectiveness of the spectral discriminators is not well interpreted yet. We\ntackle this issue by examining the spectral discriminators in the context of\nperceptual image super-resolution (i.e., GAN-based SR), as SR image quality is\nsusceptible to spectral changes. Our analyses reveal that the spectral\ndiscriminator indeed performs better than the ordinary (a.k.a. spatial)\ndiscriminator in identifying the differences in the high-frequency range;\nhowever, the spatial discriminator holds an advantage in the low-frequency\nrange. Thus, we suggest that the spectral and spatial discriminators shall be\nused simultaneously. Moreover, we improve the spectral discriminators by first\ncalculating the patch-wise Fourier spectrum and then aggregating the spectra by\nTransformer. We verify the effectiveness of the proposed method twofold. On the\none hand, thanks to the additional spectral discriminator, our obtained SR\nimages have their spectra better aligned to those of the real images, which\nleads to a better PD tradeoff. On the other hand, our ensembled discriminator\npredicts the perceptual quality more accurately, as evidenced in the\nno-reference image quality assessment task.\n","authors":["Xin Luo","Yunan Zhu","Shunxin Xu","Dong Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12027v2.pdf","comment":"Accepted to ICCV 2023. 
Code and Models are publicly available at\n https://github.com/Luciennnnnnn/DualFormer"},{"id":"http://arxiv.org/abs/2308.07687v2","updated":"2023-08-16T05:24:46Z","published":"2023-08-15T10:37:04Z","title":"DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using\n Pre-trained Diffusion Models","summary":" Given a classifier, the inherent property of semantic Out-of-Distribution\n(OOD) samples is that their contents differ from all legal classes in terms of\nsemantics, namely semantic mismatch. There is a recent work that directly\napplies it to OOD detection, which employs a conditional Generative Adversarial\nNetwork (cGAN) to enlarge semantic mismatch in the image space. While achieving\nremarkable OOD detection performance on small datasets, it is not applicable to\nImageNet-scale datasets due to the difficulty in training cGANs with both input\nimages and labels as conditions. As diffusion models are much easier to train\nand amenable to various conditions compared to cGANs, in this work, we propose\nto directly use pre-trained diffusion models for semantic mismatch-guided OOD\ndetection, named DiffGuard. Specifically, given an OOD input image and the\npredicted label from the classifier, we try to enlarge the semantic difference\nbetween the reconstructed OOD image under these conditions and the original\ninput image. We also present several test-time techniques to further strengthen\nsuch differences. Experimental results show that DiffGuard is effective on both\nCifar-10 and hard cases of the large-scale ImageNet, and it can be easily\ncombined with existing OOD detection techniques to achieve state-of-the-art OOD\ndetection results.\n","authors":["Ruiyuan Gao","Chenchen Zhao","Lanqing Hong","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.07687v2.pdf","comment":"Accepted by ICCV2023, with supplementary materials"},{"id":"http://arxiv.org/abs/2307.16586v3","updated":"2023-08-16T04:53:42Z","published":"2023-07-31T11:40:53Z","title":"SAMFlow: Eliminating Any Fragmentation in Optical Flow with Segment\n Anything Model","summary":" Optical Flow Estimation aims to find the 2D dense motion field between two\nframes. Due to the limitation of model structures and training datasets,\nexisting methods often rely too much on local clues and ignore the integrity of\nobjects, resulting in fragmented motion estimation. Through theoretical\nanalysis, we find the pre-trained large vision models are helpful in optical\nflow estimation, and we notice that the recently famous Segment Anything Model\n(SAM) demonstrates a strong ability to segment complete objects, which is\nsuitable for solving the fragmentation problem. We thus propose a solution to\nembed the frozen SAM image encoder into FlowFormer to enhance object\nperception. To address the challenge of in-depth utilizing SAM in\nnon-segmentation tasks like optical flow estimation, we propose an Optical Flow\nTask-Specific Adaption scheme, including a Context Fusion Module to fuse the\nSAM encoder with the optical flow context encoder, and a Context Adaption\nModule to adapt the SAM features for optical flow task with Learned\nTask-Specific Embedding. Our proposed SAMFlow model reaches 0.86/2.10\nclean/final EPE and 3.55/12.32 EPE/F1-all on Sintel and KITTI-15 training set,\nsurpassing Flowformer by 8.5%/9.9% and 13.2%/16.3%. 
Furthermore, our model\nachieves state-of-the-art performance on the Sintel and KITTI-15 benchmarks,\nranking #1 among all two-frame methods on Sintel clean pass.\n","authors":["Shili Zhou","Ruian He","Weimin Tan","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2307.16586v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08143v1","updated":"2023-08-16T04:31:33Z","published":"2023-08-16T04:31:33Z","title":"SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech\n Separation","summary":" The integration of different modalities, such as audio and visual\ninformation, plays a crucial role in human perception of the surrounding\nenvironment. Recent research has made significant progress in designing fusion\nmodules for audio-visual speech separation. However, they predominantly focus\non multi-modal fusion architectures situated either at the top or bottom\npositions, rather than comprehensively considering multi-modal fusion at\nvarious hierarchical positions within the network. In this paper, we propose a\nnovel model called self- and cross-attention network (SCANet), which leverages\nthe attention mechanism for efficient audio-visual feature fusion. SCANet\nconsists of two types of attention blocks: self-attention (SA) and\ncross-attention (CA) blocks, where the CA blocks are distributed at the top\n(TCA), middle (MCA) and bottom (BCA) of SCANet. These blocks maintain the\nability to learn modality-specific features and enable the extraction of\ndifferent semantics from audio-visual features. Comprehensive experiments on\nthree standard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2)\ndemonstrate the effectiveness of SCANet, outperforming existing\nstate-of-the-art (SOTA) methods while maintaining comparable inference time.\n","authors":["Kai Li","Runxuan Yang","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.08143v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.08142v1","updated":"2023-08-16T04:27:44Z","published":"2023-08-16T04:27:44Z","title":"S2R: Exploring a Double-Win Transformer-Based Framework for Ideal and\n Blind Super-Resolution","summary":" Nowadays, deep learning based methods have demonstrated impressive\nperformance on ideal super-resolution (SR) datasets, but most of these methods\nincur dramatically performance drops when directly applied in real-world SR\nreconstruction tasks with unpredictable blur kernels. To tackle this issue,\nblind SR methods are proposed to improve the visual results on random blur\nkernels, which causes unsatisfactory reconstruction effects on ideal\nlow-resolution images similarly. In this paper, we propose a double-win\nframework for ideal and blind SR task, named S2R, including a light-weight\ntransformer-based SR model (S2R transformer) and a novel coarse-to-fine\ntraining strategy, which can achieve excellent visual results on both ideal and\nrandom fuzzy conditions. On algorithm level, S2R transformer smartly combines\nsome efficient and light-weight blocks to enhance the representation ability of\nextracted features with relatively low number of parameters. 
For training\nstrategy, a coarse-level learning process is firstly performed to improve the\ngeneralization of the network with the help of a large-scale external dataset,\nand then, a fast fine-tune process is developed to transfer the pre-trained\nmodel to real-world SR tasks by mining the internal features of the image.\nExperimental results show that the proposed S2R outperforms other single-image\nSR models in ideal SR condition with only 578K parameters. Meanwhile, it can\nachieve better visual results than regular blind SR models in blind fuzzy\nconditions with only 10 gradient updates, which improve convergence speed by\n300 times, significantly accelerating the transfer-learning process in\nreal-world situations.\n","authors":["Minghao She","Wendong Mao","Huihong Shi","Zhongfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08140v1","updated":"2023-08-16T04:15:21Z","published":"2023-08-16T04:15:21Z","title":"GPA-3D: Geometry-aware Prototype Alignment for Unsupervised Domain\n Adaptive 3D Object Detection from Point Clouds","summary":" LiDAR-based 3D detection has made great progress in recent years. However,\nthe performance of 3D detectors is considerably limited when deployed in unseen\nenvironments, owing to the severe domain gap problem. Existing domain adaptive\n3D detection methods do not adequately consider the problem of the\ndistributional discrepancy in feature space, thereby hindering generalization\nof detectors across domains. In this work, we propose a novel unsupervised\ndomain adaptive \\textbf{3D} detection framework, namely \\textbf{G}eometry-aware\n\\textbf{P}rototype \\textbf{A}lignment (\\textbf{GPA-3D}), which explicitly\nleverages the intrinsic geometric relationship from point cloud objects to\nreduce the feature discrepancy, thus facilitating cross-domain transferring.\nSpecifically, GPA-3D assigns a series of tailored and learnable prototypes to\npoint cloud objects with distinct geometric structures. Each prototype aligns\nBEV (bird's-eye-view) features derived from corresponding point cloud objects\non source and target domains, reducing the distributional discrepancy and\nachieving better adaptation. The evaluation results obtained on various\nbenchmarks, including Waymo, nuScenes and KITTI, demonstrate the superiority of\nour GPA-3D over the state-of-the-art approaches for different adaptation\nscenarios. The MindSpore version code will be publicly available at\n\\url{https://github.com/Liz66666/GPA3D}.\n","authors":["Ziyu Li","Jingming Guo","Tongtong Cao","Liu Bingbing","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08140v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08137v1","updated":"2023-08-16T04:03:59Z","published":"2023-08-16T04:03:59Z","title":"SYENet: A Simple Yet Effective Network for Multiple Low-Level Vision\n Tasks with Real-time Performance on Mobile Device","summary":" With the rapid development of AI hardware accelerators, applying deep\nlearning-based algorithms to solve various low-level vision tasks on mobile\ndevices has gradually become possible. However, two main problems still need to\nbe solved: task-specific algorithms make it difficult to integrate them into a\nsingle neural network architecture, and large amounts of parameters make it\ndifficult to achieve real-time inference. 
To tackle these problems, we propose\na novel network, SYENet, with only $~$6K parameters, to handle multiple\nlow-level vision tasks on mobile devices in a real-time manner. The SYENet\nconsists of two asymmetrical branches with simple building blocks. To\neffectively connect the results by asymmetrical branches, a Quadratic\nConnection Unit(QCU) is proposed. Furthermore, to improve performance, a new\nOutlier-Aware Loss is proposed to process the image. The proposed method proves\nits superior performance with the best PSNR as compared with other networks in\nreal-time applications such as Image Signal Processing(ISP), Low-Light\nEnhancement(LLE), and Super-Resolution(SR) with 2K60FPS throughput on Qualcomm\n8 Gen 1 mobile SoC(System-on-Chip). Particularly, for ISP task, SYENet got the\nhighest score in MAI 2022 Learned Smartphone ISP challenge.\n","authors":["Weiran Gou","Ziyao Yi","Yan Xiang","Shaoqing Li","Zibin Liu","Dehui Kong","Ke Xu"],"pdf_url":"https://arxiv.org/pdf/2308.08137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08131v1","updated":"2023-08-16T03:48:19Z","published":"2023-08-16T03:48:19Z","title":"Ranking-aware Uncertainty for Text-guided Image Retrieval","summary":" Text-guided image retrieval is to incorporate conditional text to better\ncapture users' intent. Traditionally, the existing methods focus on minimizing\nthe embedding distances between the source inputs and the targeted image, using\nthe provided triplets $\\langle$source image, source text, target\nimage$\\rangle$. However, such triplet optimization may limit the learned\nretrieval model to capture more detailed ranking information, e.g., the\ntriplets are one-to-one correspondences and they fail to account for\nmany-to-many correspondences arising from semantic diversity in feedback\nlanguages and images. To capture more ranking information, we propose a novel\nranking-aware uncertainty approach to model many-to-many correspondences by\nonly using the provided triplets. We introduce uncertainty learning to learn\nthe stochastic ranking list of features. Specifically, our approach mainly\ncomprises three components: (1) In-sample uncertainty, which aims to capture\nsemantic diversity using a Gaussian distribution derived from both combined and\ntarget features; (2) Cross-sample uncertainty, which further mines the ranking\ninformation from other samples' distributions; and (3) Distribution\nregularization, which aligns the distributional representations of source\ninputs and targeted image. Compared to the existing state-of-the-art methods,\nour proposed method achieves significant results on two public datasets for\ncomposed image retrieval.\n","authors":["Junyang Chen","Hanjiang Lai"],"pdf_url":"https://arxiv.org/pdf/2308.08131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06567v2","updated":"2023-08-16T03:45:46Z","published":"2023-01-16T19:04:23Z","title":"Scalable Surface Water Mapping up to Fine-scale using Geometric Features\n of Water from Topographic Airborne LiDAR Data","summary":" Despite substantial technological advancements, the comprehensive mapping of\nsurface water, particularly smaller bodies (<1ha), continues to be a challenge\ndue to a lack of robust, scalable methods. Standard methods require either\ntraining labels or site-specific parameter tuning, which complicates automated\nmapping and introduces biases related to training data and parameters. 
The\nreliance on water's reflectance properties, including LiDAR intensity, further\ncomplicates the matter, as higher-resolution images inherently produce more\nnoise. To mitigate these difficulties, we propose a unique method that focuses\non the geometric characteristics of water instead of its variable reflectance\nproperties. Unlike preceding approaches, our approach relies entirely on 3D\ncoordinate observations from airborne LiDAR data, taking advantage of the\nprinciple that connected surface water remains flat due to gravity. By\nharnessing this natural law in conjunction with connectivity, our method can\naccurately and scalably identify small water bodies, eliminating the need for\ntraining labels or repetitive parameter tuning. Consequently, our approach\nenables the creation of comprehensive 3D topographic maps that include both\nwater and terrain, all performed in an unsupervised manner using only airborne\nlaser scanning data, potentially enhancing the process of generating reliable\n3D topographic maps. We validated our method across extensive and diverse\nlandscapes, while comparing it to highly competitive Normalized Difference\nWater Index (NDWI)-based methods and assessing it using a reference surface\nwater map. In conclusion, our method offers a new approach to address\npersistent difficulties in robust, scalable surface water mapping and 3D\ntopographic mapping, using solely airborne LiDAR data.\n","authors":["Hunsoo Song","Jinha Jung"],"pdf_url":"https://arxiv.org/pdf/2301.06567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.14585v3","updated":"2023-08-16T03:40:41Z","published":"2022-05-29T07:00:34Z","title":"An unsupervised, open-source workflow for 2D and 3D building mapping\n from airborne LiDAR data","summary":" Despite the substantial demand for high-quality, large-area building maps, no\nestablished open-source workflow for generating 2D and 3D maps currently\nexists. This study introduces an automated, open-source workflow for\nlarge-scale 2D and 3D building mapping utilizing airborne LiDAR data. Uniquely,\nour workflow operates entirely unsupervised, eliminating the need for any\ntraining procedures. We have integrated a specifically tailored DTM generation\nalgorithm into our workflow to prevent errors in complex urban landscapes,\nespecially around highways and overpasses. Through fine rasterization of LiDAR\npoint clouds, we've enhanced building-tree differentiation, reduced errors near\nwater bodies, and augmented computational efficiency by introducing a new\nplanarity calculation. Our workflow offers a practical and scalable solution\nfor the mass production of rasterized 2D and 3D building maps from raw airborne\nLiDAR data. Also, we elaborate on the influence of parameters and potential\nerror sources to provide users with practical guidance. Our method's robustness\nhas been rigorously optimized and tested using an extensive dataset (> 550\nkm$^2$), and further validated through comparison with deep learning-based and\nhand-digitized products. Notably, through these unparalleled, large-scale\ncomparisons, we offer a valuable analysis of large-scale building maps\ngenerated via different methodologies, providing insightful evaluations of the\neffectiveness of each approach. We anticipate that our highly scalable building\nmapping workflow will facilitate the production of reliable 2D and 3D building\nmaps, fostering advances in large-scale urban analysis. 
The code will be\nreleased upon publication.\n","authors":["Hunsoo Song","Jinha Jung"],"pdf_url":"https://arxiv.org/pdf/2205.14585v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08114v1","updated":"2023-08-16T02:58:43Z","published":"2023-08-16T02:58:43Z","title":"OmniZoomer: Learning to Move and Zoom in on Sphere at High-Resolution","summary":" Omnidirectional images (ODIs) have become increasingly popular, as their\nlarge field-of-view (FoV) can offer viewers the chance to freely choose the\nview directions in immersive environments such as virtual reality. The M\\\"obius\ntransformation is typically employed to further provide the opportunity for\nmovement and zoom on ODIs, but applying it to the image level often results in\nblurry effect and aliasing problem. In this paper, we propose a novel deep\nlearning-based approach, called \\textbf{OmniZoomer}, to incorporate the\nM\\\"obius transformation into the network for movement and zoom on ODIs. By\nlearning various transformed feature maps under different conditions, the\nnetwork is enhanced to handle the increasing edge curvatures, which alleviates\nthe blurry effect. Moreover, to address the aliasing problem, we propose two\nkey components. Firstly, to compensate for the lack of pixels for describing\ncurves, we enhance the feature maps in the high-resolution (HR) space and\ncalculate the transformed index map with a spatial index generation module.\nSecondly, considering that ODIs are inherently represented in the spherical\nspace, we propose a spherical resampling module that combines the index map and\nHR feature maps to transform the feature maps for better spherical correlation.\nThe transformed feature maps are decoded to output a zoomed ODI. Experiments\nshow that our method can produce HR and high-quality ODIs with the flexibility\nto move and zoom in to the object of interest. Project page is available at\nhttp://vlislab22.github.io/OmniZoomer/.\n","authors":["Zidong Cao","Hao Ai","Yan-Pei Cao","Ying Shan","Xiaohu Qie","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08114v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08110v1","updated":"2023-08-16T02:51:52Z","published":"2023-08-16T02:51:52Z","title":"View Consistent Purification for Accurate Cross-View Localization","summary":" This paper proposes a fine-grained self-localization method for outdoor\nrobotics that utilizes a flexible number of onboard cameras and readily\naccessible satellite images. The proposed method addresses limitations in\nexisting cross-view localization methods that struggle to handle noise sources\nsuch as moving objects and seasonal variations. It is the first sparse\nvisual-only method that enhances perception in dynamic environments by\ndetecting view-consistent key points and their corresponding deep features from\nground and satellite views, while removing off-the-ground objects and\nestablishing homography transformation between the two views. Moreover, the\nproposed method incorporates a spatial embedding approach that leverages camera\nintrinsic and extrinsic information to reduce the ambiguity of purely visual\nmatching, leading to improved feature matching and overall pose estimation\naccuracy. The method exhibits strong generalization and is robust to\nenvironmental changes, requiring only geo-poses as ground truth. 
Extensive\nexperiments on the KITTI and Ford Multi-AV Seasonal datasets demonstrate that\nour proposed method outperforms existing state-of-the-art methods, achieving\nmedian spatial accuracy errors below $0.5$ meters along the lateral and\nlongitudinal directions, and a median orientation accuracy error below 2\ndegrees.\n","authors":["Shan Wang","Yanhao Zhang","Akhil Perincherry","Ankit Vora","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2308.08110v1.pdf","comment":"Accepted for ICCV 2023"},{"id":"http://arxiv.org/abs/2307.14051v4","updated":"2023-08-16T02:29:50Z","published":"2023-07-26T09:04:27Z","title":"3D Semantic Subspace Traverser: Empowering 3D Generative Model with\n Shape Editing Capability","summary":" Shape generation is the practice of producing 3D shapes as various\nrepresentations for 3D content creation. Previous studies on 3D shape\ngeneration have focused on shape quality and structure, without or less\nconsidering the importance of semantic information. Consequently, such\ngenerative models often fail to preserve the semantic consistency of shape\nstructure or enable manipulation of the semantic attributes of shapes during\ngeneration. In this paper, we proposed a novel semantic generative model named\n3D Semantic Subspace Traverser that utilizes semantic attributes for\ncategory-specific 3D shape generation and editing. Our method utilizes implicit\nfunctions as the 3D shape representation and combines a novel latent-space GAN\nwith a linear subspace model to discover semantic dimensions in the local\nlatent space of 3D shapes. Each dimension of the subspace corresponds to a\nparticular semantic attribute, and we can edit the attributes of generated\nshapes by traversing the coefficients of those dimensions. Experimental results\ndemonstrate that our method can produce plausible shapes with complex\nstructures and enable the editing of semantic attributes. The code and trained\nmodels are available at\nhttps://github.com/TrepangCat/3D_Semantic_Subspace_Traverser\n","authors":["Ruowei Wang","Yu Liu","Pei Su","Jianwei Zhang","Qijun Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.14051v4.pdf","comment":"Published in ICCV 2023. Code:\n https://github.com/TrepangCat/3D_Semantic_Subspace_Traverser"},{"id":"http://arxiv.org/abs/2308.08094v1","updated":"2023-08-16T02:04:34Z","published":"2023-08-16T02:04:34Z","title":"Snapshot High Dynamic Range Imaging with a Polarization Camera","summary":" High dynamic range (HDR) images are important for a range of tasks, from\nnavigation to consumer photography. Accordingly, a host of specialized HDR\nsensors have been developed, the most successful of which are based on\ncapturing variable per-pixel exposures. In essence, these methods capture an\nentire exposure bracket sequence at once in a single shot. This paper presents\na straightforward but highly effective approach for turning an off-the-shelf\npolarization camera into a high-performance HDR camera. By placing a linear\npolarizer in front of the polarization camera, we are able to simultaneously\ncapture four images with varied exposures, which are determined by the\norientation of the polarizer. We develop an outlier-robust and self-calibrating\nalgorithm to reconstruct an HDR image (at a single polarity) from these\nmeasurements. 
Finally, we demonstrate the efficacy of our approach with\nextensive real-world experiments.\n","authors":["Mingyang Xie","Matthew Chan","Christopher Metzler"],"pdf_url":"https://arxiv.org/pdf/2308.08094v1.pdf","comment":"9 pages, 10 figures"},{"id":"http://arxiv.org/abs/2304.06906v3","updated":"2023-08-16T01:53:02Z","published":"2023-04-14T02:49:08Z","title":"Swin3D: A Pretrained Transformer Backbone for 3D Indoor Scene\n Understanding","summary":" The use of pretrained backbones with fine-tuning has been successful for 2D\nvision and natural language processing tasks, showing advantages over\ntask-specific networks. In this work, we introduce a pretrained 3D backbone,\ncalled {\\SST}, for 3D indoor scene understanding. We design a 3D Swin\ntransformer as our backbone network, which enables efficient self-attention on\nsparse voxels with linear memory complexity, making the backbone scalable to\nlarge models and datasets. We also introduce a generalized contextual relative\npositional embedding scheme to capture various irregularities of point signals\nfor improved network performance. We pretrained a large {\\SST} model on a\nsynthetic Structured3D dataset, which is an order of magnitude larger than the\nScanNet dataset. Our model pretrained on the synthetic dataset not only\ngeneralizes well to downstream segmentation and detection on real 3D point\ndatasets, but also outperforms state-of-the-art methods on downstream tasks\nwith +2.3 mIoU and +2.2 mIoU on S3DIS Area5 and 6-fold semantic segmentation,\n+1.8 mIoU on ScanNet segmentation (val), +1.9 mAP@0.5 on ScanNet detection, and\n+8.1 mAP@0.5 on S3DIS detection. A series of extensive ablation studies further\nvalidate the scalability, generality, and superior performance enabled by our\napproach. The code and models are available at\nhttps://github.com/microsoft/Swin3D .\n","authors":["Yu-Qi Yang","Yu-Xiao Guo","Jian-Yu Xiong","Yang Liu","Hao Pan","Peng-Shuai Wang","Xin Tong","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2304.06906v3.pdf","comment":"Project page: https://yukichiii.github.io/project/swin3D/swin3D.html"},{"id":"http://arxiv.org/abs/2308.08089v1","updated":"2023-08-16T01:43:41Z","published":"2023-08-16T01:43:41Z","title":"DragNUWA: Fine-grained Control in Video Generation by Integrating Text,\n Image, and Trajectory","summary":" Controllable video generation has gained significant attention in recent\nyears. However, two main limitations persist: Firstly, most existing works\nfocus on either text, image, or trajectory-based control, leading to an\ninability to achieve fine-grained control in videos. Secondly, trajectory\ncontrol research is still in its early stages, with most experiments being\nconducted on simple datasets like Human3.6M. This constraint limits the models'\ncapability to process open-domain images and effectively handle complex curved\ntrajectories. In this paper, we propose DragNUWA, an open-domain\ndiffusion-based video generation model. To tackle the issue of insufficient\ncontrol granularity in existing works, we simultaneously introduce text, image,\nand trajectory information to provide fine-grained control over video content\nfrom semantic, spatial, and temporal perspectives. 
To resolve the problem of\nlimited open-domain trajectory control in current research, We propose\ntrajectory modeling with three aspects: a Trajectory Sampler (TS) to enable\nopen-domain control of arbitrary trajectories, a Multiscale Fusion (MF) to\ncontrol trajectories in different granularities, and an Adaptive Training (AT)\nstrategy to generate consistent videos following trajectories. Our experiments\nvalidate the effectiveness of DragNUWA, demonstrating its superior performance\nin fine-grained control in video generation. The homepage link is\n\\url{https://www.microsoft.com/en-us/research/project/dragnuwa/}\n","authors":["Shengming Yin","Chenfei Wu","Jian Liang","Jie Shi","Houqiang Li","Gong Ming","Nan Duan"],"pdf_url":"https://arxiv.org/pdf/2308.08089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08088v1","updated":"2023-08-16T01:38:49Z","published":"2023-08-16T01:38:49Z","title":"Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme\n Detection","summary":" Hateful meme detection is a challenging multimodal task that requires\ncomprehension of both vision and language, as well as cross-modal interactions.\nRecent studies have tried to fine-tune pre-trained vision-language models\n(PVLMs) for this task. However, with increasing model sizes, it becomes\nimportant to leverage powerful PVLMs more efficiently, rather than simply\nfine-tuning them. Recently, researchers have attempted to convert meme images\ninto textual captions and prompt language models for predictions. This approach\nhas shown good performance but suffers from non-informative image captions.\nConsidering the two factors mentioned above, we propose a probing-based\ncaptioning approach to leverage PVLMs in a zero-shot visual question answering\n(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful\ncontent-related questions and use the answers as image captions (which we call\nPro-Cap), so that the captions contain information critical for hateful content\ndetection. The good performance of models with Pro-Cap on three benchmarks\nvalidates the effectiveness and generalization of the proposed method.\n","authors":["Rui Cao","Ming Shan Hee","Adriel Kuek","Wen-Haw Chong","Roy Ka-Wei Lee","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.08088v1.pdf","comment":"Camera-ready for 23, ACM MM"},{"id":"http://arxiv.org/abs/2308.07439v2","updated":"2023-08-16T01:29:39Z","published":"2023-08-14T20:20:26Z","title":"Interaction-Aware Personalized Vehicle Trajectory Prediction Using\n Temporal Graph Neural Networks","summary":" Accurate prediction of vehicle trajectories is vital for advanced driver\nassistance systems and autonomous vehicles. Existing methods mainly rely on\ngeneric trajectory predictions derived from large datasets, overlooking the\npersonalized driving patterns of individual drivers. To address this gap, we\npropose an approach for interaction-aware personalized vehicle trajectory\nprediction that incorporates temporal graph neural networks. Our method\nutilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to\nmodel the spatio-temporal interactions between target vehicles and their\nsurrounding traffic. To personalize the predictions, we establish a pipeline\nthat leverages transfer learning: the model is initially pre-trained on a\nlarge-scale trajectory dataset and then fine-tuned for each driver using their\nspecific driving data. 
We employ human-in-the-loop simulation to collect\npersonalized naturalistic driving trajectories and corresponding surrounding\nvehicle trajectories. Experimental results demonstrate the superior performance\nof our personalized GCN-LSTM model, particularly for longer prediction\nhorizons, compared to its generic counterpart. Moreover, the personalized model\noutperforms individual models created without pre-training, emphasizing the\nsignificance of pre-training on a large dataset to avoid overfitting. By\nincorporating personalization, our approach enhances trajectory prediction\naccuracy.\n","authors":["Amr Abdelraouf","Rohit Gupta","Kyungtae Han"],"pdf_url":"https://arxiv.org/pdf/2308.07439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16751v2","updated":"2023-08-16T01:25:03Z","published":"2023-07-31T15:18:54Z","title":"High-Performance Fine Defect Detection in Artificial Leather Using Dual\n Feature Pool Object Detection","summary":" In this study, the structural problems of the YOLOv5 model were analyzed\nemphatically. Based on the characteristics of fine defects in artificial\nleather, four innovative structures, namely DFP, IFF, AMP, and EOS, were\ndesigned. These advancements led to the proposal of a high-performance\nartificial leather fine defect detection model named YOLOD. YOLOD demonstrated\noutstanding performance on the artificial leather defect dataset, achieving an\nimpressive increase of 11.7% - 13.5% in AP_50 compared to YOLOv5, along with a\nsignificant reduction of 5.2% - 7.2% in the error detection rate. Moreover,\nYOLOD also exhibited remarkable performance on the general MS-COCO dataset,\nwith an increase of 0.4% - 2.6% in AP compared to YOLOv5, and a rise of 2.5% -\n4.1% in AP_S compared to YOLOv5. These results demonstrate the superiority of\nYOLOD in both artificial leather defect detection and general object detection\ntasks, making it a highly efficient and effective model for real-world\napplications.\n","authors":["Lin Huang","Weisheng Li","Linlin Shen","Xue Xiao","Suihan Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.16751v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04170v4","updated":"2023-08-16T01:10:43Z","published":"2023-05-07T03:00:06Z","title":"YOLOCS: Object Detection based on Dense Channel Compression for Feature\n Spatial Solidification","summary":" In this study, we examine the associations between channel features and\nconvolutional kernels during the processes of feature purification and gradient\nbackpropagation, with a focus on the forward and backward propagation within\nthe network. Consequently, we propose a method called Dense Channel Compression\nfor Feature Spatial Solidification. Drawing upon the central concept of this\nmethod, we introduce two innovative modules for backbone and head networks: the\nDense Channel Compression for Feature Spatial Solidification Structure (DCFS)\nand the Asymmetric Multi-Level Compression Decoupled Head (ADH). When\nintegrated into the YOLOv5 model, these two modules demonstrate exceptional\nperformance, resulting in a modified model referred to as YOLOCS. Evaluated on\nthe MSCOCO dataset, the large, medium, and small YOLOCS models yield AP of\n50.1%, 47.6%, and 42.5%, respectively. 
Maintaining inference speeds remarkably\nsimilar to those of the YOLOv5 model, the large, medium, and small YOLOCS\nmodels surpass the YOLOv5 model's AP by 1.1%, 2.3%, and 5.2%, respectively.\n","authors":["Lin Huang","Weisheng Li","Linlin Shen","Haojie Fu","Xue Xiao","Suihan Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.04170v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13855v2","updated":"2023-08-16T00:57:52Z","published":"2023-05-23T09:23:05Z","title":"A Two-Step Deep Learning Method for 3DCT-2DUS Kidney Registration During\n Breathing","summary":" This work proposed a novel deep registration pipeline for 3D CT and 2D U/S\nkidney scans of free breathing, which consists of a feature network, and a\n3D-2D CNN-based registration network. The feature network has handcraft texture\nfeature layers to reduce the semantic gap. The registration network is\nencoder-decoder structure with loss of feature-image-motion (FIM), which\nenables hierarchical regression at decoder layers and avoids multiple network\nconcatenation. It was first pretrained with retrospective datasets cum training\ndata generation strategy, then adapted to specific patient data under\nunsupervised one-cycle transfer learning in onsite application. The experiment\nwas on 132 U/S sequences, 39 multiple phase CT and 210 public single phase CT\nimages, and 25 pairs of CT and U/S sequences. It resulted in mean contour\ndistance (MCD) of 0.94 mm between kidneys on CT and U/S images and MCD of 1.15\nmm on CT and reference CT images. For datasets with small transformations, it\nresulted in MCD of 0.82 and 1.02 mm respectively. For large transformations, it\nresulted in MCD of 1.10 and 1.28 mm respectively. This work addressed\ndifficulties in 3DCT-2DUS kidney registration during free breathing via novel\nnetwork structures and training strategy.\n","authors":["Chi Yanling","Xu Yuyu","Liu Huiying","Wu Xiaoxiang","Liu Zhiqiang","Mao Jiawei","Xu Guibin","Huang Weimin"],"pdf_url":"https://arxiv.org/pdf/2305.13855v2.pdf","comment":"16 pages, 8 figures, 10 tables"},{"id":"http://arxiv.org/abs/2307.04246v3","updated":"2023-08-16T00:46:04Z","published":"2023-07-09T18:52:01Z","title":"Convex Decomposition of Indoor Scenes","summary":" We describe a method to parse a complex, cluttered indoor scene into\nprimitives which offer a parsimonious abstraction of scene structure. Our\nprimitives are simple convexes. Our method uses a learned regression procedure\nto parse a scene into a fixed number of convexes from RGBD input, and can\noptionally accept segmentations to improve the decomposition. The result is\nthen polished with a descent method which adjusts the convexes to produce a\nvery good fit, and greedily removes superfluous primitives. Because the entire\nscene is parsed, we can evaluate using traditional depth, normal, and\nsegmentation error metrics. Our evaluation procedure demonstrates that the\nerror from our primitive representation is comparable to that of predicting\ndepth from a single image.\n","authors":["Vaibhav Vavilala","David Forsyth"],"pdf_url":"https://arxiv.org/pdf/2307.04246v3.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2303.13516v3","updated":"2023-08-16T00:00:47Z","published":"2023-03-23T17:59:42Z","title":"Ablating Concepts in Text-to-Image Diffusion Models","summary":" Large-scale text-to-image diffusion models can generate high-fidelity images\nwith powerful compositional ability. 
However, these models are typically\ntrained on an enormous amount of Internet data, often containing copyrighted\nmaterial, licensed images, and personal photos. Furthermore, they have been\nfound to replicate the style of various living artists or memorize exact\ntraining samples. How can we remove such copyrighted concepts or images without\nretraining the model from scratch? To achieve this goal, we propose an\nefficient method of ablating concepts in the pretrained model, i.e., preventing\nthe generation of a target concept. Our algorithm learns to match the image\ndistribution for a target style, instance, or text prompt we wish to ablate to\nthe distribution corresponding to an anchor concept. This prevents the model\nfrom generating target concepts given its text condition. Extensive experiments\nshow that our method can successfully prevent the generation of the ablated\nconcept while preserving closely related concepts in the model.\n","authors":["Nupur Kumari","Bingliang Zhang","Sheng-Yu Wang","Eli Shechtman","Richard Zhang","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.13516v3.pdf","comment":"ICCV 2023. Project website: https://www.cs.cmu.edu/~concept-ablation/"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.15780v2","updated":"2023-08-16T17:59:07Z","published":"2023-07-24T18:47:38Z","title":"LLM-Rec: Personalized Recommendation via Prompting Large Language Models","summary":" We investigate various prompting strategies for enhancing personalized\nrecommendation performance with large language models (LLMs) through input\naugmentation. Our proposed approach, termed LLM-Rec, encompasses four distinct\nprompting strategies: (1) basic prompting, (2) recommendation-driven prompting,\n(3) engagement-guided prompting, and (4) recommendation-driven +\nengagement-guided prompting. Our empirical experiments show that incorporating\nthe augmented input text generated by LLM leads to improved recommendation\nperformance. Recommendation-driven and engagement-guided prompting strategies\nare found to elicit LLM's understanding of global and local item\ncharacteristics. This finding highlights the importance of leveraging diverse\nprompts and input augmentation techniques to enhance the recommendation\ncapabilities with LLMs.\n","authors":["Hanjia Lyu","Song Jiang","Hanqing Zeng","Qifan Wang","Si Zhang","Ren Chen","Chris Leung","Jiajie Tang","Yinglong Xia","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2307.15780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08434v1","updated":"2023-08-16T15:28:22Z","published":"2023-08-16T15:28:22Z","title":"A Bi-Step Grounding Paradigm for Large Language Models in Recommendation\n Systems","summary":" As the focus on Large Language Models (LLMs) in the field of recommendation\nintensifies, the optimization of LLMs for recommendation purposes (referred to\nas LLM4Rec) assumes a crucial role in augmenting their effectiveness in\nproviding recommendations. However, existing approaches for LLM4Rec often\nassess performance using restricted sets of candidates, which may not\naccurately reflect the models' overall ranking capabilities. In this paper, our\nobjective is to investigate the comprehensive ranking capacity of LLMs and\npropose a two-step grounding framework known as BIGRec (Bi-step Grounding\nParadigm for Recommendation). 
It initially grounds LLMs to the recommendation\nspace by fine-tuning them to generate meaningful tokens for items and\nsubsequently identifies appropriate actual items that correspond to the\ngenerated tokens. By conducting extensive experiments on two datasets, we\nsubstantiate the superior performance, capacity for handling few-shot\nscenarios, and versatility across multiple domains exhibited by BIGRec.\nFurthermore, we observe that the marginal benefits derived from increasing the\nquantity of training samples are modest for BIGRec, implying that LLMs possess\nthe limited capability to assimilate statistical information, such as\npopularity and collaborative filtering, due to their robust semantic priors.\nThese findings also underline the efficacy of integrating diverse statistical\ninformation into the LLM4Rec framework, thereby pointing towards a potential\navenue for future research. Our code and data are available at\nhttps://github.com/SAI990323/Grounding4Rec.\n","authors":["Keqin Bao","Jizhi Zhang","Wenjie Wang","Yang Zhang","Zhengyi Yang","Yancheng Luo","Fuli Feng","Xiangnaan He","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2308.08434v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2308.08413v1","updated":"2023-08-16T14:58:12Z","published":"2023-08-16T14:58:12Z","title":"Knowledge-Enhanced Multi-Label Few-Shot Product Attribute-Value\n Extraction","summary":" Existing attribute-value extraction (AVE) models require large quantities of\nlabeled data for training. However, new products with new attribute-value pairs\nenter the market every day in real-world e-Commerce. Thus, we formulate AVE in\nmulti-label few-shot learning (FSL), aiming to extract unseen attribute value\npairs based on a small number of training examples. We propose a\nKnowledge-Enhanced Attentive Framework (KEAF) based on prototypical networks,\nleveraging the generated label description and category information to learn\nmore discriminative prototypes. Besides, KEAF integrates with hybrid attention\nto reduce noise and capture more informative semantics for each class by\ncalculating the label-relevant and query-related weights. To achieve\nmulti-label inference, KEAF further learns a dynamic threshold by integrating\nthe semantic information from both the support set and the query set. Extensive\nexperiments with ablation studies conducted on two datasets demonstrate that\nKEAF outperforms other SOTA models for information extraction in FSL. The code\ncan be found at: https://github.com/gjiaying/KEAF\n","authors":["Jiaying Gong","Wei-Te Chen","Hoda Eldardiry"],"pdf_url":"https://arxiv.org/pdf/2308.08413v1.pdf","comment":"6 pages, 2 figures, published in CIKM 2023"},{"id":"http://arxiv.org/abs/2308.08406v1","updated":"2023-08-16T14:50:51Z","published":"2023-08-16T14:50:51Z","title":"Content-based Recommendation Engine for Video Streaming Platform","summary":" Recommendation engine suggest content, product or services to the user by\nusing machine learning algorithm. This paper proposed a content-based\nrecommendation engine for providing video suggestion to the user based on their\nprevious interests and choices. We will use TF-IDF text vectorization method to\ndetermine the relevance of words in a document. Then we will find out the\nsimilarity between each content by calculating cosine similarity between them.\nFinally, engine will recommend videos to the users based on the obtained\nsimilarity score value. 
In addition, we will measure the engine's performance\nby computing precision, recall, and F1 core of the proposed system.\n","authors":["Puskal Khadka","Prabhav Lamichhane"],"pdf_url":"https://arxiv.org/pdf/2308.08406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08378v1","updated":"2023-08-16T14:01:25Z","published":"2023-08-16T14:01:25Z","title":"Advancing continual lifelong learning in neural information retrieval:\n definition, dataset, framework, and empirical evaluation","summary":" Continual learning refers to the capability of a machine learning model to\nlearn and adapt to new information, without compromising its performance on\npreviously learned tasks. Although several studies have investigated continual\nlearning methods for information retrieval tasks, a well-defined task\nformulation is still lacking, and it is unclear how typical learning strategies\nperform in this context. To address this challenge, a systematic task\nformulation of continual neural information retrieval is presented, along with\na multiple-topic dataset that simulates continuous information retrieval. A\ncomprehensive continual neural information retrieval framework consisting of\ntypical retrieval models and continual learning strategies is then proposed.\nEmpirical evaluations illustrate that the proposed framework can successfully\nprevent catastrophic forgetting in neural information retrieval and enhance\nperformance on previously learned tasks. The results indicate that\nembedding-based retrieval models experience a decline in their continual\nlearning performance as the topic shift distance and dataset volume of new\ntasks increase. In contrast, pretraining-based models do not show any such\ncorrelation. Adopting suitable learning strategies can mitigate the effects of\ntopic shift and data augmentation.\n","authors":["Jingrui Hou","Georgina Cosma","Axel Finke"],"pdf_url":"https://arxiv.org/pdf/2308.08378v1.pdf","comment":"Submitted to Information Sciences"},{"id":"http://arxiv.org/abs/2308.08354v1","updated":"2023-08-16T13:24:47Z","published":"2023-08-16T13:24:47Z","title":"Is Meta-Learning the Right Approach for the Cold-Start Problem in\n Recommender Systems?","summary":" Recommender systems have become fundamental building blocks of modern online\nproducts and services, and have a substantial impact on user experience. In the\npast few years, deep learning methods have attracted a lot of research, and are\nnow heavily used in modern real-world recommender systems. Nevertheless,\ndealing with recommendations in the cold-start setting, e.g., when a user has\ndone limited interactions in the system, is a problem that remains far from\nsolved. Meta-learning techniques, and in particular optimization-based\nmeta-learning, have recently become the most popular approaches in the academic\nresearch literature for tackling the cold-start problem in deep learning models\nfor recommender systems. However, current meta-learning approaches are not\npractical for real-world recommender systems, which have billions of users and\nitems, and strict latency requirements. In this paper we show that it is\npossible to obtaining similar, or higher, performance on commonly used\nbenchmarks for the cold-start problem without using meta-learning techniques.\nIn more detail, we show that, when tuned correctly, standard and widely adopted\ndeep learning models perform just as well as newer meta-learning models. 
We\nfurther show that an extremely simple modular approach using common\nrepresentation learning techniques, can perform comparably to meta-learning\ntechniques specifically designed for the cold-start setting while being much\nmore easily deployable in real-world applications.\n","authors":["Davide Buffelli","Ashish Gupta","Agnieszka Strzalka","Vassilis Plachouras"],"pdf_url":"https://arxiv.org/pdf/2308.08354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08328v1","updated":"2023-08-16T12:42:28Z","published":"2023-08-16T12:42:28Z","title":"Phase Retrieval with Background Information: Decreased References and\n Efficient Methods","summary":" Fourier phase retrieval(PR) is a severely ill-posed inverse problem that\narises in various applications. To guarantee a unique solution and relieve the\ndependence on the initialization, background information can be exploited as a\nstructural priors. However, the requirement for the background information may\nbe challenging when moving to the high-resolution imaging. At the same time,\nthe previously proposed projected gradient descent(PGD) method also demands\nmuch background information.\n In this paper, we present an improved theoretical result about the demand for\nthe background information, along with two Douglas Rachford(DR) based methods.\nAnalytically, we demonstrate that the background required to ensure a unique\nsolution can be decreased by nearly $1/2$ for the 2-D signals compared to the\n1-D signals. By generalizing the results into $d$-dimension, we show that the\nlength of the background information more than $(2^{\\frac{d+1}{d}}-1)$ folds of\nthe signal is sufficient to ensure the uniqueness. At the same time, we also\nanalyze the stability and robustness of the model when measurements and\nbackground information are corrupted by the noise. Furthermore, two methods\ncalled Background Douglas-Rachford (BDR) and Convex Background Douglas-Rachford\n(CBDR) are proposed. BDR which is a kind of non-convex method is proven to have\nthe local R-linear convergence rate under mild assumptions. Instead, CBDR\nmethod uses the techniques of convexification and can be proven to own a global\nconvergence guarantee as long as the background information is sufficient. To\nsupport this, a new property called F-RIP is established. We test the\nperformance of the proposed methods through simulations as well as real\nexperimental measurements, and demonstrate that they achieve a higher recovery\nrate with less background information compared to the PGD method.\n","authors":["Ziyang Yuan","Haoxing Yang","Ningyi Leng","Hongxia Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08285v1","updated":"2023-08-16T11:10:43Z","published":"2023-08-16T11:10:43Z","title":"Pre-training with Large Language Model-based Document Expansion for\n Dense Passage Retrieval","summary":" In this paper, we systematically study the potential of pre-training with\nLarge Language Model(LLM)-based document expansion for dense passage retrieval.\nConcretely, we leverage the capabilities of LLMs for document expansion, i.e.\nquery generation, and effectively transfer expanded knowledge to retrievers\nusing pre-training strategies tailored for passage retrieval. These strategies\ninclude contrastive learning and bottlenecked query generation. Furthermore, we\nincorporate a curriculum learning strategy to reduce the reliance on LLM\ninferences. 
Experimental results demonstrate that pre-training with LLM-based\ndocument expansion significantly boosts the retrieval performance on\nlarge-scale web-search tasks. Our work shows strong zero-shot and out-of-domain\nretrieval abilities, making it more widely applicable for retrieval when\ninitializing with no human-labeled data.\n","authors":["Guangyuan Ma","Xing Wu","Peng Wang","Zijia Lin","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.08285v1.pdf","comment":"10 pages, 3 tables, 4 figures, under review"},{"id":"http://arxiv.org/abs/2301.10405v5","updated":"2023-08-16T10:57:58Z","published":"2023-01-25T04:45:06Z","title":"Editing Language Model-based Knowledge Graph Embeddings","summary":" Recent decades have witnessed the empirical success of framing Knowledge\nGraph (KG) embeddings via language models. However, language model-based KG\nembeddings are usually deployed as static artifacts, making them difficult to\nmodify after deployment without re-training. To address this\nissue, we propose a new task of editing language model-based KG embeddings in\nthis paper. This task is designed to facilitate rapid, data-efficient updates\nto KG embeddings without compromising performance on other aspects. We\nbuild four new datasets: E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and\nevaluate several knowledge editing baselines, demonstrating the limited ability\nof previous models to handle the proposed challenging task. We further propose\na simple yet strong baseline dubbed KGEditor, which utilizes additional\nparametric layers of a hypernetwork to edit/add facts. Our comprehensive\nexperimental results reveal that KGEditor excels in updating specific facts\nwithout impacting the overall performance, even when faced with limited\ntraining resources. Code and datasets are available at\nhttps://github.com/zjunlp/PromptKG/tree/main/deltaKG.\n","authors":["Siyuan Cheng","Ningyu Zhang","Bozhong Tian","Xi Chen","Qingbing Liu","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2301.10405v5.pdf","comment":"Work in progress and the project website is\n https://zjunlp.github.io/project/KGE_Editing/"},{"id":"http://arxiv.org/abs/2308.07426v2","updated":"2023-08-16T10:10:45Z","published":"2023-08-14T19:36:57Z","title":"A Survey on Point-of-Interest Recommendations Leveraging Heterogeneous\n Data","summary":" Tourism is an important application domain for recommender systems. In this\ndomain, recommender systems are for example tasked with providing personalized\nrecommendations for transportation, accommodation, points-of-interest (POIs),\nor tourism services. Among these tasks, in particular the problem of\nrecommending POIs that are of likely interest to individual tourists has gained\ngrowing attention in recent years. Providing POI recommendations to tourists\n\\emph{during their trip} can however be especially challenging due to the\nvariability of the users' context. With the rapid development of the Web and\ntoday's multitude of online services, vast amounts of data from various sources\nhave become available, and these heterogeneous data sources represent a huge\npotential to better address the challenges of in-trip POI recommendation\nproblems. In this work, we provide a comprehensive survey of published research\non POI recommendation between 2017 and 2022 from the perspective of\nheterogeneous data sources. Specifically, we investigate which types of data\nare used in the literature and which technical approaches and evaluation\nmethods are predominant. 
Among other aspects, we find that today's research\nworks often focus on a narrow range of data sources, leaving great potential\nfor future works that better utilize heterogeneous data sources and diverse\ndata types for improved in-trip recommendations.\n","authors":["Zehui Wang","Wolfram Höpken","Dietmar Jannach"],"pdf_url":"https://arxiv.org/pdf/2308.07426v2.pdf","comment":"35 pages, 19 figures"},{"id":"http://arxiv.org/abs/2308.07711v2","updated":"2023-08-16T05:58:16Z","published":"2023-08-15T11:45:34Z","title":"SPM: Structured Pretraining and Matching Architectures for Relevance\n Modeling in Meituan Search","summary":" In e-commerce search, relevance between query and documents is an essential\nrequirement for satisfying user experience. Different from traditional\ne-commerce platforms that offer products, users search on life service\nplatforms such as Meituan mainly for product providers, which usually have\nabundant structured information, e.g. name, address, category, thousands of\nproducts. Modeling search relevance with these rich structured contents is\nchallenging due to the following issues: (1) there is language distribution\ndiscrepancy among different fields of structured document, making it difficult\nto directly adopt off-the-shelf pretrained language model based methods like\nBERT. (2) different fields usually have different importance and their length\nvary greatly, making it difficult to extract document information helpful for\nrelevance matching.\n To tackle these issues, in this paper we propose a novel two-stage\npretraining and matching architecture for relevance matching with rich\nstructured documents. At pretraining stage, we propose an effective pretraining\nmethod that employs both query and multiple fields of document as inputs,\nincluding an effective information compression method for lengthy fields. At\nrelevance matching stage, a novel matching method is proposed by leveraging\ndomain knowledge in search query to generate more effective document\nrepresentations for relevance scoring. Extensive offline experiments and online\nA/B tests on millions of users verify that the proposed architectures\neffectively improve the performance of relevance modeling. The model has\nalready been deployed online, serving the search traffic of Meituan for over a\nyear.\n","authors":["Wen Zan","Yaopeng Han","Xiaotian Jiang","Yao Xiao","Yang Yang","Dayao Chen","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07711v2.pdf","comment":"Accepted by CIKM '23"},{"id":"http://arxiv.org/abs/2308.05379v3","updated":"2023-08-16T03:59:23Z","published":"2023-08-10T06:52:53Z","title":"Beyond Semantics: Learning a Behavior Augmented Relevance Model with\n Self-supervised Learning","summary":" Relevance modeling aims to locate desirable items for corresponding queries,\nwhich is crucial for search engines to ensure user experience. Although most\nconventional approaches address this problem by assessing the semantic\nsimilarity between the query and item, pure semantic matching is not\neverything.\n","authors":["Zeyuan Chen","Wei Chen","Jia Xu","Zhongyi Liu","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.05379v3.pdf","comment":"Partial content"},{"id":"http://arxiv.org/abs/2308.08120v1","updated":"2023-08-16T03:21:23Z","published":"2023-08-16T03:21:23Z","title":"Uncovering User Interest from Biased and Noised Watch Time in Video\n Recommendation","summary":" In the video recommendation, watch time is commonly adopted as an indicator\nof user interest. 
However, watch time is not only influenced by the matching of\nusers' interests but also by other factors, such as duration bias and noisy\nwatching. Duration bias refers to the tendency for users to spend more time on\nvideos with longer durations, regardless of their actual interest level. Noisy\nwatching, on the other hand, describes users taking time to determine whether\nthey like a video or not, which can result in users spending time watching\nvideos they do not like. Consequently, the existence of duration bias and noisy\nwatching make watch time an inadequate label for indicating user interest.\nFurthermore, current methods primarily address duration bias and ignore the\nimpact of noisy watching, which may limit their effectiveness in uncovering\nuser interest from watch time. In this study, we first analyze the generation\nmechanism of users' watch time from a unified causal viewpoint. Specifically,\nwe considered the watch time as a mixture of the user's actual interest level,\nthe duration-biased watch time, and the noisy watch time. To mitigate both the\nduration bias and noisy watching, we propose Debiased and Denoised watch time\nCorrection (D$^2$Co), which can be divided into two steps: First, we employ a\nduration-wise Gaussian Mixture Model plus frequency-weighted moving average for\nestimating the bias and noise terms; then we utilize a sensitivity-controlled\ncorrection function to separate the user interest from the watch time, which is\nrobust to the estimation error of bias and noise terms. The experiments on two\npublic video recommendation datasets and online A/B testing indicate the\neffectiveness of the proposed method.\n","authors":["Haiyuan Zhao","Lei Zhang","Jun Xu","Guohao Cai","Zhenhua Dong","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.08120v1.pdf","comment":"Accepted by Recsys'23"},{"id":"http://arxiv.org/abs/2308.08088v1","updated":"2023-08-16T01:38:49Z","published":"2023-08-16T01:38:49Z","title":"Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme\n Detection","summary":" Hateful meme detection is a challenging multimodal task that requires\ncomprehension of both vision and language, as well as cross-modal interactions.\nRecent studies have tried to fine-tune pre-trained vision-language models\n(PVLMs) for this task. However, with increasing model sizes, it becomes\nimportant to leverage powerful PVLMs more efficiently, rather than simply\nfine-tuning them. Recently, researchers have attempted to convert meme images\ninto textual captions and prompt language models for predictions. This approach\nhas shown good performance but suffers from non-informative image captions.\nConsidering the two factors mentioned above, we propose a probing-based\ncaptioning approach to leverage PVLMs in a zero-shot visual question answering\n(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful\ncontent-related questions and use the answers as image captions (which we call\nPro-Cap), so that the captions contain information critical for hateful content\ndetection. 
The good performance of models with Pro-Cap on three benchmarks\nvalidates the effectiveness and generalization of the proposed method.\n","authors":["Rui Cao","Ming Shan Hee","Adriel Kuek","Wen-Haw Chong","Roy Ka-Wei Lee","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.08088v1.pdf","comment":"Camera-ready for 23, ACM MM"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.08538v1","updated":"2023-08-16T17:53:40Z","published":"2023-08-16T17:53:40Z","title":"Proprioceptive Learning with Soft Polyhedral Networks","summary":" Proprioception is the \"sixth sense\" that detects limb postures with motor\nneurons. It requires a natural integration between the musculoskeletal systems\nand sensory receptors, which is challenging among modern robots that aim for\nlightweight, adaptive, and sensitive designs at a low cost. Here, we present\nthe Soft Polyhedral Network with an embedded vision for physical interactions,\ncapable of adaptive kinesthesia and viscoelastic proprioception by learning\nkinetic features. This design enables passive adaptations to omni-directional\ninteractions, visually captured by a miniature high-speed motion tracking\nsystem embedded inside for proprioceptive learning. The results show that the\nsoft network can infer real-time 6D forces and torques with accuracies of\n0.25/0.24/0.35 N and 0.025/0.034/0.006 Nm in dynamic interactions. We also\nincorporate viscoelasticity in proprioception during static adaptation by\nadding a creep and relaxation modifier to refine the predicted results. The\nproposed soft network combines simplicity in design, omni-adaptation, and\nproprioceptive sensing with high accuracy, making it a versatile solution for\nrobotics at a low cost with more than 1 million use cycles for tasks such as\nsensitive and competitive grasping, and touch-based geometry reconstruction.\nThis study offers new insights into vision-based proprioception for soft robots\nin adaptive grasping, soft manipulation, and human-robot interaction.\n","authors":["Xiaobo Liu","Xudong Han","Wei Hong","Fang Wan","Chaoyang Song"],"pdf_url":"https://arxiv.org/pdf/2308.08538v1.pdf","comment":"20 pages, 10 figures, 2 tables, submitted to the International\n Journal of Robotics Research for review"},{"id":"http://arxiv.org/abs/2308.08536v1","updated":"2023-08-16T17:52:11Z","published":"2023-08-16T17:52:11Z","title":"Can Transformers Learn Optimal Filtering for Unknown Systems?","summary":" Transformers have demonstrated remarkable success in natural language\nprocessing; however, their potential remains mostly unexplored for problems\narising in dynamical systems. In this work, we investigate the optimal output\nestimation problem using transformers, which generate output predictions using\nall the past ones. We train the transformer using various systems drawn from a\nprior distribution and then evaluate its performance on previously unseen\nsystems from the same distribution. As a result, the obtained transformer acts\nlike a prediction algorithm that learns in-context and quickly adapts to and\npredicts well for different systems - thus we call it meta-output-predictor\n(MOP). MOP matches the performance of the optimal output estimator, based on\nKalman filter, for most linear dynamical systems even though it does not have\naccess to a model. We observe via extensive numerical experiments that MOP also\nperforms well in challenging scenarios with non-i.i.d. noise, time-varying\ndynamics, and nonlinear dynamics like a quadrotor system with unknown\nparameters. 
To further support this observation, in the second part of the\npaper, we provide statistical guarantees on the performance of MOP and quantify\nthe required amount of training to achieve a desired excess risk during\ntest-time. Finally, we point out some limitations of MOP by identifying two\nclasses of problems MOP fails to perform well, highlighting the need for\ncaution when using transformers for control and estimation.\n","authors":["Haldun Balim","Zhe Du","Samet Oymak","Necmiye Ozay"],"pdf_url":"https://arxiv.org/pdf/2308.08536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.00563v3","updated":"2023-08-16T17:45:13Z","published":"2022-10-02T16:25:47Z","title":"AI-Assisted Discovery of Quantitative and Formal Models in Social\n Science","summary":" In social science, formal and quantitative models, such as ones describing\neconomic growth and collective action, are used to formulate mechanistic\nexplanations, provide predictions, and uncover questions about observed\nphenomena. Here, we demonstrate the use of a machine learning system to aid the\ndiscovery of symbolic models that capture nonlinear and dynamical relationships\nin social science datasets. By extending neuro-symbolic methods to find compact\nfunctions and differential equations in noisy and longitudinal data, we show\nthat our system can be used to discover interpretable models from real-world\ndata in economics and sociology. Augmenting existing workflows with symbolic\nregression can help uncover novel relationships and explore counterfactual\nmodels during the scientific process. We propose that this AI-assisted\nframework can bridge parametric and non-parametric models commonly employed in\nsocial science research by systematically exploring the space of nonlinear\nmodels and enabling fine-grained control over expressivity and\ninterpretability.\n","authors":["Julia Balla","Sihao Huang","Owen Dugan","Rumen Dangovski","Marin Soljacic"],"pdf_url":"https://arxiv.org/pdf/2210.00563v3.pdf","comment":"19 pages, 4 figures"},{"id":"http://arxiv.org/abs/2210.03921v2","updated":"2023-08-16T17:42:22Z","published":"2022-10-08T05:16:49Z","title":"Data Selection: A Surprisingly Effective and General Principle for\n Building Small Interpretable Models","summary":" We present convincing empirical evidence for an effective and general\nstrategy for building accurate small models. Such models are attractive for\ninterpretability and also find use in resource-constrained environments. The\nstrategy is to learn the training distribution instead of using data from the\ntest distribution. The distribution learning algorithm is not a contribution of\nthis work; we highlight the broad usefulness of this simple strategy on a\ndiverse set of tasks, and as such these rigorous empirical results are our\ncontribution. We apply it to the tasks of (1) building cluster explanation\ntrees, (2) prototype-based classification, and (3) classification using Random\nForests, and show that it improves the accuracy of weak traditional baselines\nto the point that they are surprisingly competitive with specialized modern\ntechniques.\n This strategy is also versatile wrt the notion of model size. In the first\ntwo tasks, model size is identified by number of leaves in the tree and the\nnumber of prototypes respectively. 
In the final task involving Random Forests\nthe strategy is shown to be effective even when model size is determined by\nmore than one factor: number of trees and their maximum depth.\n Positive results using multiple datasets are presented that are shown to be\nstatistically significant. These lead us to conclude that this strategy is both\neffective, i.e, leads to significant improvements, and general, i.e., is\napplicable to different tasks and model families, and therefore merits further\nattention in domains that require small accurate models.\n","authors":["Abhishek Ghose"],"pdf_url":"https://arxiv.org/pdf/2210.03921v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13565v2","updated":"2023-08-16T17:26:28Z","published":"2023-07-25T15:17:31Z","title":"Decision-Focused Learning: Foundations, State of the Art, Benchmark and\n Future Opportunities","summary":" Decision-focused learning (DFL) is an emerging paradigm in machine learning\nwhich trains a model to optimize decisions, integrating prediction and\noptimization in an end-to-end system. This paradigm holds the promise to\nrevolutionize decision-making in many real-world applications which operate\nunder uncertainty, where the estimation of unknown parameters within these\ndecision models often becomes a substantial roadblock. This paper presents a\ncomprehensive review of DFL. It provides an in-depth analysis of the various\ntechniques devised to integrate machine learning and optimization models,\nintroduces a taxonomy of DFL methods distinguished by their unique\ncharacteristics, and conducts an extensive empirical evaluation of these\nmethods proposing suitable benchmark dataset and tasks for DFL. Finally, the\nstudy provides valuable insights into current and potential future avenues in\nDFL research.\n","authors":["Jayanta Mandi","James Kotary","Senne Berden","Maxime Mulamba","Victor Bucarey","Tias Guns","Ferdinando Fioretto"],"pdf_url":"https://arxiv.org/pdf/2307.13565v2.pdf","comment":"Experimental Survey and Benchmarking"},{"id":"http://arxiv.org/abs/2308.08520v1","updated":"2023-08-16T17:18:30Z","published":"2023-08-16T17:18:30Z","title":"Painter: Teaching Auto-regressive Language Models to Draw Sketches","summary":" Large language models (LLMs) have made tremendous progress in natural\nlanguage understanding and they have also been successfully adopted in other\ndomains such as computer vision, robotics, reinforcement learning, etc. In this\nwork, we apply LLMs to image generation tasks by directly generating the\nvirtual brush strokes to paint an image. We present Painter, an LLM that can\nconvert user prompts in text description format to sketches by generating the\ncorresponding brush strokes in an auto-regressive way. We construct Painter\nbased on off-the-shelf LLM that is pre-trained on a large text corpus, by\nfine-tuning it on the new task while preserving language understanding\ncapabilities. We create a dataset of diverse multi-object sketches paired with\ntextual prompts that covers several object types and tasks. Painter can\ngenerate sketches from text descriptions, remove objects from canvas, and\ndetect and classify objects in sketches. 
Although this is an unprecedented\npioneering work in using LLMs for auto-regressive image generation, the results\nare very encouraging.\n","authors":["Reza Pourreza","Apratim Bhattacharyya","Sunny Panchal","Mingu Lee","Pulkit Madan","Roland Memisevic"],"pdf_url":"https://arxiv.org/pdf/2308.08520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08511v1","updated":"2023-08-16T17:07:40Z","published":"2023-08-16T17:07:40Z","title":"Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse\n Problems","summary":" Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial\ntechnologies in the field of medical imaging. Score-based models have proven to\nbe effective in addressing different inverse problems encountered in CT and\nMRI, such as sparse-view CT and fast MRI reconstruction. However, these models\nface challenges in achieving accurate three dimensional (3D) volumetric\nreconstruction. The existing score-based models primarily focus on\nreconstructing two dimensional (2D) data distribution, leading to\ninconsistencies between adjacent slices in the reconstructed 3D volumetric\nimages. To overcome this limitation, we propose a novel two-and-a-half order\nscore-based model (TOSM). During the training phase, our TOSM learns data\ndistributions in 2D space, which reduces the complexity of training compared to\ndirectly working on 3D volumes. However, in the reconstruction phase, the TOSM\nupdates the data distribution in 3D space, utilizing complementary scores along\nthree directions (sagittal, coronal, and transaxial) to achieve a more precise\nreconstruction. The development of TOSM is built on robust theoretical\nprinciples, ensuring its reliability and efficacy. Through extensive\nexperimentation on large-scale sparse-view CT and fast MRI datasets, our method\ndemonstrates remarkable advancements and attains state-of-the-art results in\nsolving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively\naddresses the inter-slice inconsistency issue, resulting in high-quality 3D\nvolumetric reconstruction.\n","authors":["Zirong Li","Yanyang Wang","Jianjia Zhang","Weiwen Wu","Hengyong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08510v1","updated":"2023-08-16T17:07:37Z","published":"2023-08-16T17:07:37Z","title":"Autoencoding a Soft Touch to Learn Grasping from On-land to Underwater","summary":" Robots play a critical role as the physical agent of human operators in\nexploring the ocean. However, it remains challenging to grasp objects reliably\nwhile fully submerging under a highly pressurized aquatic environment with\nlittle visible light, mainly due to the fluidic interference on the tactile\nmechanics between the finger and object surfaces. This study investigates the\ntransferability of grasping knowledge from on-land to underwater via a\nvision-based soft robotic finger that learns 6D forces and torques (FT) using a\nSupervised Variational Autoencoder (SVAE). A high-framerate camera captures the\nwhole-body deformations while a soft robotic finger interacts with physical\nobjects on-land and underwater. Results show that the trained SVAE model\nlearned a series of latent representations of the soft mechanics transferrable\nfrom land to water, presenting a superior adaptation to the changing\nenvironments against commercial FT sensors. 
Soft, delicate, and reactive\ngrasping enabled by tactile intelligence enhances the gripper's underwater\ninteraction with improved reliability and robustness at a much-reduced cost,\npaving the path for learning-based intelligent grasping to support fundamental\nscientific discoveries in environmental and ocean research.\n","authors":["Ning Guo","Xudong Han","Xiaobo Liu","Shuqiao Zhong","Zhiyuan Zhou","Jian Lin","Jiansheng Dai","Fang Wan","Chaoyang Song"],"pdf_url":"https://arxiv.org/pdf/2308.08510v1.pdf","comment":"17 pages, 5 figures, 1 table, submitted to Advanced Intelligent\n Systems for review"},{"id":"http://arxiv.org/abs/2208.11061v2","updated":"2023-08-16T17:02:51Z","published":"2022-08-23T16:07:28Z","title":"Large-Scale Traffic Congestion Prediction based on Multimodal Fusion and\n Representation Mapping","summary":" With the progress of the urbanisation process, the urban transportation\nsystem is extremely critical to the development of cities and the quality of\nlife of the citizens. Among them, it is one of the most important tasks to\njudge traffic congestion by analysing the congestion factors. Recently, various\ntraditional and machine-learning-based models have been introduced for\npredicting traffic congestion. However, these models are either poorly\naggregated for massive congestion factors or fail to make accurate predictions\nfor every precise location in large-scale space. To alleviate these problems, a\nnovel end-to-end framework based on convolutional neural networks is proposed\nin this paper. With learning representations, the framework proposes a novel\nmultimodal fusion module and a novel representation mapping module to achieve\ntraffic congestion predictions on arbitrary query locations on a large-scale\nmap, combined with various global reference information. The proposed framework\nachieves significant results and efficient inference on real-world large-scale\ndatasets.\n","authors":["Bodong Zhou","Jiahui Liu","Songyi Cui","Yaping Zhao"],"pdf_url":"https://arxiv.org/pdf/2208.11061v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08504v1","updated":"2023-08-16T16:58:25Z","published":"2023-08-16T16:58:25Z","title":"ResBuilder: Automated Learning of Depth with Residual Structures","summary":" In this work, we develop a neural architecture search algorithm, termed\nResbuilder, that develops ResNet architectures from scratch that achieve high\naccuracy at moderate computational cost. It can also be used to modify existing\narchitectures and has the capability to remove and insert ResNet blocks, in\nthis way searching for suitable architectures in the space of ResNet\narchitectures. In our experiments on different image classification datasets,\nResbuilder achieves close to state-of-the-art performance while saving\ncomputational cost compared to off-the-shelf ResNets. Noteworthy, we once tune\nthe parameters on CIFAR10 which yields a suitable default choice for all other\ndatasets. 
We demonstrate that this property generalizes even to industrial\napplications by applying our method with default parameters on a proprietary\nfraud detection dataset.\n","authors":["Julian Burghoff","Matthias Rottmann","Jill von Conta","Sebastian Schoenen","Andreas Witte","Hanno Gottschalk"],"pdf_url":"https://arxiv.org/pdf/2308.08504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08493v1","updated":"2023-08-16T16:48:57Z","published":"2023-08-16T16:48:57Z","title":"Time Travel in LLMs: Tracing Data Contamination in Large Language Models","summary":" Data contamination, i.e., the presence of test data from downstream tasks in\nthe training data of large language models (LLMs), is a potential major issue\nin understanding LLMs' effectiveness on other tasks. We propose a\nstraightforward yet effective method for identifying data contamination within\nLLMs. At its core, our approach starts by identifying potential contamination\nin individual instances that are drawn from a small random sample; using this\ninformation, our approach then assesses if an entire dataset partition is\ncontaminated. To estimate contamination of individual instances, we employ\n\"guided instruction:\" a prompt consisting of the dataset name, partition type,\nand the initial segment of a reference instance, asking the LLM to complete it.\nAn instance is flagged as contaminated if the LLM's output either exactly or\nclosely matches the latter segment of the reference. To understand if an entire\npartition is contaminated, we propose two ideas. The first idea marks a dataset\npartition as contaminated if the average overlap score with the reference\ninstances (as measured by ROUGE or BLEURT) is statistically significantly\nbetter with the guided instruction vs. a general instruction that does not\ninclude the dataset and partition name. The second idea marks a dataset as\ncontaminated if a classifier based on GPT-4 with in-context learning prompting\nmarks multiple instances as contaminated. Our best method achieves an accuracy\nbetween 92% and 100% in detecting if an LLM is contaminated with seven\ndatasets, containing train and test/validation partitions, when contrasted with\nmanual evaluation by human expert. Further, our findings indicate that GPT-4 is\ncontaminated with AG News, WNLI, and XSum datasets.\n","authors":["Shahriar Golchin","Mihai Surdeanu"],"pdf_url":"https://arxiv.org/pdf/2308.08493v1.pdf","comment":"v1 preprint"},{"id":"http://arxiv.org/abs/2308.08480v1","updated":"2023-08-16T16:38:03Z","published":"2023-08-16T16:38:03Z","title":"Label Propagation Techniques for Artifact Detection in Imbalanced\n Classes using Photoplethysmogram Signals","summary":" Photoplethysmogram (PPG) signals are widely used in healthcare for monitoring\nvital signs, but they are susceptible to motion artifacts that can lead to\ninaccurate interpretations. In this study, the use of label propagation\ntechniques to propagate labels among PPG samples is explored, particularly in\nimbalanced class scenarios where clean PPG samples are significantly\noutnumbered by artifact-contaminated samples. With a precision of 91%, a recall\nof 90% and an F1 score of 90% for the class without artifacts, the results\ndemonstrate its effectiveness in labeling a medical dataset, even when clean\nsamples are rare. 
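The "guided instruction" probe described in the data-contamination abstract above (a prompt built from the dataset name, the partition, and the first part of a reference instance, with the model's completion compared against the true continuation) can be mocked up as below. The prompt wording, the `complete` callable, and the overlap threshold are illustrative assumptions, not the authors' exact setup; the paper scores overlap with ROUGE/BLEURT and a GPT-4 classifier rather than the stand-in used here.

```python
# Hypothetical sketch of the guided-instruction contamination probe; the prompt
# template, the complete() callable and the threshold are illustrative assumptions.
from difflib import SequenceMatcher
from typing import Callable

def guided_instruction(dataset: str, split: str, first_piece: str) -> str:
    return (f"You have seen the {split} split of the {dataset} dataset. "
            f"Complete this instance exactly as it appears there:\n{first_piece}")

def looks_contaminated(complete: Callable[[str], str], dataset: str, split: str,
                       first_piece: str, reference_tail: str,
                       threshold: float = 0.8) -> bool:
    """Flag an instance if the completion closely matches the held-out tail
    (the paper scores overlap with ROUGE/BLEURT; SequenceMatcher stands in here)."""
    output = complete(guided_instruction(dataset, split, first_piece))
    overlap = SequenceMatcher(None, output.strip(), reference_tail.strip()).ratio()
    return overlap >= threshold

# Toy usage with a fake "LLM" that has memorized the instance verbatim.
memorized_tail = "the quick brown fox jumps over the lazy dog."
fake_llm = lambda prompt: memorized_tail
print(looks_contaminated(fake_llm, "AG News", "test", "A pangram:", memorized_tail))
```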
For the classification of artifacts our study compares\nsupervised classifiers such as conventional classifiers and neural networks\n(MLP, Transformers, FCN) with the semi-supervised label propagation algorithm.\nWith a precision of 89%, a recall of 95% and an F1 score of 92%, the KNN\nsupervised model gives good results, but the semi-supervised algorithm performs\nbetter in detecting artifacts. The findings suggest that the semi-supervised\nalgorithm label propagation hold promise for artifact detection in PPG signals,\nwhich can enhance the reliability of PPG-based health monitoring systems in\nreal-world applications.\n","authors":["Clara Macabiau","Thanh-Dung Le","Kevin Albert","Philippe Jouvet","Rita Noumeir"],"pdf_url":"https://arxiv.org/pdf/2308.08480v1.pdf","comment":"Under preparation to submit to IEEE for possible publications"},{"id":"http://arxiv.org/abs/2308.08469v1","updated":"2023-08-16T16:19:50Z","published":"2023-08-16T16:19:50Z","title":"LLM4TS: Two-Stage Fine-Tuning for Time-Series Forecasting with\n Pre-Trained LLMs","summary":" In this work, we leverage pre-trained Large Language Models (LLMs) to enhance\ntime-series forecasting. Mirroring the growing interest in unifying models for\nNatural Language Processing and Computer Vision, we envision creating an\nanalogous model for long-term time-series forecasting. Due to limited\nlarge-scale time-series data for building robust foundation models, our\napproach LLM4TS focuses on leveraging the strengths of pre-trained LLMs. By\ncombining time-series patching with temporal encoding, we have enhanced the\ncapability of LLMs to handle time-series data effectively. Inspired by the\nsupervised fine-tuning in chatbot domains, we prioritize a two-stage\nfine-tuning process: first conducting supervised fine-tuning to orient the LLM\ntowards time-series data, followed by task-specific downstream fine-tuning.\nFurthermore, to unlock the flexibility of pre-trained LLMs without extensive\nparameter adjustments, we adopt several Parameter-Efficient Fine-Tuning (PEFT)\ntechniques. Drawing on these innovations, LLM4TS has yielded state-of-the-art\nresults in long-term forecasting. Our model has also shown exceptional\ncapabilities as both a robust representation learner and an effective few-shot\nlearner, thanks to the knowledge transferred from the pre-trained LLM.\n","authors":["Ching Chang","Wen-Chih Peng","Tien-Fu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.08469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08468v1","updated":"2023-08-16T16:19:25Z","published":"2023-08-16T16:19:25Z","title":"An Expert's Guide to Training Physics-informed Neural Networks","summary":" Physics-informed neural networks (PINNs) have been popularized as a deep\nlearning framework that can seamlessly synthesize observational data and\npartial differential equation (PDE) constraints. Their practical effectiveness\nhowever can be hampered by training pathologies, but also oftentimes by poor\nchoices made by users who lack deep learning expertise. In this paper we\npresent a series of best practices that can significantly improve the training\nefficiency and overall accuracy of PINNs. We also put forth a series of\nchallenging benchmark problems that highlight some of the most prominent\ndifficulties in training PINNs, and present comprehensive and fully\nreproducible ablation studies that demonstrate how different architecture\nchoices and training strategies affect the test accuracy of the resulting\nmodels. 
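Semi-supervised label propagation of the kind used in the PPG-artifact study above can be exercised with scikit-learn's `LabelPropagation`; the synthetic features, the class imbalance, and the hyperparameters in this sketch are placeholders rather than the study's data or configuration.

```python
# Generic scikit-learn label-propagation sketch on synthetic stand-in data
# (not the PPG dataset from the study); unlabeled samples are marked with -1.
import numpy as np
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import precision_recall_fscore_support

rng = np.random.default_rng(0)
clean = rng.normal(0.0, 1.0, size=(40, 8))        # minority class: clean windows
artifact = rng.normal(3.0, 1.0, size=(160, 8))    # majority class: artifact windows
X = np.vstack([clean, artifact])
y_true = np.array([0] * 40 + [1] * 160)

y_train = y_true.copy()
hidden = rng.choice(len(y_train), size=150, replace=False)
y_train[hidden] = -1                               # hide most labels

model = LabelPropagation(kernel="knn", n_neighbors=7).fit(X, y_train)
p, r, f1, _ = precision_recall_fscore_support(
    y_true[hidden], model.transduction_[hidden], average="binary", pos_label=0)
print(f"clean-class precision={p:.2f} recall={r:.2f} f1={f1:.2f}")
```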
We show that the methods and guiding principles put forth in this study\nlead to state-of-the-art results and provide strong baselines that future\nstudies should use for comparison purposes. To this end, we also release a\nhighly optimized library in JAX that can be used to reproduce all results\nreported in this paper, enable future research studies, as well as facilitate\neasy adaptation to new use-case scenarios.\n","authors":["Sifan Wang","Shyam Sankaran","Hanwen Wang","Paris Perdikaris"],"pdf_url":"https://arxiv.org/pdf/2308.08468v1.pdf","comment":"36 pages, 25 figures, 13 tables"},{"id":"http://arxiv.org/abs/2308.06422v2","updated":"2023-08-16T16:18:28Z","published":"2023-08-12T00:16:51Z","title":"Sensitivity-Aware Mixed-Precision Quantization and Width Optimization of\n Deep Neural Networks Through Cluster-Based Tree-Structured Parzen Estimation","summary":" As the complexity and computational demands of deep learning models rise, the\nneed for effective optimization methods for neural network designs becomes\nparamount. This work introduces an innovative search mechanism for\nautomatically selecting the best bit-width and layer-width for individual\nneural network layers. This leads to a marked enhancement in deep neural\nnetwork efficiency. The search domain is strategically reduced by leveraging\nHessian-based pruning, ensuring the removal of non-crucial parameters.\nSubsequently, we detail the development of surrogate models for favorable and\nunfavorable outcomes by employing a cluster-based tree-structured Parzen\nestimator. This strategy allows for a streamlined exploration of architectural\npossibilities and swift pinpointing of top-performing designs. Through rigorous\ntesting on well-known datasets, our method proves its distinct advantage over\nexisting methods. Compared to leading compression strategies, our approach\nrecords an impressive 20% decrease in model size without compromising accuracy.\nAdditionally, our method boasts a 12x reduction in search time relative to the\nbest search-focused strategies currently available. As a result, our proposed\nmethod represents a leap forward in neural network design optimization, paving\nthe way for quick model design and implementation in settings with limited\nresources, thereby propelling the potential of scalable deep learning\nsolutions.\n","authors":["Seyedarmin Azizi","Mahdi Nazemi","Arash Fayyazi","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2308.06422v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08467v1","updated":"2023-08-16T16:15:47Z","published":"2023-08-16T16:15:47Z","title":"On Neural Quantum Support Vector Machines","summary":" In \\cite{simon2023algorithms} we introduced four algorithms for the training\nof neural support vector machines (NSVMs) and demonstrated their feasibility.\nIn this note we introduce neural quantum support vector machines, that is,\nNSVMs with a quantum kernel, and extend our results to this setting.\n","authors":["Lars Simon","Manuel Radons"],"pdf_url":"https://arxiv.org/pdf/2308.08467v1.pdf","comment":"13 pages, 0 figures. arXiv admin note: substantial text overlap with\n arXiv:2308.07204"},{"id":"http://arxiv.org/abs/2301.11118v3","updated":"2023-08-16T16:11:30Z","published":"2023-01-26T14:13:37Z","title":"Box$^2$EL: Concept and Role Box Embeddings for the Description Logic\n EL++","summary":" Description logic (DL) ontologies extend knowledge graphs (KGs) with\nconceptual information and logical background knowledge. 
In recent years, there\nhas been growing interest in inductive reasoning techniques for such\nontologies, which promise to complement classical deductive reasoning\nalgorithms. Similar to KG completion, several existing approaches learn\nontology embeddings in a latent space, while additionally ensuring that they\nfaithfully capture the logical semantics of the underlying DL. However, they\nsuffer from several shortcomings, mainly due to a limiting role representation.\nWe propose Box$^2$EL, which represents both concepts and roles as boxes (i.e.,\naxis-aligned hyperrectangles) and demonstrate how it overcomes the limitations\nof previous methods. We theoretically prove the soundness of our model and\nconduct an extensive experimental evaluation, achieving state-of-the-art\nresults across a variety of datasets. As part of our evaluation, we introduce a\nnovel benchmark for subsumption prediction involving both atomic and complex\nconcepts.\n","authors":["Mathias Jackermeier","Jiaoyan Chen","Ian Horrocks"],"pdf_url":"https://arxiv.org/pdf/2301.11118v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08465v1","updated":"2023-08-16T16:09:23Z","published":"2023-08-16T16:09:23Z","title":"Hierarchical Uncertainty Estimation for Medical Image Segmentation\n Networks","summary":" Learning a medical image segmentation model is an inherently ambiguous task,\nas uncertainties exist in both images (noise) and manual annotations (human\nerrors and bias) used for model training. To build a trustworthy image\nsegmentation model, it is important to not just evaluate its performance but\nalso estimate the uncertainty of the model prediction. Most state-of-the-art\nimage segmentation networks adopt a hierarchical encoder architecture,\nextracting image features at multiple resolution levels from fine to coarse. In\nthis work, we leverage this hierarchical image representation and propose a\nsimple yet effective method for estimating uncertainties at multiple levels.\nThe multi-level uncertainties are modelled via the skip-connection module and\nthen sampled to generate an uncertainty map for the predicted image\nsegmentation. We demonstrate that a deep learning segmentation network such as\nU-net, when implemented with such hierarchical uncertainty estimation module,\ncan achieve a high segmentation performance, while at the same time provide\nmeaningful uncertainty maps that can be used for out-of-distribution detection.\n","authors":["Xinyu Bai","Wenjia Bai"],"pdf_url":"https://arxiv.org/pdf/2308.08465v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2211.11695v2","updated":"2023-08-16T16:05:00Z","published":"2022-11-21T18:14:38Z","title":"Disentangled Representation Learning","summary":" Disentangled Representation Learning (DRL) aims to learn a model capable of\nidentifying and disentangling the underlying factors hidden in the observable\ndata in representation form. The process of separating underlying factors of\nvariation into variables with semantic meaning benefits in learning explainable\nrepresentations of data, which imitates the meaningful understanding process of\nhumans when observing an object or relation. As a general learning strategy,\nDRL has demonstrated its power in improving the model explainability,\ncontrolability, robustness, as well as generalization capacity in a wide range\nof scenarios such as computer vision, natural language processing, data mining\netc. 
In this article, we comprehensively review DRL from various aspects\nincluding motivations, definitions, methodologies, evaluations, applications\nand model designs. We discuss works on DRL based on two well-recognized\ndefinitions, i.e., Intuitive Definition and Group Theory Definition. We further\ncategorize the methodologies for DRL into four groups, i.e., Traditional\nStatistical Approaches, Variational Auto-encoder Based Approaches, Generative\nAdversarial Networks Based Approaches, Hierarchical Approaches and Other\nApproaches. We also analyze principles to design different DRL models that may\nbenefit different tasks in practical applications. Finally, we point out\nchallenges in DRL as well as potential research directions deserving future\ninvestigations. We believe this work may provide insights for promoting the DRL\nresearch in the community.\n","authors":["Xin Wang","Hong Chen","Si'ao Tang","Zihao Wu","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2211.11695v2.pdf","comment":"26 pages, 11 figures"},{"id":"http://arxiv.org/abs/2304.01752v2","updated":"2023-08-16T15:54:54Z","published":"2023-04-04T12:42:29Z","title":"Black Box Few-Shot Adaptation for Vision-Language models","summary":" Vision-Language (V-L) models trained with contrastive learning to align the\nvisual and language modalities have been shown to be strong few-shot learners.\nSoft prompt learning is the method of choice for few-shot downstream adaption\naiming to bridge the modality gap caused by the distribution shift induced by\nthe new domain. While parameter-efficient, prompt learning still requires\naccess to the model weights and can be computationally infeasible for large\nmodels with billions of parameters. To address these shortcomings, in this\nwork, we describe a black-box method for V-L few-shot adaptation that (a)\noperates on pre-computed image and text features and hence works without access\nto the model's weights, (b) it is orders of magnitude faster at training time,\n(c) it is amenable to both supervised and unsupervised training, and (d) it can\nbe even used to align image and text features computed from uni-modal models.\nTo achieve this, we propose Linear Feature Alignment (LFA), a simple linear\napproach for V-L re-alignment in the target domain. LFA is initialized from a\nclosed-form solution to a least-squares problem and then it is iteratively\nupdated by minimizing a re-ranking loss. Despite its simplicity, our approach\ncan even surpass soft-prompt learning methods as shown by extensive experiments\non 11 image and 2 video datasets.\n","authors":["Yassine Ouali","Adrian Bulat","Brais Martinez","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2304.01752v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08438v1","updated":"2023-08-16T15:42:24Z","published":"2023-08-16T15:42:24Z","title":"Accurate synthesis of Dysarthric Speech for ASR data augmentation","summary":" Dysarthria is a motor speech disorder often characterized by reduced speech\nintelligibility through slow, uncoordinated control of speech production\nmuscles. Automatic Speech recognition (ASR) systems can help dysarthric talkers\ncommunicate more effectively. However, robust dysarthria-specific ASR requires\na significant amount of training speech, which is not readily available for\ndysarthric talkers. This paper presents a new dysarthric speech synthesis\nmethod for the purpose of ASR training data augmentation. 
Differences in\nprosodic and acoustic characteristics of dysarthric spontaneous speech at\nvarying severity levels are important components for dysarthric speech\nmodeling, synthesis, and augmentation. For dysarthric speech synthesis, a\nmodified neural multi-talker TTS is implemented by adding a dysarthria severity\nlevel coefficient and a pause insertion model to synthesize dysarthric speech\nfor varying severity levels. To evaluate the effectiveness for synthesis of\ntraining data for ASR, dysarthria-specific speech recognition was used. Results\nshow that a DNN-HMM model trained on additional synthetic dysarthric speech\nachieves WER improvement of 12.2% compared to the baseline, and that the\naddition of the severity level and pause insertion controls decrease WER by\n6.5%, showing the effectiveness of adding these parameters. Overall results on\nthe TORGO database demonstrate that using dysarthric synthetic speech to\nincrease the amount of dysarthric-patterned speech for training has significant\nimpact on the dysarthric ASR systems. In addition, we have conducted a\nsubjective evaluation to evaluate the dysarthric-ness and similarity of\nsynthesized speech. Our subjective evaluation shows that the perceived\ndysartrhic-ness of synthesized speech is similar to that of true dysarthric\nspeech, especially for higher levels of dysarthria\n","authors":["Mohammad Soleymanpour","Michael T. Johnson","Rahim Soleymanpour","Jeffrey Berry"],"pdf_url":"https://arxiv.org/pdf/2308.08438v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2201.11571"},{"id":"http://arxiv.org/abs/2304.06783v2","updated":"2023-08-16T15:38:17Z","published":"2023-04-13T19:10:06Z","title":"A Distributionally Robust Approach to Regret Optimal Control using the\n Wasserstein Distance","summary":" This paper proposes a distributionally robust approach to regret optimal\ncontrol of discrete-time linear dynamical systems with quadratic costs subject\nto a stochastic additive disturbance on the state process. The underlying\nprobability distribution of the disturbance process is unknown, but assumed to\nlie in a given ball of distributions defined in terms of the type-2 Wasserstein\ndistance. In this framework, strictly causal linear disturbance feedback\ncontrollers are designed to minimize the worst-case expected regret. The regret\nincurred by a controller is defined as the difference between the cost it\nincurs in response to a realization of the disturbance process and the cost\nincurred by the optimal noncausal controller which has perfect knowledge of the\ndisturbance process realization at the outset. Building on a well-established\nduality theory for optimal transport problems, we derive a reformulation of the\nminimax regret optimal control problem as a tractable semidefinite program.\nUsing the equivalent dual reformulation, we characterize a worst-case\ndistribution achieving the worst-case expected regret in relation to the\ndistribution at the center of the Wasserstein ball. 
We compare the minimax\nregret optimal control design method with the distributionally robust optimal\ncontrol approach using an illustrative example and numerical experiments.\n","authors":["Feras Al Taha","Shuhao Yan","Eilyan Bitar"],"pdf_url":"https://arxiv.org/pdf/2304.06783v2.pdf","comment":"8 pages, 3 figures, to appear in the proceedings of the 2023 IEEE\n Conference on Decision and Control (CDC)"},{"id":"http://arxiv.org/abs/2304.04137v2","updated":"2023-08-16T15:36:07Z","published":"2023-04-09T02:22:31Z","title":"RD-DPP: Rate-Distortion Theory Meets Determinantal Point Process to\n Diversify Learning Data Samples","summary":" In some practical learning tasks, such as traffic video analysis, the number\nof available training samples is restricted by different factors, such as\nlimited communication bandwidth and computation power. Determinantal Point\nProcess (DPP) is a common method for selecting the most diverse samples to\nenhance learning quality. However, the number of selected samples is restricted\nto the rank of the kernel matrix implied by the dimensionality of data samples.\nSecondly, it is not easily customizable to different learning tasks. In this\npaper, we propose a new way of measuring task-oriented diversity based on the\nRate-Distortion (RD) theory, appropriate for multi-level classification. To\nthis end, we establish a fundamental relationship between DPP and RD theory. We\nobserve that the upper bound of the diversity of data selected by DPP has a\nuniversal trend of $\\textit{phase transition}$, which suggests that DPP is\nbeneficial only at the beginning of sample accumulation. This led to the design\nof a bi-modal method, where RD-DPP is used in the first mode to select initial\ndata samples, then classification inconsistency (as an uncertainty measure) is\nused to select the subsequent samples in the second mode. This phase transition\nsolves the limitation to the rank of the similarity matrix. Applying our method\nto six different datasets and five benchmark models suggests that our method\nconsistently outperforms random selection, DPP-based methods, and alternatives\nlike uncertainty-based and coreset methods under all sampling budgets, while\nexhibiting high generalizability to different learning tasks.\n","authors":["Xiwen Chen","Huayu Li","Rahul Amin","Abolfazl Razi"],"pdf_url":"https://arxiv.org/pdf/2304.04137v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08427v1","updated":"2023-08-16T15:17:57Z","published":"2023-08-16T15:17:57Z","title":"Eliciting Risk Aversion with Inverse Reinforcement Learning via\n Interactive Questioning","summary":" This paper proposes a novel framework for identifying an agent's risk\naversion using interactive questioning. Our study is conducted in two\nscenarios: a one-period case and an infinite horizon case. In the one-period\ncase, we assume that the agent's risk aversion is characterized by a cost\nfunction of the state and a distortion risk measure. In the infinite horizon\ncase, we model risk aversion with an additional component, a discount factor.\nAssuming the access to a finite set of candidates containing the agent's true\nrisk aversion, we show that asking the agent to demonstrate her optimal\npolicies in various environment, which may depend on their previous answers, is\nan effective means of identifying the agent's risk aversion. Specifically, we\nprove that the agent's risk aversion can be identified as the number of\nquestions tends to infinity, and the questions are randomly designed. 
We also\ndevelop an algorithm for designing optimal questions and provide empirical\nevidence that our method learns risk aversion significantly faster than\nrandomly designed questions in simulations. Our framework has important\napplications in robo-advising and provides a new approach for identifying an\nagent's risk preferences.\n","authors":["Ziteng Cheng","Anthony Coache","Sebastian Jaimungal"],"pdf_url":"https://arxiv.org/pdf/2308.08427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09703v3","updated":"2023-08-16T15:16:43Z","published":"2022-11-17T17:38:55Z","title":"EfficientTrain: Exploring Generalized Curriculum Learning for Training\n Visual Backbones","summary":" The superior performance of modern deep networks usually comes with a costly\ntraining procedure. This paper presents a new curriculum learning approach for\nthe efficient training of visual backbones (e.g., vision Transformers). Our\nwork is inspired by the inherent learning dynamics of deep networks: we\nexperimentally show that at an earlier training stage, the model mainly learns\nto recognize some 'easier-to-learn' discriminative patterns within each\nexample, e.g., the lower-frequency components of images and the original\ninformation before data augmentation. Driven by this phenomenon, we propose a\ncurriculum where the model always leverages all the training data at each\nepoch, while the curriculum starts with only exposing the 'easier-to-learn'\npatterns of each example, and introduces gradually more difficult patterns. To\nimplement this idea, we 1) introduce a cropping operation in the Fourier\nspectrum of the inputs, which enables the model to learn from only the\nlower-frequency components efficiently, 2) demonstrate that exposing the\nfeatures of original images amounts to adopting weaker data augmentation, and\n3) integrate 1) and 2) and design a curriculum learning schedule with a\ngreedy-search algorithm. The resulting approach, EfficientTrain, is simple,\ngeneral, yet surprisingly effective. As an off-the-shelf method, it reduces the\nwall-time training cost of a wide variety of popular models (e.g., ResNet,\nConvNeXt, DeiT, PVT, Swin, and CSWin) by >1.5x on ImageNet-1K/22K without\nsacrificing accuracy. It is also effective for self-supervised learning (e.g.,\nMAE). Code is available at https://github.com/LeapLabTHU/EfficientTrain.\n","authors":["Yulin Wang","Yang Yue","Rui Lu","Tianjiao Liu","Zhao Zhong","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2211.09703v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08410v1","updated":"2023-08-16T14:57:12Z","published":"2023-08-16T14:57:12Z","title":"Digital twinning of cardiac electrophysiology models from the surface\n ECG: a geodesic backpropagation approach","summary":" The eikonal equation has become an indispensable tool for modeling cardiac\nelectrical activation accurately and efficiently. In principle, by matching\nclinically recorded and eikonal-based electrocardiograms (ECGs), it is possible\nto build patient-specific models of cardiac electrophysiology in a purely\nnon-invasive manner. Nonetheless, the fitting procedure remains a challenging\ntask. The present study introduces a novel method, Geodesic-BP, to solve the\ninverse eikonal problem. Geodesic-BP is well-suited for GPU-accelerated machine\nlearning frameworks, allowing us to optimize the parameters of the eikonal\nequation to reproduce a given ECG. 
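The "cropping operation in the Fourier spectrum" mentioned in the EfficientTrain abstract above can be illustrated as a low-pass step that keeps only the central, low-frequency region of an image's 2D FFT. The sketch below masks rather than truly crops the spectrum and is not the authors' released implementation (available at https://github.com/LeapLabTHU/EfficientTrain); the tensor shapes and keep ratio are assumptions.

```python
# Illustrative low-frequency Fourier filtering (a generic sketch, not the official
# EfficientTrain code): keep only the central low-frequency band of the spectrum.
import torch

def low_freq_filter(images: torch.Tensor, keep_ratio: float = 0.5) -> torch.Tensor:
    """images: (B, C, H, W) float tensor; keep_ratio: fraction of each spatial
    frequency axis retained around the centre of the shifted spectrum."""
    _, _, H, W = images.shape
    spec = torch.fft.fftshift(torch.fft.fft2(images), dim=(-2, -1))
    mask = torch.zeros(H, W, dtype=torch.bool, device=images.device)
    h, w = int(H * keep_ratio / 2), int(W * keep_ratio / 2)
    mask[H // 2 - h:H // 2 + h, W // 2 - w:W // 2 + w] = True
    spec = spec * mask.to(spec.dtype)            # zero out high-frequency components
    out = torch.fft.ifft2(torch.fft.ifftshift(spec, dim=(-2, -1)))
    return out.real                              # low-pass images for early epochs

x = torch.randn(2, 3, 32, 32)
print(low_freq_filter(x, keep_ratio=0.25).shape)  # torch.Size([2, 3, 32, 32])
```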
We show that Geodesic-BP can reconstruct a\nsimulated cardiac activation with high accuracy in a synthetic test case, even\nin the presence of modeling inaccuracies. Furthermore, we apply our algorithm\nto a publicly available dataset of a rabbit model, with very positive results.\nGiven the future shift towards personalized medicine, Geodesic-BP has the\npotential to help in future functionalizations of cardiac models meeting\nclinical time constraints while maintaining the physiological accuracy of\nstate-of-the-art cardiac models.\n","authors":["Thomas Grandits","Jan Verhülsdonk","Gundolf Haase","Alexander Effland","Simone Pezzuto"],"pdf_url":"https://arxiv.org/pdf/2308.08410v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.08407v1","updated":"2023-08-16T14:51:51Z","published":"2023-08-16T14:51:51Z","title":"Explainable AI for clinical risk prediction: a survey of concepts,\n methods, and modalities","summary":" Recent advancements in AI applications to healthcare have shown incredible\npromise in surpassing human performance in diagnosis and disease prognosis.\nWith the increasing complexity of AI models, however, concerns have arisen regarding their\nopacity, potential biases, and the need for interpretability. To ensure trust\nand reliability in AI systems, especially in clinical risk prediction models,\nexplainability becomes crucial. Explainability usually refers to an AI\nsystem's ability to provide a robust interpretation of its decision-making\nlogic or the decisions themselves to human stakeholders. In clinical risk\nprediction, other aspects of explainability like fairness, bias, trust, and\ntransparency also represent important concepts beyond just interpretability. In\nthis review, we address the relationship between these concepts as they are\noften used together or interchangeably. This review also discusses recent\nprogress in developing explainable models for clinical risk prediction,\nhighlighting the importance of quantitative and clinical evaluation and\nvalidation across multiple common modalities in clinical practice. It\nemphasizes the need for external validation and the combination of diverse\ninterpretability methods to enhance trust and fairness. Adopting rigorous\ntesting, such as using synthetic datasets with known generative factors, can\nfurther improve the reliability of explainability methods. Open access and\ncode-sharing resources are essential for transparency and reproducibility,\nenabling the growth and trustworthiness of explainability research. While\nchallenges exist, an end-to-end approach to explainability in clinical risk\nprediction, incorporating stakeholders from clinicians to developers, is\nessential for success.\n","authors":["Munib Mesinovic","Peter Watkinson","Tingting Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.08407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08406v1","updated":"2023-08-16T14:50:51Z","published":"2023-08-16T14:50:51Z","title":"Content-based Recommendation Engine for Video Streaming Platform","summary":" A recommendation engine suggests content, products, or services to users by\nusing machine learning algorithms. This paper proposes a content-based\nrecommendation engine that provides video suggestions to users based on their\nprevious interests and choices. We will use the TF-IDF text vectorization method to\ndetermine the relevance of words in a document. 
Then we will find out the\nsimilarity between each content by calculating cosine similarity between them.\nFinally, the engine will recommend videos to the users based on the obtained\nsimilarity score value. In addition, we will measure the engine's performance\nby computing precision, recall, and F1 score of the proposed system.\n","authors":["Puskal Khadka","Prabhav Lamichhane"],"pdf_url":"https://arxiv.org/pdf/2308.08406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05989v2","updated":"2023-08-16T14:47:10Z","published":"2023-06-09T15:59:27Z","title":"QBSD: Quartile-Based Seasonality Decomposition for Cost-Effective Time\n Series Forecasting","summary":" In the telecom domain, precise forecasting of time series patterns, such as\ncell key performance indicators (KPIs), plays a pivotal role in enhancing\nservice quality and operational efficiency. State-of-the-art forecasting\napproaches prioritize forecasting accuracy at the expense of computational\nperformance, rendering them less suitable for data-intensive applications\nencompassing systems with a multitude of time series variables. To address this\nissue, we introduce QBSD, a live forecasting approach tailored to optimize the\ntrade-off between accuracy and computational complexity. We have evaluated the\nperformance of QBSD against state-of-the-art forecasting approaches on publicly\navailable datasets. We have also extended this investigation to our curated\nnetwork KPI dataset, now publicly accessible, to showcase the effect of dynamic\noperating ranges that vary with time. The results demonstrate that the\nproposed method excels in runtime efficiency compared to the leading algorithms\navailable while maintaining competitive forecast accuracy.\n","authors":["Ebenezer RHP Isaac","Bulbul Singh"],"pdf_url":"https://arxiv.org/pdf/2306.05989v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08391v1","updated":"2023-08-16T14:23:24Z","published":"2023-08-16T14:23:24Z","title":"Fast Uncertainty Quantification of Spent Nuclear Fuel with Neural\n Networks","summary":" The accurate calculation and uncertainty quantification of the\ncharacteristics of spent nuclear fuel (SNF) play a crucial role in ensuring the\nsafety, efficiency, and sustainability of nuclear energy production, waste\nmanagement, and nuclear safeguards. State-of-the-art physics-based models,\nwhile reliable, are computationally intensive and time-consuming. This paper\npresents a surrogate modeling approach using neural networks (NN) to predict a\nnumber of SNF characteristics with reduced computational costs compared to\nphysics-based models. An NN is trained using data generated from CASMO5 lattice\ncalculations. The trained NN accurately predicts decay heat and nuclide\nconcentrations of SNF, as a function of key input parameters, such as\nenrichment, burnup, cooling time between cycles, mean boron concentration and\nfuel temperature. The model is validated against physics-based decay heat\nsimulations and measurements of different uranium oxide fuel assemblies from\ntwo different pressurized water reactors. In addition, the NN is used to\nperform sensitivity analysis and uncertainty quantification. The results are in\nvery good alignment with CASMO5, while the computational costs (taking into\naccount the costs of generating training samples) are reduced by a factor of 10\nor more. 
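A minimal sketch of the TF-IDF / cosine-similarity pipeline described in the content-based recommendation abstract above. The toy catalogue and the `recommend` helper are illustrative assumptions, not the authors' implementation.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

catalogue = [
    "space documentary about black holes",
    "cooking show italian pasta recipes",
    "documentary on neutron stars and black holes",
]

tfidf = TfidfVectorizer(stop_words="english")
matrix = tfidf.fit_transform(catalogue)      # one row per video description
scores = cosine_similarity(matrix)           # pairwise similarity between videos

def recommend(video_idx: int, top_k: int = 2):
    ranked = scores[video_idx].argsort()[::-1]   # most similar first
    return [i for i in ranked if i != video_idx][:top_k]

print(recommend(0))  # indices of videos most similar to the first description
```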
Our findings demonstrate the feasibility of using NNs as surrogate\nmodels for fast characterization of SNF, providing a promising avenue for\nimproving computational efficiency in assessing nuclear fuel behavior and\nassociated risks.\n","authors":["Arnau Albà","Andreas Adelmann","Lucas Münster","Dimitri Rochman","Romana Boiger"],"pdf_url":"https://arxiv.org/pdf/2308.08391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08387v1","updated":"2023-08-16T14:18:31Z","published":"2023-08-16T14:18:31Z","title":"Continuous Sweep: an improved, binary quantifier","summary":" Quantification is a supervised machine learning task, focused on estimating\nthe class prevalence of a dataset rather than labeling its individual\nobservations. We introduce Continuous Sweep, a new parametric binary quantifier\ninspired by the well-performing Median Sweep. Median Sweep is currently one of\nthe best binary quantifiers, but we have changed this quantifier on three\npoints, namely 1) using parametric class distributions instead of empirical\ndistributions, 2) optimizing decision boundaries instead of applying discrete\ndecision rules, and 3) calculating the mean instead of the median. We derive\nanalytic expressions for the bias and variance of Continuous Sweep under\ngeneral model assumptions. This is one of the first theoretical contributions\nin the field of quantification learning. Moreover, these derivations enable us\nto find the optimal decision boundaries. Finally, our simulation study shows\nthat Continuous Sweep outperforms Median Sweep in a wide range of situations.\n","authors":["Kevin Kloos","Julian D. Karch","Quinten A. Meertens","Mark de Rooij"],"pdf_url":"https://arxiv.org/pdf/2308.08387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07077v2","updated":"2023-08-16T14:16:28Z","published":"2023-06-12T12:43:27Z","title":"Latent Dynamical Implicit Diffusion Processes","summary":" Latent dynamical models are commonly used to learn the distribution of a\nlatent dynamical process that represents a sequence of noisy data samples.\nHowever, producing samples from such models with high fidelity is challenging\ndue to the complexity and variability of latent and observation dynamics.\nRecent advances in diffusion-based generative models, such as DDPM and NCSN,\nhave shown promising alternatives to state-of-the-art latent generative models,\nsuch as Neural ODEs, RNNs, and Normalizing flow networks, for generating\nhigh-quality sequential samples from a prior distribution. However, their\napplication in modeling sequential data with latent dynamical models is yet to\nbe explored. Here, we propose a novel latent variable model named latent\ndynamical implicit diffusion processes (LDIDPs), which utilizes implicit\ndiffusion processes to sample from dynamical latent processes and generate\nsequential observation samples accordingly. We tested LDIDPs on synthetic and\nsimulated neural decoding problems. We demonstrate that LDIDPs can accurately\nlearn the dynamics over latent dimensions. Furthermore, the implicit sampling\nmethod allows for the computationally efficient generation of high-quality\nsequential data samples from the latent and observation spaces.\n","authors":["Mohammad R. 
Rezaei"],"pdf_url":"https://arxiv.org/pdf/2306.07077v2.pdf","comment":"I request a withdrawal because there are no experiments with\n real-world datasets and also the method section requires major changes to\n look mathematically sounds"},{"id":"http://arxiv.org/abs/2308.08381v1","updated":"2023-08-16T14:09:48Z","published":"2023-08-16T14:09:48Z","title":"Precision and Recall Reject Curves for Classification","summary":" For some classification scenarios, it is desirable to use only those\nclassification instances that a trained model associates with a high certainty.\nTo obtain such high-certainty instances, previous work has proposed\naccuracy-reject curves. Reject curves allow to evaluate and compare the\nperformance of different certainty measures over a range of thresholds for\naccepting or rejecting classifications. However, the accuracy may not be the\nmost suited evaluation metric for all applications, and instead precision or\nrecall may be preferable. This is the case, for example, for data with\nimbalanced class distributions. We therefore propose reject curves that\nevaluate precision and recall, the recall-reject curve and the precision-reject\ncurve. Using prototype-based classifiers from learning vector quantization, we\nfirst validate the proposed curves on artificial benchmark data against the\naccuracy reject curve as a baseline. We then show on imbalanced benchmarks and\nmedical, real-world data that for these scenarios, the proposed precision- and\nrecall-curves yield more accurate insights into classifier performance than\naccuracy reject curves.\n","authors":["Lydia Fischer","Patricia Wollstadt"],"pdf_url":"https://arxiv.org/pdf/2308.08381v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.08379v1","updated":"2023-08-16T14:04:50Z","published":"2023-08-16T14:04:50Z","title":"A distributed neural network architecture for dynamic sensor selection\n with application to bandwidth-constrained body-sensor networks","summary":" We propose a dynamic sensor selection approach for deep neural networks\n(DNNs), which is able to derive an optimal sensor subset selection for each\nspecific input sample instead of a fixed selection for the entire dataset. This\ndynamic selection is jointly learned with the task model in an end-to-end way,\nusing the Gumbel-Softmax trick to allow the discrete decisions to be learned\nthrough standard backpropagation. We then show how we can use this dynamic\nselection to increase the lifetime of a wireless sensor network (WSN) by\nimposing constraints on how often each node is allowed to transmit. We further\nimprove performance by including a dynamic spatial filter that makes the\ntask-DNN more robust against the fact that it now needs to be able to handle a\nmultitude of possible node subsets. Finally, we explain how the selection of\nthe optimal channels can be distributed across the different nodes in a WSN. We\nvalidate this method on a use case in the context of body-sensor networks,\nwhere we use real electroencephalography (EEG) sensor data to emulate an EEG\nsensor network. 
We analyze the resulting trade-offs between transmission load\nand task accuracy.\n","authors":["Thomas Strypsteen","Alexander Bertrand"],"pdf_url":"https://arxiv.org/pdf/2308.08379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11787v2","updated":"2023-08-16T14:03:03Z","published":"2023-07-20T16:22:36Z","title":"LLM Cognitive Judgements Differ From Human","summary":" Large Language Models (LLMs) have lately been on the spotlight of\nresearchers, businesses, and consumers alike. While the linguistic capabilities\nof such models have been studied extensively, there is growing interest in\ninvestigating them as cognitive subjects. In the present work I examine GPT-3\nand ChatGPT capabilities on an limited-data inductive reasoning task from the\ncognitive science literature. The results suggest that these models' cognitive\njudgements are not human-like.\n","authors":["Sotiris Lamprinidis"],"pdf_url":"https://arxiv.org/pdf/2307.11787v2.pdf","comment":"7 pages, 1 figure. License changed to CC BY-NC-SA"},{"id":"http://arxiv.org/abs/2303.13538v2","updated":"2023-08-16T13:59:00Z","published":"2023-03-15T13:32:11Z","title":"Bluetooth and WiFi Dataset for Real World RF Fingerprinting of\n Commercial Devices","summary":" RF fingerprinting is emerging as a physical layer security scheme to identify\nillegitimate and/or unauthorized emitters sharing the RF spectrum. However, due\nto the lack of publicly accessible real-world datasets, most research focuses\non generating synthetic waveforms with software-defined radios (SDRs) which are\nnot suited for practical deployment settings. On other hand, the limited\ndatasets that are available focus only on chipsets that generate only one kind\nof waveform. Commercial off-the-shelf (COTS) combo chipsets that support two\nwireless standards (for example WiFi and Bluetooth) over a shared dual-band\nantenna such as those found in laptops, adapters, wireless chargers, Raspberry\nPis, among others are becoming ubiquitous in the IoT realm. Hence, to keep up\nwith the modern IoT environment, there is a pressing need for real-world open\ndatasets capturing emissions from these combo chipsets transmitting\nheterogeneous communication protocols. To this end, we capture the first known\nemissions from the COTS IoT chipsets transmitting WiFi and Bluetooth under two\ndifferent time frames. The different time frames are essential to rigorously\nevaluate the generalization capability of the models. To ensure widespread use,\neach capture within the comprehensive 72 GB dataset is long enough (40\nMSamples) to support diverse input tensor lengths and formats. Finally, the\ndataset also comprises emissions at varying signal powers to account for the\nfeeble to high signal strength emissions as encountered in a real-world\nsetting.\n","authors":["Anu Jagannath","Zackary Kane","Jithin Jagannath"],"pdf_url":"https://arxiv.org/pdf/2303.13538v2.pdf","comment":"Revision Under Review"},{"id":"http://arxiv.org/abs/2307.16120v2","updated":"2023-08-16T13:58:20Z","published":"2023-07-30T03:59:47Z","title":"Deep Unrolling Networks with Recurrent Momentum Acceleration for\n Nonlinear Inverse Problems","summary":" Combining the strengths of model-based iterative algorithms and data-driven\ndeep learning solutions, deep unrolling networks (DuNets) have become a popular\ntool to solve inverse imaging problems. While DuNets have been successfully\napplied to many linear inverse problems, nonlinear problems tend to impair the\nperformance of the method. 
Inspired by momentum acceleration techniques that\nare often used in optimization algorithms, we propose a recurrent momentum\nacceleration (RMA) framework that uses a long short-term memory recurrent\nneural network (LSTM-RNN) to simulate the momentum acceleration process. The\nRMA module leverages the ability of the LSTM-RNN to learn and retain knowledge\nfrom the previous gradients. We apply RMA to two popular DuNets -- the learned\nproximal gradient descent (LPGD) and the learned primal-dual (LPD) methods,\nresulting in LPGD-RMA and LPD-RMA respectively. We provide experimental results\non two nonlinear inverse problems: a nonlinear deconvolution problem, and an\nelectrical impedance tomography problem with limited boundary measurements. In\nthe first experiment we have observed that the improvement due to RMA largely\nincreases with respect to the nonlinearity of the problem. The results of the\nsecond example further demonstrate that the RMA schemes can significantly\nimprove the performance of DuNets in strongly ill-posed problems.\n","authors":["Qingping Zhou","Jiayu Qian","Junqi Tang","Jinglai Li"],"pdf_url":"https://arxiv.org/pdf/2307.16120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04043v2","updated":"2023-08-16T13:51:23Z","published":"2023-05-06T13:13:18Z","title":"Echoes: Unsupervised Debiasing via Pseudo-bias Labeling in an Echo\n Chamber","summary":" Neural networks often learn spurious correlations when exposed to biased\ntraining data, leading to poor performance on out-of-distribution data. A\nbiased dataset can be divided, according to biased features, into bias-aligned\nsamples (i.e., with biased features) and bias-conflicting samples (i.e.,\nwithout biased features). Recent debiasing works typically assume that no bias\nlabel is available during the training phase, as obtaining such information is\nchallenging and labor-intensive. Following this unsupervised assumption,\nexisting methods usually train two models: a biased model specialized to learn\nbiased features and a target model that uses information from the biased model\nfor debiasing. This paper first presents experimental analyses revealing that\nthe existing biased models overfit to bias-conflicting samples in the training\ndata, which negatively impacts the debiasing performance of the target models.\nTo address this issue, we propose a straightforward and effective method called\nEchoes, which trains a biased model and a target model with a different\nstrategy. We construct an \"echo chamber\" environment by reducing the weights of\nsamples which are misclassified by the biased model, to ensure the biased model\nfully learns the biased features without overfitting to the bias-conflicting\nsamples. The biased model then assigns lower weights on the bias-conflicting\nsamples. Subsequently, we use the inverse of the sample weights of the biased\nmodel for training the target model. Experiments show that our approach\nachieves superior debiasing results compared to the existing baselines on both\nsynthetic and real-world datasets. 
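A rough, hypothetical sketch of the "echo chamber" reweighting idea from the Echoes abstract above: samples the biased model keeps misclassifying have their weights reduced so the biased model focuses on bias-aligned samples, and the target model is then trained with (roughly) inverse weights. The models, number of rounds, and decay factor are illustrative assumptions and not the authors' method.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def echoes_like_weights(X, y, rounds=5, decay=0.5):
    w = np.ones(len(y))
    biased = LogisticRegression(max_iter=1000)
    for _ in range(rounds):
        biased.fit(X, y, sample_weight=w)
        wrong = biased.predict(X) != y    # likely bias-conflicting samples
        w[wrong] *= decay                  # push them out of the "echo chamber"
    target_weights = 1.0 / np.clip(w, 1e-3, None)   # emphasize them for the target model
    return biased, target_weights

X = np.random.randn(200, 5)
y = (X[:, 0] > 0).astype(int)
_, tw = echoes_like_weights(X, y)
target_model = LogisticRegression(max_iter=1000).fit(X, y, sample_weight=tw)
```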
Our code is available at\nhttps://github.com/isruihu/Echoes.\n","authors":["Rui Hu","Yahan Tu","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2305.04043v2.pdf","comment":"Accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.08371v1","updated":"2023-08-16T13:50:23Z","published":"2023-08-16T13:50:23Z","title":"PDPK: A Framework to Synthesise Process Data and Corresponding\n Procedural Knowledge for Manufacturing","summary":" Procedural knowledge describes how to accomplish tasks and mitigate problems.\nSuch knowledge is commonly held by domain experts, e.g. operators in\nmanufacturing who adjust parameters to achieve quality targets. To the best of\nour knowledge, no real-world datasets containing process data and corresponding\nprocedural knowledge are publicly available, possibly due to corporate\napprehensions regarding the loss of knowledge advances. Therefore, we provide a\nframework to generate synthetic datasets that can be adapted to different\ndomains. The design choices are inspired by two real-world datasets of\nprocedural knowledge we have access to. Apart from containing representations\nof procedural knowledge in Resource Description Framework (RDF)-compliant\nknowledge graphs, the framework simulates parametrisation processes and\nprovides consistent process data. We compare established embedding methods on\nthe resulting knowledge graphs, detailing which out-of-the-box methods have the\npotential to represent procedural knowledge. This provides a baseline which can\nbe used to increase the comparability of future work. Furthermore, we validate\nthe overall characteristics of a synthesised dataset by comparing the results\nto those achievable on a real-world dataset. The framework and evaluation code,\nas well as the dataset used in the evaluation, are available open source.\n","authors":["Richard Nordsieck","André Schweizer","Michael Heider","Jörg Hähner"],"pdf_url":"https://arxiv.org/pdf/2308.08371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08366v1","updated":"2023-08-16T13:40:58Z","published":"2023-08-16T13:40:58Z","title":"Dual-Branch Temperature Scaling Calibration for Long-Tailed Recognition","summary":" The calibration for deep neural networks is currently receiving widespread\nattention and research. Miscalibration usually leads to overconfidence of the\nmodel. While, under the condition of long-tailed distribution of data, the\nproblem of miscalibration is more prominent due to the different confidence\nlevels of samples in minority and majority categories, and it will result in\nmore serious overconfidence. To address this problem, some current research\nhave designed diverse temperature coefficients for different categories based\non temperature scaling (TS) method. However, in the case of rare samples in\nminority classes, the temperature coefficient is not generalizable, and there\nis a large difference between the temperature coefficients of the training set\nand the validation set. To solve this challenge, this paper proposes a\ndual-branch temperature scaling calibration model (Dual-TS), which considers\nthe diversities in temperature parameters of different categories and the\nnon-generalizability of temperature parameters for rare samples in minority\nclasses simultaneously. Moreover, we noticed that the traditional calibration\nevaluation metric, Excepted Calibration Error (ECE), gives a higher weight to\nlow-confidence samples in the minority classes, which leads to inaccurate\nevaluation of model calibration. 
Therefore, we also propose Equal Sample Bin\nExcepted Calibration Error (Esbin-ECE) as a new calibration evaluation metric.\nThrough experiments, we demonstrate that our model yields state-of-the-art in\nboth traditional ECE and Esbin-ECE metrics.\n","authors":["Jialin Guo","Zhenyu Wu","Zhiqiang Zhan","Yang Ji"],"pdf_url":"https://arxiv.org/pdf/2308.08366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08361v1","updated":"2023-08-16T13:35:09Z","published":"2023-08-16T13:35:09Z","title":"KernelWarehouse: Towards Parameter-Efficient Dynamic Convolution","summary":" Dynamic convolution learns a linear mixture of $n$ static kernels weighted\nwith their sample-dependent attentions, demonstrating superior performance\ncompared to normal convolution. However, existing designs are\nparameter-inefficient: they increase the number of convolutional parameters by\n$n$ times. This and the optimization difficulty lead to no research progress in\ndynamic convolution that can allow us to use a significant large value of $n$\n(e.g., $n>100$ instead of typical setting $n<10$) to push forward the\nperformance boundary. In this paper, we propose $KernelWarehouse$, a more\ngeneral form of dynamic convolution, which can strike a favorable trade-off\nbetween parameter efficiency and representation power. Its key idea is to\nredefine the basic concepts of \"$kernels$\" and \"$assembling$ $kernels$\" in\ndynamic convolution from the perspective of reducing kernel dimension and\nincreasing kernel number significantly. In principle, KernelWarehouse enhances\nconvolutional parameter dependencies within the same layer and across\nsuccessive layers via tactful kernel partition and warehouse sharing, yielding\na high degree of freedom to fit a desired parameter budget. We validate our\nmethod on ImageNet and MS-COCO datasets with different ConvNet architectures,\nand show that it attains state-of-the-art results. For instance, the\nResNet18|ResNet50|MobileNetV2|ConvNeXt-Tiny model trained with KernelWarehouse\non ImageNet reaches 76.05%|81.05%|75.52%|82.51% top-1 accuracy. Thanks to its\nflexible design, KernelWarehouse can even reduce the model size of a ConvNet\nwhile improving the accuracy, e.g., our ResNet18 model with 36.45%|65.10%\nparameter reduction to the baseline shows 2.89%|2.29% absolute improvement to\ntop-1 accuracy.\n","authors":["Chao Li","Anbang Yao"],"pdf_url":"https://arxiv.org/pdf/2308.08361v1.pdf","comment":"This research work was completed and submitted in early May 2023.\n Code and pre-trained models are available at\n https://github.com/OSVAI/KernelWarehouse"},{"id":"http://arxiv.org/abs/2305.09781v2","updated":"2023-08-16T13:33:06Z","published":"2023-05-16T20:12:59Z","title":"SpecInfer: Accelerating Generative Large Language Model Serving with\n Speculative Inference and Token Tree Verification","summary":" The high computational and memory requirements of generative large language\nmodels (LLMs) make it challenging to serve them quickly and cheaply. This paper\nintroduces SpecInfer, an LLM serving system that accelerates generative LLM\ninference with speculative inference and token tree verification. A key insight\nbehind Specinfer is to combine various collectively boost-tuned small language\nmodels to jointly predict the LLM's outputs; the predictions are organized as a\ntoken tree, whose nodes each represent a candidate token sequence. 
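A hedged, illustrative comparison of the standard Expected Calibration Error (equal-width confidence bins) with an equal-sample-per-bin variant in the spirit of the Esbin-ECE metric described in the Dual-TS abstract above. The paper's exact definition may differ; the function and variable names here are assumptions.

```python
import numpy as np

def calibration_error(conf, correct, n_bins=10, equal_sample=False):
    conf = np.asarray(conf, dtype=float)
    correct = np.asarray(correct, dtype=float)
    if equal_sample:
        edges = np.quantile(conf, np.linspace(0.0, 1.0, n_bins + 1))  # same count per bin
    else:
        edges = np.linspace(0.0, 1.0, n_bins + 1)                     # same width per bin
    bins = np.clip(np.searchsorted(edges, conf, side="right") - 1, 0, n_bins - 1)
    error, n = 0.0, len(conf)
    for b in range(n_bins):
        in_bin = bins == b
        if in_bin.any():
            # |average confidence - empirical accuracy|, weighted by bin size
            error += in_bin.sum() / n * abs(conf[in_bin].mean() - correct[in_bin].mean())
    return error

conf = np.random.rand(2000)
correct = (np.random.rand(2000) < conf).astype(float)  # roughly calibrated toy predictions
print(calibration_error(conf, correct), calibration_error(conf, correct, equal_sample=True))
```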
The\ncorrectness of all candidate token sequences represented by a token tree is\nverified against the LLM in parallel using a novel tree-based parallel decoding\nmechanism. SpecInfer uses an LLM as a token tree verifier instead of an\nincremental decoder, which significantly reduces the end-to-end latency and\ncomputational requirement for serving generative LLMs while provably preserving\nmodel quality. Our evaluation shows that SpecInfer outperforms existing LLM\nserving systems by 1.3-2.4x for distributed LLM inference and by 2.6-3.5x for\noffloading-based LLM inference, while preserving the same generative\nperformance. SpecInfer is publicly available at\nhttps://github.com/flexflow/FlexFlow/tree/inference.\n","authors":["Xupeng Miao","Gabriele Oliaro","Zhihao Zhang","Xinhao Cheng","Zeyu Wang","Rae Ying Yee Wong","Alan Zhu","Lijie Yang","Xiaoxiang Shi","Chunan Shi","Zhuoming Chen","Daiyaan Arfeen","Reyna Abhyankar","Zhihao Jia"],"pdf_url":"https://arxiv.org/pdf/2305.09781v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08360v1","updated":"2023-08-16T13:32:43Z","published":"2023-08-16T13:32:43Z","title":"Independent Distribution Regularization for Private Graph Embedding","summary":" Learning graph embeddings is a crucial task in graph mining tasks. An\neffective graph embedding model can learn low-dimensional representations from\ngraph-structured data for data publishing benefiting various downstream\napplications such as node classification, link prediction, etc. However, recent\nstudies have revealed that graph embeddings are susceptible to attribute\ninference attacks, which allow attackers to infer private node attributes from\nthe learned graph embeddings. To address these concerns, privacy-preserving\ngraph embedding methods have emerged, aiming to simultaneously consider primary\nlearning and privacy protection through adversarial learning. However, most\nexisting methods assume that representation models have access to all sensitive\nattributes in advance during the training stage, which is not always the case\ndue to diverse privacy preferences. Furthermore, the commonly used adversarial\nlearning technique in privacy-preserving representation learning suffers from\nunstable training issues. In this paper, we propose a novel approach called\nPrivate Variational Graph AutoEncoders (PVGAE) with the aid of independent\ndistribution penalty as a regularization term. Specifically, we split the\noriginal variational graph autoencoder (VGAE) to learn sensitive and\nnon-sensitive latent representations using two sets of encoders. Additionally,\nwe introduce a novel regularization to enforce the independence of the\nencoders. We prove the theoretical effectiveness of regularization from the\nperspective of mutual information. Experimental results on three real-world\ndatasets demonstrate that PVGAE outperforms other baselines in private\nembedding learning regarding utility performance and privacy protection.\n","authors":["Qi Hu","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2308.08360v1.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2308.08358v1","updated":"2023-08-16T13:30:45Z","published":"2023-08-16T13:30:45Z","title":"Convergence of Two-Layer Regression with Nonlinear Units","summary":" Large language models (LLMs), such as ChatGPT and GPT4, have shown\noutstanding performance in many human life task. Attention computation plays an\nimportant role in training LLMs. Softmax unit and ReLU unit are the key\nstructure in attention computation. 
Inspired by them, we put forward a softmax\nReLU regression problem. Generally speaking, our goal is to find an optimal\nsolution to the regression problem involving the ReLU unit. In this work, we\ncalculate a closed-form representation for the Hessian of the loss function.\nUnder certain assumptions, we prove the Lipschitz continuity and the PSDness of\nthe Hessian. Then, we introduce a greedy algorithm based on the approximate Newton\nmethod, which converges in the sense of the distance to the optimal solution. Finally,\nwe relax the Lipschitz condition and prove the convergence in the sense of loss\nvalue.\n","authors":["Yichuan Deng","Zhao Song","Shenghao Xie"],"pdf_url":"https://arxiv.org/pdf/2308.08358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08354v1","updated":"2023-08-16T13:24:47Z","published":"2023-08-16T13:24:47Z","title":"Is Meta-Learning the Right Approach for the Cold-Start Problem in\n Recommender Systems?","summary":" Recommender systems have become fundamental building blocks of modern online\nproducts and services, and have a substantial impact on user experience. In the\npast few years, deep learning methods have attracted a lot of research, and are\nnow heavily used in modern real-world recommender systems. Nevertheless,\ndealing with recommendations in the cold-start setting, e.g., when a user has\ndone limited interactions in the system, is a problem that remains far from\nsolved. Meta-learning techniques, and in particular optimization-based\nmeta-learning, have recently become the most popular approaches in the academic\nresearch literature for tackling the cold-start problem in deep learning models\nfor recommender systems. However, current meta-learning approaches are not\npractical for real-world recommender systems, which have billions of users and\nitems, and strict latency requirements. In this paper we show that it is\npossible to obtain similar, or higher, performance on commonly used\nbenchmarks for the cold-start problem without using meta-learning techniques.\nIn more detail, we show that, when tuned correctly, standard and widely adopted\ndeep learning models perform just as well as newer meta-learning models. We\nfurther show that an extremely simple modular approach using common\nrepresentation learning techniques can perform comparably to meta-learning\ntechniques specifically designed for the cold-start setting while being much\nmore easily deployable in real-world applications.\n","authors":["Davide Buffelli","Ashish Gupta","Agnieszka Strzalka","Vassilis Plachouras"],"pdf_url":"https://arxiv.org/pdf/2308.08354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03589v2","updated":"2023-08-16T13:17:03Z","published":"2023-06-06T11:15:53Z","title":"How does over-squashing affect the power of GNNs?","summary":" Graph Neural Networks (GNNs) are the state-of-the-art model for machine\nlearning on graph-structured data. The most popular class of GNNs operate by\nexchanging information between adjacent nodes, and are known as Message Passing\nNeural Networks (MPNNs). Given their widespread use, understanding the\nexpressive power of MPNNs is a key question. However, existing results\ntypically consider settings with uninformative node features. In this paper, we\nprovide a rigorous analysis to determine which function classes of node\nfeatures can be learned by an MPNN of a given capacity. We do so by measuring\nthe level of pairwise interactions between nodes that MPNNs allow for. 
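A rough, hypothetical sketch of an approximate Newton iteration for a plain ReLU regression loss 0.5 * ||relu(Xw) - y||^2, a simplification of the softmax-ReLU objective discussed in the "Convergence of Two-Layer Regression with Nonlinear Units" abstract above (not the authors' algorithm). The almost-everywhere Hessian X^T diag(1[Xw > 0]) X is damped with a small ridge term, and the warm start is an assumption.

```python
import numpy as np

def newton_relu_regression(X, y, steps=20, ridge=1e-3):
    # Warm start from ordinary least squares so that some ReLU units are active.
    w = np.linalg.lstsq(X, y, rcond=None)[0]
    d = X.shape[1]
    for _ in range(steps):
        z = X @ w
        active = (z > 0).astype(float)                       # ReLU derivative (a.e.)
        grad = X.T @ (active * (np.maximum(z, 0.0) - y))     # gradient of the loss
        hess = X.T @ (X * active[:, None]) + ridge * np.eye(d)  # damped a.e. Hessian
        w = w - np.linalg.solve(hess, grad)                  # Newton step
    return w

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 4))
w_true = rng.normal(size=4)
y = np.maximum(X @ w_true, 0.0) + 0.01 * rng.normal(size=500)
w_hat = newton_relu_regression(X, y)
```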
This\nmeasure provides a novel quantitative characterization of the so-called\nover-squashing effect, which is observed to occur when a large volume of\nmessages is aggregated into fixed-size vectors. Using our measure, we prove\nthat, to guarantee sufficient communication between pairs of nodes, the\ncapacity of the MPNN must be large enough, depending on properties of the input\ngraph structure, such as commute times. For many relevant scenarios, our\nanalysis results in impossibility statements in practice, showing that\nover-squashing hinders the expressive power of MPNNs. We validate our\ntheoretical findings through extensive controlled experiments and ablation\nstudies.\n","authors":["Francesco Di Giovanni","T. Konstantin Rusch","Michael M. Bronstein","Andreea Deac","Marc Lackenby","Siddhartha Mishra","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2306.03589v2.pdf","comment":"37 pages"},{"id":"http://arxiv.org/abs/2308.08344v1","updated":"2023-08-16T13:10:27Z","published":"2023-08-16T13:10:27Z","title":"Graph Out-of-Distribution Generalization with Controllable Data\n Augmentation","summary":" Graph Neural Network (GNN) has demonstrated extraordinary performance in\nclassifying graph properties. However, due to the selection bias of training\nand testing data (e.g., training on small graphs and testing on large graphs,\nor training on dense graphs and testing on sparse graphs), distribution\ndeviation is widespread. More importantly, we often observe \\emph{hybrid\nstructure distribution shift} of both scale and density, despite of one-sided\nbiased data partition. The spurious correlations over hybrid distribution\ndeviation degrade the performance of previous GNN methods and show large\ninstability among different datasets. To alleviate this problem, we propose\n\\texttt{OOD-GMixup} to jointly manipulate the training distribution with\n\\emph{controllable data augmentation} in metric space. Specifically, we first\nextract the graph rationales to eliminate the spurious correlations due to\nirrelevant information. Secondly, we generate virtual samples with perturbation\non graph rationale representation domain to obtain potential OOD training\nsamples. Finally, we propose OOD calibration to measure the distribution\ndeviation of virtual samples by leveraging Extreme Value Theory, and further\nactively control the training distribution by emphasizing the impact of virtual\nOOD samples. Extensive studies on several real-world datasets on graph\nclassification demonstrate the superiority of our proposed method over\nstate-of-the-art baselines.\n","authors":["Bin Lu","Xiaoying Gan","Ze Zhao","Shiyu Liang","Luoyi Fu","Xinbing Wang","Chenghu Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.08344v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.08334v1","updated":"2023-08-16T12:50:10Z","published":"2023-08-16T12:50:10Z","title":"Learning Logic Programs by Discovering Higher-Order Abstractions","summary":" Discovering novel abstractions is important for human-level AI. We introduce\nan approach to discover higher-order abstractions, such as map, filter, and\nfold. We focus on inductive logic programming, which induces logic programs\nfrom examples and background knowledge. We introduce the higher-order\nrefactoring problem, where the goal is to compress a logic program by\nintroducing higher-order abstractions. We implement our approach in STEVIE,\nwhich formulates the higher-order refactoring problem as a constraint\noptimisation problem. 
Our experimental results on multiple domains, including\nprogram synthesis and visual reasoning, show that, compared to no refactoring,\nSTEVIE can improve predictive accuracies by 27% and reduce learning times by\n47%. We also show that STEVIE can discover abstractions that transfer to\ndifferent domains\n","authors":["Céline Hocquette","Sebastijan Dumančić","Andrew Cropper"],"pdf_url":"https://arxiv.org/pdf/2308.08334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01631v2","updated":"2023-08-16T12:30:27Z","published":"2023-06-02T15:49:45Z","title":"Bi-level Contrastive Learning for Knowledge-Enhanced Molecule\n Representations","summary":" Molecule representation learning underpins diverse downstream applications\nsuch as molecular property and side effect understanding and prediction. In\nthis paper, we recognize the two-level structure of individual molecule as\nhaving intrinsic graph structure as well as being a node in a large molecule\nknowledge graph, and present GODE, a new approach that seamlessly integrates\ngraph representations of individual molecules with multi-domain biomedical data\nfrom knowledge graphs. By pre-training two graph neural networks (GNNs) on\ndifferent graph structures, combined with contrastive learning, GODE adeptly\nfuses molecular structures with their corresponding knowledge graph\nsubstructures. This fusion results in a more robust and informative\nrepresentation, enhancing molecular property prediction by harnessing both\nchemical and biological information. Finetuned on 11 chemical property tasks,\nour model surpasses benchmarks, achieving an average ROC-AUC improvement of\n14.5%, 9.8%, and 7.3% on BBBP, SIDER, and Tox21 datasets. In regression tasks\non ESOL and QM7 datasets, we achieve average improvements of 21.0% and 29.6%\nimprovements in RMSE and MAE, setting a new field benchmark.\n","authors":["Pengcheng Jiang","Cao Xiao","Tianfan Fu","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2306.01631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08305v1","updated":"2023-08-16T12:08:50Z","published":"2023-08-16T12:08:50Z","title":"Warped geometric information on the optimisation of Euclidean functions","summary":" We consider the fundamental task of optimizing a real-valued function defined\nin a potentially high-dimensional Euclidean space, such as the loss function in\nmany machine-learning tasks or the logarithm of the probability distribution in\nstatistical inference. We use the warped Riemannian geometry notions to\nredefine the optimisation problem of a function on Euclidean space to a\nRiemannian manifold with a warped metric, and then find the function's optimum\nalong this manifold. The warped metric chosen for the search domain induces a\ncomputational friendly metric-tensor for which optimal search directions\nassociate with geodesic curves on the manifold becomes easier to compute.\nPerforming optimization along geodesics is known to be generally infeasible,\nyet we show that in this specific manifold we can analytically derive Taylor\napproximations up to third-order. In general these approximations to the\ngeodesic curve will not lie on the manifold, however we construct suitable\nretraction maps to pull them back onto the manifold. Therefore, we can\nefficiently optimize along the approximate geodesic curves. We cover the\nrelated theory, describe a practical optimization algorithm and empirically\nevaluate it on a collection of challenging optimisation benchmarks. 
Our\nproposed algorithm, using third-order approximation of geodesics, outperforms\nstandard Euclidean gradient-based counterparts in term of number of iterations\nuntil convergence and an alternative method for Hessian-based optimisation\nroutines.\n","authors":["Marcelo Hartmann","Bernardo Williams","Hanlin Yu","Mark Girolami","Alessandro Barp","Arto Klami"],"pdf_url":"https://arxiv.org/pdf/2308.08305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14036v2","updated":"2023-08-16T12:03:47Z","published":"2023-02-27T18:47:55Z","title":"Text-only domain adaptation for end-to-end ASR using integrated\n text-to-mel-spectrogram generator","summary":" We propose an end-to-end Automatic Speech Recognition (ASR) system that can\nbe trained on transcribed speech data, text-only data, or a mixture of both.\nThe proposed model uses an integrated auxiliary block for text-based training.\nThis block combines a non-autoregressive multi-speaker text-to-mel-spectrogram\ngenerator with a GAN-based enhancer to improve the spectrogram quality. The\nproposed system can generate a mel-spectrogram dynamically during training. It\ncan be used to adapt the ASR model to a new domain by using text-only data from\nthis domain. We demonstrate that the proposed training method significantly\nimproves ASR accuracy compared to the system trained on transcribed speech\nonly. It also surpasses cascade TTS systems with the vocoder in the adaptation\nquality and training speed.\n","authors":["Vladimir Bataev","Roman Korostik","Evgeny Shabalin","Vitaly Lavrukhin","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2302.14036v2.pdf","comment":"Accepted to INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2308.08291v1","updated":"2023-08-16T11:31:18Z","published":"2023-08-16T11:31:18Z","title":"Robust Bayesian Satisficing","summary":" Distributional shifts pose a significant challenge to achieving robustness in\ncontemporary machine learning. To overcome this challenge, robust satisficing\n(RS) seeks a robust solution to an unspecified distributional shift while\nachieving a utility above a desired threshold. This paper focuses on the\nproblem of RS in contextual Bayesian optimization when there is a discrepancy\nbetween the true and reference distributions of the context. We propose a novel\nrobust Bayesian satisficing algorithm called RoBOS for noisy black-box\noptimization. Our algorithm guarantees sublinear lenient regret under certain\nassumptions on the amount of distribution shift. In addition, we define a\nweaker notion of regret called robust satisficing regret, in which our\nalgorithm achieves a sublinear upper bound independent of the amount of\ndistribution shift. To demonstrate the effectiveness of our method, we apply it\nto various learning problems and compare it to other approaches, such as\ndistributionally robust optimization.\n","authors":["Artun Saday","Yaşar Cahit Yıldırım","Cem Tekin"],"pdf_url":"https://arxiv.org/pdf/2308.08291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12791v2","updated":"2023-08-16T11:23:05Z","published":"2022-11-23T09:12:17Z","title":"An ensemble of VisNet, Transformer-M, and pretraining models for\n molecular property prediction in OGB Large-Scale Challenge @ NeurIPS 2022","summary":" In the technical report, we provide our solution for OGB-LSC 2022 Graph\nRegression Task. The target of this task is to predict the quantum chemical\nproperty, HOMO-LUMO gap for a given molecule on PCQM4Mv2 dataset. 
In the\ncompetition, we designed two kinds of models: Transformer-M-ViSNet which is an\ngeometry-enhanced graph neural network for fully connected molecular graphs and\nPretrained-3D-ViSNet which is a pretrained ViSNet by distilling geomeotric\ninformation from optimized structures. With an ensemble of 22 models, ViSNet\nTeam achieved the MAE of 0.0723 eV on the test-challenge set, dramatically\nreducing the error by 39.75% compared with the best method in the last year\ncompetition.\n","authors":["Yusong Wang","Shaoning Li","Zun Wang","Xinheng He","Bin Shao","Tie-Yan Liu","Tong Wang"],"pdf_url":"https://arxiv.org/pdf/2211.12791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08290v1","updated":"2023-08-16T11:22:36Z","published":"2023-08-16T11:22:36Z","title":"DFedADMM: Dual Constraints Controlled Model Inconsistency for\n Decentralized Federated Learning","summary":" To address the communication burden issues associated with federated learning\n(FL), decentralized federated learning (DFL) discards the central server and\nestablishes a decentralized communication network, where each client\ncommunicates only with neighboring clients. However, existing DFL methods still\nsuffer from two major challenges: local inconsistency and local heterogeneous\noverfitting, which have not been fundamentally addressed by existing DFL\nmethods. To tackle these issues, we propose novel DFL algorithms, DFedADMM and\nits enhanced version DFedADMM-SAM, to enhance the performance of DFL. The\nDFedADMM algorithm employs primal-dual optimization (ADMM) by utilizing dual\nvariables to control the model inconsistency raised from the decentralized\nheterogeneous data distributions. The DFedADMM-SAM algorithm further improves\non DFedADMM by employing a Sharpness-Aware Minimization (SAM) optimizer, which\nuses gradient perturbations to generate locally flat models and searches for\nmodels with uniformly low loss values to mitigate local heterogeneous\noverfitting. Theoretically, we derive convergence rates of $\\small\n\\mathcal{O}\\Big(\\frac{1}{\\sqrt{KT}}+\\frac{1}{KT(1-\\psi)^2}\\Big)$ and $\\small\n\\mathcal{O}\\Big(\\frac{1}{\\sqrt{KT}}+\\frac{1}{KT(1-\\psi)^2}+\n\\frac{1}{T^{3/2}K^{1/2}}\\Big)$ in the non-convex setting for DFedADMM and\nDFedADMM-SAM, respectively, where $1 - \\psi$ represents the spectral gap of the\ngossip matrix. Empirically, extensive experiments on MNIST, CIFAR10 and\nCIFAR100 datesets demonstrate that our algorithms exhibit superior performance\nin terms of both generalization and convergence speed compared to existing\nstate-of-the-art (SOTA) optimizers in DFL.\n","authors":["Qinglun Li","Li Shen","Guanghao Li","Quanjun Yin","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2308.08290v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2305.13873v2","updated":"2023-08-16T11:16:15Z","published":"2023-05-23T09:48:16Z","title":"Unsafe Diffusion: On the Generation of Unsafe Images and Hateful Memes\n From Text-To-Image Models","summary":" State-of-the-art Text-to-Image models like Stable Diffusion and DALLE$\\cdot$2\nare revolutionizing how people generate visual content. At the same time,\nsociety has serious concerns about how adversaries can exploit such models to\ngenerate unsafe images. In this work, we focus on demystifying the generation\nof unsafe images and hateful memes from Text-to-Image models. We first\nconstruct a typology of unsafe images consisting of five categories (sexually\nexplicit, violent, disturbing, hateful, and political). 
Then, we assess the\nproportion of unsafe images generated by four advanced Text-to-Image models\nusing four prompt datasets. We find that these models can generate a\nsubstantial percentage of unsafe images; across four models and four prompt\ndatasets, 14.56% of all generated images are unsafe. When comparing the four\nmodels, we find different risk levels, with Stable Diffusion being the most\nprone to generating unsafe content (18.92% of all generated images are unsafe).\nGiven Stable Diffusion's tendency to generate more unsafe content, we evaluate\nits potential to generate hateful meme variants if exploited by an adversary to\nattack a specific individual or community. We employ three image editing\nmethods, DreamBooth, Textual Inversion, and SDEdit, which are supported by\nStable Diffusion. Our evaluation result shows that 24% of the generated images\nusing DreamBooth are hateful meme variants that present the features of the\noriginal hateful meme and the target individual/community; these generated\nimages are comparable to hateful meme variants collected from the real world.\nOverall, our results demonstrate that the danger of large-scale generation of\nunsafe images is imminent. We discuss several mitigating measures, such as\ncurating training data, regulating prompts, and implementing safety filters,\nand encourage better safeguard tools to be developed to prevent unsafe\ngeneration.\n","authors":["Yiting Qu","Xinyue Shen","Xinlei He","Michael Backes","Savvas Zannettou","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13873v2.pdf","comment":"To Appear in the ACM Conference on Computer and Communications\n Security, November 26, 2023"},{"id":"http://arxiv.org/abs/2302.06608v3","updated":"2023-08-16T11:12:42Z","published":"2023-02-13T18:59:52Z","title":"3D-aware Blending with Generative NeRFs","summary":" Image blending aims to combine multiple images seamlessly. It remains\nchallenging for existing 2D-based methods, especially when input images are\nmisaligned due to differences in 3D camera poses and object shapes. To tackle\nthese issues, we propose a 3D-aware blending method using generative Neural\nRadiance Fields (NeRF), including two key components: 3D-aware alignment and\n3D-aware blending. For 3D-aware alignment, we first estimate the camera pose of\nthe reference image with respect to generative NeRFs and then perform 3D local\nalignment for each part. To further leverage 3D information of the generative\nNeRF, we propose 3D-aware blending that directly blends images on the NeRF's\nlatent representation space, rather than raw pixel space. Collectively, our\nmethod outperforms existing 2D baselines, as validated by extensive\nquantitative and qualitative evaluations with FFHQ and AFHQ-Cat.\n","authors":["Hyunsu Kim","Gayoung Lee","Yunjey Choi","Jin-Hwa Kim","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.06608v3.pdf","comment":"ICCV 2023, Project page: https://blandocs.github.io/blendnerf"},{"id":"http://arxiv.org/abs/2209.04744v2","updated":"2023-08-16T11:03:47Z","published":"2022-09-10T20:40:30Z","title":"Active Learning for Optimal Intervention Design in Causal Models","summary":" Sequential experimental design to discover interventions that achieve a\ndesired outcome is a key problem in various domains including science,\nengineering and public policy. When the space of possible interventions is\nlarge, making an exhaustive search infeasible, experimental design strategies\nare needed. 
In this context, encoding the causal relationships between the\nvariables, and thus the effect of interventions on the system, is critical for\nidentifying desirable interventions more efficiently. Here, we develop a causal\nactive learning strategy to identify interventions that are optimal, as\nmeasured by the discrepancy between the post-interventional mean of the\ndistribution and a desired target mean. The approach employs a Bayesian update\nfor the causal model and prioritizes interventions using a carefully designed,\ncausally informed acquisition function. This acquisition function is evaluated\nin closed form, allowing for fast optimization. The resulting algorithms are\ntheoretically grounded with information-theoretic bounds and provable\nconsistency results for linear causal models with known causal graph. We apply\nour approach to both synthetic data and single-cell transcriptomic data from\nPerturb-CITE-seq experiments to identify optimal perturbations that induce a\nspecific cell state transition. The causally informed acquisition function\ngenerally outperforms existing criteria allowing for optimal intervention\ndesign with fewer but carefully selected samples.\n","authors":["Jiaqi Zhang","Louis Cammarata","Chandler Squires","Themistoklis P. Sapsis","Caroline Uhler"],"pdf_url":"https://arxiv.org/pdf/2209.04744v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.10405v5","updated":"2023-08-16T10:57:58Z","published":"2023-01-25T04:45:06Z","title":"Editing Language Model-based Knowledge Graph Embeddings","summary":" Recently decades have witnessed the empirical success of framing Knowledge\nGraph (KG) embeddings via language models. However, language model-based KG\nembeddings are usually deployed as static artifacts, making them difficult to\nmodify post-deployment without re-training after deployment. To address this\nissue, we propose a new task of editing language model-based KG embeddings in\nthis paper. This task is designed to facilitate rapid, data-efficient updates\nto KG embeddings without compromising the performance of other aspects. We\nbuild four new datasets: E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and\nevaluate several knowledge editing baselines demonstrating the limited ability\nof previous models to handle the proposed challenging task. We further propose\na simple yet strong baseline dubbed KGEditor, which utilizes additional\nparametric layers of the hyper network to edit/add facts. Our comprehensive\nexperimental results reveal that KGEditor excels in updating specific facts\nwithout impacting the overall performance, even when faced with limited\ntraining resources. Code and datasets are available in\nhttps://github.com/zjunlp/PromptKG/tree/main/deltaKG.\n","authors":["Siyuan Cheng","Ningyu Zhang","Bozhong Tian","Xi Chen","Qingbing Liu","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2301.10405v5.pdf","comment":"Work in progress and the project website is\n https://zjunlp.github.io/project/KGE_Editing/"},{"id":"http://arxiv.org/abs/2308.08283v1","updated":"2023-08-16T10:51:27Z","published":"2023-08-16T10:51:27Z","title":"CARE: A Large Scale CT Image Dataset and Clinical Applicable Benchmark\n Model for Rectal Cancer Segmentation","summary":" Rectal cancer segmentation of CT image plays a crucial role in timely\nclinical diagnosis, radiotherapy treatment, and follow-up. Although current\nsegmentation methods have shown promise in delineating cancerous tissues, they\nstill encounter challenges in achieving high segmentation precision. 
These\nobstacles arise from the intricate anatomical structures of the rectum and the\ndifficulties in performing differential diagnosis of rectal cancer.\nAdditionally, a major obstacle is the lack of a large-scale, finely annotated\nCT image dataset for rectal cancer segmentation. To address these issues, this\nwork introduces a novel large scale rectal cancer CT image dataset CARE with\npixel-level annotations for both normal and cancerous rectum, which serves as a\nvaluable resource for algorithm research and clinical application development.\nMoreover, we propose a novel medical cancer lesion segmentation benchmark model\nnamed U-SAM. The model is specifically designed to tackle the challenges posed\nby the intricate anatomical structures of abdominal organs by incorporating\nprompt information. U-SAM contains three key components: promptable information\n(e.g., points) to aid in target area localization, a convolution module for\ncapturing low-level lesion details, and skip-connections to preserve and\nrecover spatial information during the encoding-decoding process. To evaluate\nthe effectiveness of U-SAM, we systematically compare its performance with\nseveral popular segmentation methods on the CARE dataset. The generalization of\nthe model is further verified on the WORD dataset. Extensive experiments\ndemonstrate that the proposed U-SAM outperforms state-of-the-art methods on\nthese two datasets. These experiments can serve as the baseline for future\nresearch and clinical application development.\n","authors":["Hantao Zhang","Weidong Guo","Chenyang Qiu","Shouhong Wan","Bingbing Zou","Wanqin Wang","Peiquan Jin"],"pdf_url":"https://arxiv.org/pdf/2308.08283v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.07118v2","updated":"2023-08-16T10:46:35Z","published":"2023-08-14T12:57:12Z","title":"Neural radiance fields in the industrial and robotics domain:\n applications, research opportunities and use cases","summary":" The proliferation of technologies, such as extended reality (XR), has\nincreased the demand for high-quality three-dimensional (3D) graphical\nrepresentations. Industrial 3D applications encompass computer-aided design\n(CAD), finite element analysis (FEA), scanning, and robotics. However, current\nmethods employed for industrial 3D representations suffer from high\nimplementation costs and reliance on manual human input for accurate 3D\nmodeling. To address these challenges, neural radiance fields (NeRFs) have\nemerged as a promising approach for learning 3D scene representations based on\nprovided training 2D images. Despite a growing interest in NeRFs, their\npotential applications in various industrial subdomains are still unexplored.\nIn this paper, we deliver a comprehensive examination of NeRF industrial\napplications while also providing direction for future research endeavors. We\nalso present a series of proof-of-concept experiments that demonstrate the\npotential of NeRFs in the industrial domain. These experiments include\nNeRF-based video compression techniques and using NeRFs for 3D motion\nestimation in the context of collision avoidance. In the video compression\nexperiment, our results show compression savings up to 48\\% and 74\\% for\nresolutions of 1920x1080 and 300x168, respectively. 
The motion estimation\nexperiment used a 3D animation of a robotic arm to train Dynamic-NeRF (D-NeRF)\nand achieved an average peak signal-to-noise ratio (PSNR) of disparity map with\nthe value of 23 dB and an structural similarity index measure (SSIM) 0.97.\n","authors":["Eugen Šlapak","Enric Pardo","Matúš Dopiriak","Taras Maksymyuk","Juraj Gazda"],"pdf_url":"https://arxiv.org/pdf/2308.07118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08268v1","updated":"2023-08-16T10:09:42Z","published":"2023-08-16T10:09:42Z","title":"It Ain't That Bad: Understanding the Mysterious Performance Drop in OOD\n Generalization for Generative Transformer Models","summary":" Generative Transformer-based models have achieved remarkable proficiency on\nsolving diverse problems. However, their generalization ability is not fully\nunderstood and not always satisfying. Researchers take basic mathematical tasks\nlike n-digit addition or multiplication as important perspectives for\ninvestigating their generalization behaviors. Curiously, it is observed that\nwhen training on n-digit operations (e.g., additions) in which both input\noperands are n-digit in length, models generalize successfully on unseen\nn-digit inputs (in-distribution (ID) generalization), but fail miserably and\nmysteriously on longer, unseen cases (out-of-distribution (OOD)\ngeneralization). Studies try to bridge this gap with workarounds such as\nmodifying position embedding, fine-tuning, and priming with more extensive or\ninstructive data. However, without addressing the essential mechanism, there is\nhardly any guarantee regarding the robustness of these solutions. We bring this\nunexplained performance drop into attention and ask whether it is purely from\nrandom errors. Here we turn to the mechanistic line of research which has\nnotable successes in model interpretability. We discover that the strong ID\ngeneralization stems from structured representations, while behind the\nunsatisfying OOD performance, the models still exhibit clear learned algebraic\nstructures. Specifically, these models map unseen OOD inputs to outputs with\nequivalence relations in the ID domain. These highlight the potential of the\nmodels to carry useful information for improved generalization.\n","authors":["Xingcheng Xu","Zihao Pan","Haipeng Zhang","Yanqing Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.03973v7","updated":"2023-08-16T10:06:53Z","published":"2021-02-08T02:51:34Z","title":"STS-GAN: Can We Synthesize Solid Texture with High Fidelity from\n Arbitrary 2D Exemplar?","summary":" Solid texture synthesis (STS), an effective way to extend a 2D exemplar to a\n3D solid volume, exhibits advantages in computational photography. However,\nexisting methods generally fail to accurately learn arbitrary textures, which\nmay result in the failure to synthesize solid textures with high fidelity. In\nthis paper, we propose a novel generative adversarial nets-based framework\n(STS-GAN) to extend the given 2D exemplar to arbitrary 3D solid textures. In\nSTS-GAN, multi-scale 2D texture discriminators evaluate the similarity between\nthe given 2D exemplar and slices from the generated 3D texture, promoting the\n3D texture generator synthesizing realistic solid textures. 
Finally,\nexperiments demonstrate that the proposed method can generate high-fidelity\nsolid textures with similar visual characteristics to the 2D exemplar.\n","authors":["Xin Zhao","Jifeng Guo","Lin Wang","Fanqi Li","Jiahao Li","Junteng Zheng","Bo Yang"],"pdf_url":"https://arxiv.org/pdf/2102.03973v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08259v1","updated":"2023-08-16T09:53:20Z","published":"2023-08-16T09:53:20Z","title":"Graph Relation Aware Continual Learning","summary":" Continual graph learning (CGL) studies the problem of learning from an\ninfinite stream of graph data, consolidating historical knowledge, and\ngeneralizing it to future tasks. At any time, only the current graph data are\navailable. Although some recent attempts have been made to handle this task, we\nstill face two potential challenges: 1) most existing works only manipulate\nthe intermediate graph embedding and ignore intrinsic properties of graphs.\nIt is non-trivial to differentiate the transferred information across graphs.\n2) recent attempts take a parameter-sharing policy to transfer knowledge across\ntime steps or progressively expand the architecture given a shifted graph\ndistribution. Learning a single model could lose discriminative information for\neach graph task, while the model expansion scheme suffers from high model\ncomplexity. In this paper, we point out that latent relations behind graph\nedges can be regarded as an invariant factor for the evolving graphs and that the\nstatistical information of latent relations evolves. Motivated by this, we\ndesign a relation-aware adaptive model, dubbed RAM-CG, that consists of a\nrelation-discovery module to explore latent relations behind edges and a\ntask-awareness masking classifier to account for the shifted distribution. Extensive\nexperiments show that RAM-CG provides significant 2.2%, 6.9% and 6.6% accuracy\nimprovements over the state-of-the-art results on the CitationNet, OGBN-arxiv and\nTWITCH datasets, respectively.\n","authors":["Qinghua Shen","Weijieying Ren","Wei Qin"],"pdf_url":"https://arxiv.org/pdf/2308.08259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08247v1","updated":"2023-08-16T09:28:55Z","published":"2023-08-16T09:28:55Z","title":"Two Phases of Scaling Laws for Nearest Neighbor Classifiers","summary":" A scaling law refers to the observation that the test performance of a model\nimproves as the amount of training data increases. A fast scaling law implies\nthat one can solve machine learning problems by simply boosting the data and\nthe model sizes. Yet, in many cases, the benefit of adding more data can be\nnegligible. In this work, we study the rate of scaling laws of nearest neighbor\nclassifiers. We show that a scaling law can have two phases: in the first\nphase, the generalization error depends polynomially on the data dimension and\ndecreases fast; whereas in the second phase, the error depends exponentially on\nthe data dimension and decreases slowly. Our analysis highlights the complexity\nof the data distribution in determining the generalization error. 
When the data\ndistribution is benign, our result suggests that the nearest neighbor classifier can\nachieve a generalization error that depends polynomially, instead of\nexponentially, on the data dimension.\n","authors":["Pengkun Yang","Jingzhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16296v2","updated":"2023-08-16T09:28:17Z","published":"2023-06-28T15:17:59Z","title":"Relevant Entity Selection: Knowledge Graph Bootstrapping via Zero-Shot\n Analogical Pruning","summary":" Knowledge Graph Construction (KGC) can be seen as an iterative process\nstarting from a high-quality nucleus that is refined by knowledge extraction\napproaches in a virtuous loop. Such a nucleus can be obtained from knowledge\nexisting in an open KG like Wikidata. However, due to the size of such generic\nKGs, integrating them as a whole may entail irrelevant content and scalability\nissues. We propose an analogy-based approach that starts from seed entities of\ninterest in a generic KG, and keeps or prunes their neighboring entities. We\nevaluate our approach on Wikidata through two manually labeled datasets that\ncontain either domain-homogeneous or -heterogeneous seed entities. We\nempirically show that our analogy-based approach outperforms LSTM, Random\nForest, SVM, and MLP, with a drastically lower number of parameters. We also\nevaluate its generalization potential in a transfer learning setting. These\nresults advocate for the further integration of analogy-based inference in\ntasks related to the KG lifecycle.\n","authors":["Lucas Jarnac","Miguel Couceiro","Pierre Monnin"],"pdf_url":"https://arxiv.org/pdf/2306.16296v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18477v3","updated":"2023-08-16T09:23:37Z","published":"2023-05-29T11:05:20Z","title":"Beyond the Meta: Leveraging Game Design Parameters for Patch-Agnostic\n Esport Analytics","summary":" Esport games comprise a sizeable fraction of the global games market, and are\nthe fastest-growing segment in games. This has given rise to the domain of\nesports analytics, which uses telemetry data from games to inform players,\ncoaches, broadcasters and other stakeholders. Compared to traditional sports,\nesport titles change rapidly, in terms of mechanics as well as rules. Due to\nthese frequent changes to the parameters of the game, esport analytics models\ncan have a short life-span, a problem which is largely ignored within the\nliterature. This paper extracts information from game design (i.e. patch notes)\nand utilises clustering techniques to propose a new form of character\nrepresentation. As a case study, a neural network model is trained to predict\nthe number of kills in a Dota 2 match utilising this novel character\nrepresentation technique. The performance of this model is then evaluated\nagainst two distinct baselines, including conventional techniques. Not only did\nthe model significantly outperform the baselines in terms of accuracy (85%\nAUC), but it also maintained its accuracy in two newer iterations of the\ngame that introduced one new character and a brand new character type. These\nchanges introduced to the design of the game would typically break conventional\ntechniques that are commonly used within the literature. 
Therefore, the\nproposed methodology for representing characters can increase the life-span of\nmachine learning models as well as contribute to higher performance when\ncompared to traditional techniques typically employed within the literature.\n","authors":["Alan Pedrassoli Chitayat","Florian Block","James Walker","Anders Drachen"],"pdf_url":"https://arxiv.org/pdf/2305.18477v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15121v2","updated":"2023-08-16T09:19:20Z","published":"2023-05-24T13:13:26Z","title":"Beyond Individual Input for Deep Anomaly Detection on Tabular Data","summary":" Anomaly detection is crucial in various domains, such as finance, healthcare,\nand cybersecurity. In this paper, we propose a novel deep anomaly detection\nmethod for tabular data that leverages Non-Parametric Transformers (NPTs), a\nmodel initially proposed for supervised tasks, to capture both feature-feature\nand sample-sample dependencies. In a reconstruction-based framework, we train\nthe NPT to reconstruct masked features of normal samples. In a non-parametric\nfashion, we leverage the whole training set during inference and use the\nmodel's ability to reconstruct the masked features to generate an\nanomaly score. To the best of our knowledge, our proposed method is the first\nto successfully combine feature-feature and sample-sample dependencies for\nanomaly detection on tabular datasets. We evaluate our method on an extensive\nbenchmark of 31 tabular datasets and demonstrate that our approach outperforms\nexisting state-of-the-art methods based on the F1-score and AUROC by a\nsignificant margin.\n","authors":["Hugo Thimonier","Fabrice Popineau","Arpad Rimmel","Bich-Liên Doan"],"pdf_url":"https://arxiv.org/pdf/2305.15121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08235v1","updated":"2023-08-16T09:12:21Z","published":"2023-08-16T09:12:21Z","title":"The Expressive Power of Graph Neural Networks: A Survey","summary":" Graph neural networks (GNNs) are effective machine learning models for many\ngraph-related applications. Despite their empirical success, many research\nefforts focus on the theoretical limitations of GNNs, i.e., the GNNs' expressive\npower. Early works in this domain mainly focus on studying the graph\nisomorphism recognition ability of GNNs, and recent works try to leverage\nproperties such as subgraph counting and connectivity learning to characterize\nthe expressive power of GNNs, which are more practical and closer to\nreal-world settings. However, no survey paper or open-source repository\ncomprehensively summarizes and discusses models in this important direction. To\nfill the gap, we conduct the first survey of models for enhancing expressive\npower under different forms of definition. 
Concretely, the models are reviewed\nbased on three categories, i.e., Graph feature enhancement, Graph topology\nenhancement, and GNNs architecture enhancement.\n","authors":["Bingxu Zhang","Changjun Fan","Shixuan Liu","Kuihua Huang","Xiang Zhao","Jincai Huang","Zhong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.08235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08234v1","updated":"2023-08-16T09:11:00Z","published":"2023-08-16T09:11:00Z","title":"Challenges and Opportunities of Using Transformer-Based Multi-Task\n Learning in NLP Through ML Lifecycle: A Survey","summary":" The increasing adoption of natural language processing (NLP) models across\nindustries has led to practitioners' need for machine learning systems to\nhandle these models efficiently, from training to serving them in production.\nHowever, training, deploying, and updating multiple models can be complex,\ncostly, and time-consuming, mainly when using transformer-based pre-trained\nlanguage models. Multi-Task Learning (MTL) has emerged as a promising approach\nto improve efficiency and performance through joint training, rather than\ntraining separate models. Motivated by this, we first provide an overview of\ntransformer-based MTL approaches in NLP. Then, we discuss the challenges and\nopportunities of using MTL approaches throughout typical ML lifecycle phases,\nspecifically focusing on the challenges related to data engineering, model\ndevelopment, deployment, and monitoring phases. This survey focuses on\ntransformer-based MTL architectures and, to the best of our knowledge, is novel\nin that it systematically analyses how transformer-based MTL in NLP fits into\nML lifecycle phases. Furthermore, we motivate research on the connection\nbetween MTL and continual learning (CL), as this area remains unexplored. We\nbelieve it would be practical to have a model that can handle both MTL and CL,\nas this would make it easier to periodically re-train the model, update it due\nto distribution shifts, and add new capabilities to meet real-world\nrequirements.\n","authors":["Lovre Torbarina","Tin Ferkovic","Lukasz Roguski","Velimir Mihelcic","Bruno Sarlija","Zeljko Kraljevic"],"pdf_url":"https://arxiv.org/pdf/2308.08234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08232v1","updated":"2023-08-16T09:06:54Z","published":"2023-08-16T09:06:54Z","title":"SCQPTH: an efficient differentiable splitting method for convex\n quadratic programming","summary":" We present SCQPTH: a differentiable first-order splitting method for convex\nquadratic programs. The SCQPTH framework is based on the alternating direction\nmethod of multipliers (ADMM) and the software implementation is motivated by\nthe state-of-the art solver OSQP: an operating splitting solver for convex\nquadratic programs (QPs). The SCQPTH software is made available as an\nopen-source python package and contains many similar features including\nefficient reuse of matrix factorizations, infeasibility detection, automatic\nscaling and parameter selection. The forward pass algorithm performs operator\nsplitting in the dimension of the original problem space and is therefore\nsuitable for large scale QPs with $100-1000$ decision variables and thousands\nof constraints. Backpropagation is performed by implicit differentiation of the\nADMM fixed-point mapping. 
Experiments demonstrate that for large scale QPs,\nSCQPTH can provide a $1\\times - 10\\times$ improvement in computational\nefficiency in comparison to existing differentiable QP solvers.\n","authors":["Andrew Butler"],"pdf_url":"https://arxiv.org/pdf/2308.08232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03496v2","updated":"2023-08-16T09:05:42Z","published":"2023-04-07T06:36:41Z","title":"Architecture-Preserving Provable Repair of Deep Neural Networks","summary":" Deep neural networks (DNNs) are becoming increasingly important components of\nsoftware, and are considered the state-of-the-art solution for a number of\nproblems, such as image recognition. However, DNNs are far from infallible, and\nincorrect behavior of DNNs can have disastrous real-world consequences. This\npaper addresses the problem of architecture-preserving V-polytope provable\nrepair of DNNs. A V-polytope defines a convex bounded polytope using its vertex\nrepresentation. V-polytope provable repair guarantees that the repaired DNN\nsatisfies the given specification on the infinite set of points in the given\nV-polytope. An architecture-preserving repair only modifies the parameters of\nthe DNN, without modifying its architecture. The repair has the flexibility to\nmodify multiple layers of the DNN, and runs in polynomial time. It supports\nDNNs with activation functions that have some linear pieces, as well as\nfully-connected, convolutional, pooling and residual layers. To the best our\nknowledge, this is the first provable repair approach that has all of these\nfeatures. We implement our approach in a tool called APRNN. Using MNIST,\nImageNet, and ACAS Xu DNNs, we show that it has better efficiency, scalability,\nand generalization compared to PRDNN and REASSURE, prior provable repair\nmethods that are not architecture preserving.\n","authors":["Zhe Tao","Stephanie Nawas","Jacqueline Mitchell","Aditya V. Thakur"],"pdf_url":"https://arxiv.org/pdf/2304.03496v2.pdf","comment":"Accepted paper at PLDI 2023. Tool is available at\n https://github.com/95616ARG/APRNN/"},{"id":"http://arxiv.org/abs/2308.08230v1","updated":"2023-08-16T09:03:13Z","published":"2023-08-16T09:03:13Z","title":"Exploring Winograd Convolution for Cost-effective Neural Network Fault\n Tolerance","summary":" Winograd is generally utilized to optimize convolution performance and\ncomputational efficiency because of the reduced multiplication operations, but\nthe reliability issues brought by winograd are usually overlooked. In this\nwork, we observe the great potential of winograd convolution in improving\nneural network (NN) fault tolerance. Based on the observation, we evaluate\nwinograd convolution fault tolerance comprehensively from different\ngranularities ranging from models, layers, and operation types for the first\ntime. Then, we explore the use of inherent fault tolerance of winograd\nconvolution for cost-effective NN protection against soft errors. Specifically,\nwe mainly investigate how winograd convolution can be effectively incorporated\nwith classical fault-tolerant design approaches including triple modular\nredundancy (TMR), fault-aware retraining, and constrained activation functions.\nAccording to our experiments, winograd convolution can reduce the\nfault-tolerant design overhead by 55.77\\% on average without any accuracy loss\ncompared to standard convolution, and further reduce the computing overhead by\n17.24\\% when the inherent fault tolerance of winograd convolution is\nconsidered. 
When it is applied on fault-tolerant neural networks enhanced with\nfault-aware retraining and constrained activation functions, the resulting\nmodel accuracy generally shows significant improvement in presence of various\nfaults.\n","authors":["Xinghua Xue","Cheng Liu","Bo Liu","Haitong Huang","Ying Wang","Tao Luo","Lei Zhang","Huawei Li","Xiaowei Li"],"pdf_url":"https://arxiv.org/pdf/2308.08230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08227v1","updated":"2023-08-16T08:58:25Z","published":"2023-08-16T08:58:25Z","title":"Inherent Redundancy in Spiking Neural Networks","summary":" Spiking Neural Networks (SNNs) are well known as a promising energy-efficient\nalternative to conventional artificial neural networks. Subject to the\npreconceived impression that SNNs are sparse firing, the analysis and\noptimization of inherent redundancy in SNNs have been largely overlooked, thus\nthe potential advantages of spike-based neuromorphic computing in accuracy and\nenergy efficiency are interfered. In this work, we pose and focus on three key\nquestions regarding the inherent redundancy in SNNs. We argue that the\nredundancy is induced by the spatio-temporal invariance of SNNs, which enhances\nthe efficiency of parameter utilization but also invites lots of noise spikes.\nFurther, we analyze the effect of spatio-temporal invariance on the\nspatio-temporal dynamics and spike firing of SNNs. Then, motivated by these\nanalyses, we propose an Advance Spatial Attention (ASA) module to harness SNNs'\nredundancy, which can adaptively optimize their membrane potential distribution\nby a pair of individual spatial attention sub-modules. In this way, noise spike\nfeatures are accurately regulated. Experimental results demonstrate that the\nproposed method can significantly drop the spike firing with better performance\nthan state-of-the-art SNN baselines. Our code is available in\n\\url{https://github.com/BICLab/ASA-SNN}.\n","authors":["Man Yao","Jiakui Hu","Guangshe Zhao","Yaoyuan Wang","Ziyang Zhang","Bo Xu","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2308.08227v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.08224v1","updated":"2023-08-16T08:52:49Z","published":"2023-08-16T08:52:49Z","title":"How To Overcome Confirmation Bias in Semi-Supervised Image\n Classification By Active Learning","summary":" Do we need active learning? The rise of strong deep semi-supervised methods\nraises doubt about the usability of active learning in limited labeled data\nsettings. This is caused by results showing that combining semi-supervised\nlearning (SSL) methods with a random selection for labeling can outperform\nexisting active learning (AL) techniques. However, these results are obtained\nfrom experiments on well-established benchmark datasets that can overestimate\nthe external validity. However, the literature lacks sufficient research on the\nperformance of active semi-supervised learning methods in realistic data\nscenarios, leaving a notable gap in our understanding. Therefore we present\nthree data challenges common in real-world applications: between-class\nimbalance, within-class imbalance, and between-class similarity. These\nchallenges can hurt SSL performance due to confirmation bias. We conduct\nexperiments with SSL and AL on simulated data challenges and find that random\nsampling does not mitigate confirmation bias and, in some cases, leads to worse\nperformance than supervised learning. 
In contrast, we demonstrate that AL can\novercome confirmation bias in SSL in these realistic settings. Our results\nprovide insights into the potential of combining active and semi-supervised\nlearning in the presence of common real-world challenges, which is a promising\ndirection for robust methods when learning with limited labeled data in\nreal-world applications.\n","authors":["Sandra Gilhuber","Rasmus Hvingelby","Mang Ling Ada Fok","Thomas Seidl"],"pdf_url":"https://arxiv.org/pdf/2308.08224v1.pdf","comment":"Accepted @ ECML PKDD 2023. This is the author's version of the work.\n The definitive Version of Record will be published in the Proceedings of ECML\n PKDD 2023"},{"id":"http://arxiv.org/abs/2305.01397v2","updated":"2023-08-16T08:50:47Z","published":"2023-05-02T13:16:04Z","title":"Are demographically invariant models and representations in medical\n imaging fair?","summary":" Medical imaging models have been shown to encode information about patient\ndemographics such as age, race, and sex in their latent representation, raising\nconcerns about their potential for discrimination. Here, we ask whether\nrequiring models not to encode demographic attributes is desirable. We point\nout that marginal and class-conditional representation invariance imply the\nstandard group fairness notions of demographic parity and equalized odds,\nrespectively, while additionally requiring risk distribution matching, thus\npotentially equalizing away important group differences. Enforcing the\ntraditional fairness notions directly instead does not entail these strong\nconstraints. Moreover, representationally invariant models may still take\ndemographic attributes into account for deriving predictions. The latter can be\nprevented using counterfactual notions of (individual) fairness or invariance.\nWe caution, however, that properly defining medical image counterfactuals with\nrespect to demographic attributes is highly challenging. Finally, we posit that\nencoding demographic attributes may even be advantageous if it enables learning\na task-specific encoding of demographic features that does not rely on social\nconstructs such as 'race' and 'gender.' We conclude that demographically\ninvariant representations are neither necessary nor sufficient for fairness in\nmedical imaging. Models may need to encode demographic attributes, lending\nfurther urgency to calls for comprehensive model fairness assessments in terms\nof predictive performance across diverse patient groups.\n","authors":["Eike Petersen","Enzo Ferrante","Melanie Ganz","Aasa Feragen"],"pdf_url":"https://arxiv.org/pdf/2305.01397v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08222v1","updated":"2023-08-16T08:48:04Z","published":"2023-08-16T08:48:04Z","title":"HyperSNN: A new efficient and robust deep learning model for resource\n constrained control applications","summary":" In light of the increasing adoption of edge computing in areas such as\nintelligent furniture, robotics, and smart homes, this paper introduces\nHyperSNN, an innovative method for control tasks that uses spiking neural\nnetworks (SNNs) in combination with hyperdimensional computing. HyperSNN\nsubstitutes expensive 32-bit floating point multiplications with 8-bit integer\nadditions, resulting in reduced energy consumption while enhancing robustness\nand potentially improving accuracy. Our model was tested on AI Gym benchmarks,\nincluding Cartpole, Acrobot, MountainCar, and Lunar Lander. 
HyperSNN achieves\ncontrol accuracies that are on par with conventional machine learning methods\nbut with only 1.36% to 9.96% of the energy expenditure. Furthermore, our\nexperiments showed increased robustness when using HyperSNN. We believe that\nHyperSNN is especially suitable for interactive, mobile, and wearable devices,\npromoting energy-efficient and robust system design. Furthermore, it paves the\nway for the practical implementation of complex algorithms like model\npredictive control (MPC) in real-world industrial scenarios.\n","authors":["Zhanglu Yan","Shida Wang","Kaiwen Tang","Wong-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2308.08222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08203v1","updated":"2023-08-16T08:08:54Z","published":"2023-08-16T08:08:54Z","title":"Epicure: Distilling Sequence Model Predictions into Patterns","summary":" Most machine learning models predict a probability distribution over concrete\noutputs and struggle to accurately predict names over high entropy sequence\ndistributions. Here, we explore finding abstract, high-precision patterns\nintrinsic to these predictions in order to make abstract predictions that\nusefully capture rare sequences. In this short paper, we present Epicure, a\nmethod that distils the predictions of a sequence model, such as the output of\nbeam search, into simple patterns. Epicure maps a model's predictions into a\nlattice that represents increasingly more general patterns that subsume the\nconcrete model predictions.\n On the tasks of predicting a descriptive name of a function given the source\ncode of its body and detecting anomalous names given a function, we show that\nEpicure yields accurate naming patterns that match the ground truth more often\ncompared to just the highest probability model prediction. For a false alarm\nrate of 10%, Epicure predicts patterns that match 61% more ground-truth names\ncompared to the best model prediction, making Epicure well-suited for scenarios\nthat require high precision.\n","authors":["Miltiadis Allamanis","Earl T. Barr"],"pdf_url":"https://arxiv.org/pdf/2308.08203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07896v2","updated":"2023-08-16T08:00:58Z","published":"2023-08-15T17:37:44Z","title":"SciRE-Solver: Efficient Sampling of Diffusion Probabilistic Models by\n Score-integrand Solver with Recursive Derivative Estimation","summary":" Diffusion probabilistic models (DPMs) are a powerful class of generative\nmodels known for their ability to generate high-fidelity image samples. A major\nchallenge in the implementation of DPMs is the slow sampling process. In this\nwork, we bring a high-efficiency sampler for DPMs. Specifically, we propose a\nscore-based exact solution paradigm for the diffusion ODEs corresponding to the\nsampling process of DPMs, which introduces a new perspective on developing\nnumerical algorithms for solving diffusion ODEs. To achieve an efficient\nsampler, we propose a recursive derivative estimation (RDE) method to reduce\nthe estimation error. With our proposed solution paradigm and RDE method, we\npropose the score-integrand solver with the convergence order guarantee as\nefficient solver (SciRE-Solver) for solving diffusion ODEs. The SciRE-Solver\nattains state-of-the-art (SOTA) sampling performance with a limited number of\nscore function evaluations (NFE) on both discrete-time and continuous-time DPMs\nin comparison to existing training-free sampling algorithms. 
Such as, we\nachieve $3.48$ FID with $12$ NFE and $2.42$ FID with $20$ NFE for\ncontinuous-time DPMs on CIFAR10, respectively. Different from other samplers,\nSciRE-Solver has the promising potential to surpass the FIDs achieved in the\noriginal papers of some pre-trained models with a small NFEs. For example, we\nreach SOTA value of $2.40$ FID with $100$ NFE for continuous-time DPM and of\n$3.15$ FID with $84$ NFE for discrete-time DPM on CIFAR-10, as well as of\n$2.17$ ($2.02$) FID with $18$ ($50$) NFE for discrete-time DPM on CelebA\n64$\\times$64.\n","authors":["Shigui Li","Wei Chen","Delu Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.07896v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08198v1","updated":"2023-08-16T07:58:02Z","published":"2023-08-16T07:58:02Z","title":"DeSCo: Towards Generalizable and Scalable Deep Subgraph Counting","summary":" Subgraph counting is the problem of counting the occurrences of a given query\ngraph in a large target graph. Large-scale subgraph counting is useful in\nvarious domains, such as motif counting for social network analysis and loop\ncounting for money laundering detection on transaction networks. Recently, to\naddress the exponential runtime complexity of scalable subgraph counting,\nneural methods are proposed. However, existing neural counting approaches fall\nshort in three aspects. Firstly, the counts of the same query can vary from\nzero to millions on different target graphs, posing a much larger challenge\nthan most graph regression tasks. Secondly, current scalable graph neural\nnetworks have limited expressive power and fail to efficiently distinguish\ngraphs in count prediction. Furthermore, existing neural approaches cannot\npredict the occurrence position of queries in the target graph.\n Here we design DeSCo, a scalable neural deep subgraph counting pipeline,\nwhich aims to accurately predict the query count and occurrence position on any\ntarget graph after one-time training. Firstly, DeSCo uses a novel canonical\npartition and divides the large target graph into small neighborhood graphs.\nThe technique greatly reduces the count variation while guaranteeing no missing\nor double-counting. Secondly, neighborhood counting uses an expressive\nsubgraph-based heterogeneous graph neural network to accurately perform\ncounting in each neighborhood. Finally, gossip propagation propagates\nneighborhood counts with learnable gates to harness the inductive biases of\nmotif counts. DeSCo is evaluated on eight real-world datasets from various\ndomains. It outperforms state-of-the-art neural methods with 137x improvement\nin the mean squared error of count prediction, while maintaining the polynomial\nruntime complexity.\n","authors":["Tianyu Fu","Chiyue Wei","Yu Wang","Rex Ying"],"pdf_url":"https://arxiv.org/pdf/2308.08198v1.pdf","comment":"8 pages main text, 10 pages appendix"},{"id":"http://arxiv.org/abs/2308.08187v1","updated":"2023-08-16T07:36:58Z","published":"2023-08-16T07:36:58Z","title":"Endogenous Macrodynamics in Algorithmic Recourse","summary":" Existing work on Counterfactual Explanations (CE) and Algorithmic Recourse\n(AR) has largely focused on single individuals in a static environment: given\nsome estimated model, the goal is to find valid counterfactuals for an\nindividual instance that fulfill various desiderata. The ability of such\ncounterfactuals to handle dynamics like data and model drift remains a largely\nunexplored research challenge. 
There has also been surprisingly little work on\nthe related question of how the actual implementation of recourse by one\nindividual may affect other individuals. Through this work, we aim to close\nthat gap. We first show that many of the existing methodologies can be\ncollectively described by a generalized framework. We then argue that the\nexisting framework does not account for a hidden external cost of recourse,\nthat only reveals itself when studying the endogenous dynamics of recourse at\nthe group level. Through simulation experiments involving various state-of\nthe-art counterfactual generators and several benchmark datasets, we generate\nlarge numbers of counterfactuals and study the resulting domain and model\nshifts. We find that the induced shifts are substantial enough to likely impede\nthe applicability of Algorithmic Recourse in some situations. Fortunately, we\nfind various strategies to mitigate these concerns. Our simulation framework\nfor studying recourse dynamics is fast and opensourced.\n","authors":["Patrick Altmeyer","Giovan Angela","Aleksander Buszydlik","Karol Dobiczek","Arie van Deursen","Cynthia C. S. Liem"],"pdf_url":"https://arxiv.org/pdf/2308.08187v1.pdf","comment":"12 pages, 11 figures. Originally published at the 2023 IEEE\n Conference on Secure and Trustworthy Machine Learning (SaTML). IEEE holds the\n copyright"},{"id":"http://arxiv.org/abs/2210.05740v2","updated":"2023-08-16T07:06:58Z","published":"2022-10-11T19:11:19Z","title":"Stochastic Constrained DRO with a Complexity Independent of Sample Size","summary":" Distributionally Robust Optimization (DRO), as a popular method to train\nrobust models against distribution shift between training and test sets, has\nreceived tremendous attention in recent years. In this paper, we propose and\nanalyze stochastic algorithms that apply to both non-convex and convex losses\nfor solving Kullback Leibler divergence constrained DRO problem. Compared with\nexisting methods solving this problem, our stochastic algorithms not only enjoy\ncompetitive if not better complexity independent of sample size but also just\nrequire a constant batch size at every iteration, which is more practical for\nbroad applications. We establish a nearly optimal complexity bound for finding\nan $\\epsilon$ stationary solution for non-convex losses and an optimal\ncomplexity for finding an $\\epsilon$ optimal solution for convex losses.\nEmpirical studies demonstrate the effectiveness of the proposed algorithms for\nsolving non-convex and convex constrained DRO problems.\n","authors":["Qi Qi","Jiameng Lyu","Kung sik Chan","Er Wei Bai","Tianbao Yang"],"pdf_url":"https://arxiv.org/pdf/2210.05740v2.pdf","comment":"37 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.08174v1","updated":"2023-08-16T07:05:47Z","published":"2023-08-16T07:05:47Z","title":"Accelerating Generic Graph Neural Networks via Architecture, Compiler,\n Partition Method Co-Design","summary":" Graph neural networks (GNNs) have shown significant accuracy improvements in\na variety of graph learning domains, sparking considerable research interest.\nTo translate these accuracy improvements into practical applications, it is\nessential to develop high-performance and efficient hardware acceleration for\nGNN models. However, designing GNN accelerators faces two fundamental\nchallenges: the high bandwidth requirement of GNN models and the diversity of\nGNN models. 
Previous works have addressed the first challenge by using more\nexpensive memory interfaces to achieve higher bandwidth. For the second\nchallenge, existing works either support specific GNN models or have generic\ndesigns with poor hardware utilization.\n In this work, we tackle both challenges simultaneously. First, we identify a\nnew type of partition-level operator fusion, which we utilize to internally\nreduce the high bandwidth requirement of GNNs. Next, we introduce\npartition-level multi-threading to schedule the concurrent processing of graph\npartitions, utilizing different hardware resources. To further reduce the extra\non-chip memory required by multi-threading, we propose fine-grained graph\npartitioning to generate denser graph partitions. Importantly, these three\nmethods make no assumptions about the targeted GNN models, addressing the\nchallenge of model variety. We implement these methods in a framework called\nSwitchBlade, consisting of a compiler, a graph partitioner, and a hardware\naccelerator. Our evaluation demonstrates that SwitchBlade achieves an average\nspeedup of $1.85\\times$ and energy savings of $19.03\\times$ compared to the\nNVIDIA V100 GPU. Additionally, SwitchBlade delivers performance comparable to\nstate-of-the-art specialized accelerators.\n","authors":["Shuwen Lu","Zhihui Zhang","Cong Guo","Jingwen Leng","Yangjie Zhou","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2308.08174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08173v1","updated":"2023-08-16T07:05:41Z","published":"2023-08-16T07:05:41Z","title":"Expressivity of Graph Neural Networks Through the Lens of Adversarial\n Robustness","summary":" We perform the first adversarial robustness study into Graph Neural Networks\n(GNNs) that are provably more powerful than traditional Message Passing Neural\nNetworks (MPNNs). In particular, we use adversarial robustness as a tool to\nuncover a significant gap between their theoretically possible and empirically\nachieved expressive power. To do so, we focus on the ability of GNNs to count\nspecific subgraph patterns, which is an established measure of expressivity,\nand extend the concept of adversarial robustness to this task. Based on this,\nwe develop efficient adversarial attacks for subgraph counting and show that\nmore powerful GNNs fail to generalize even to small perturbations to the\ngraph's structure. Expanding on this, we show that such architectures also fail\nto count substructures on out-of-distribution graphs.\n","authors":["Francesco Campi","Lukas Gosch","Tom Wollschläger","Yan Scholten","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2308.08173v1.pdf","comment":"Published in ${2}^{nd}$ AdvML Frontiers workshop at ${40}^{th}$\n International Conference on Machine Learning"},{"id":"http://arxiv.org/abs/2308.08172v1","updated":"2023-08-16T07:02:02Z","published":"2023-08-16T07:02:02Z","title":"AATCT-IDS: A Benchmark Abdominal Adipose Tissue CT Image Dataset for\n Image Denoising, Semantic Segmentation, and Radiomics Evaluation","summary":" Methods: In this study, a benchmark \\emph{Abdominal Adipose Tissue CT Image\nDataset} (AATTCT-IDS) containing 300 subjects is prepared and published.\nAATTCT-IDS publics 13,732 raw CT slices, and the researchers individually\nannotate the subcutaneous and visceral adipose tissue regions of 3,213 of those\nslices that have the same slice distance to validate denoising methods, train\nsemantic segmentation models, and study radiomics. 
For different tasks, this\npaper compares and analyzes the performance of various methods on AATTCT-IDS by\ncombining the visualization results and evaluation data. Thus, verify the\nresearch potential of this data set in the above three types of tasks.\n Results: In the comparative study of image denoising, algorithms using a\nsmoothing strategy suppress mixed noise at the expense of image details and\nobtain better evaluation data. Methods such as BM3D preserve the original image\nstructure better, although the evaluation data are slightly lower. The results\nshow significant differences among them. In the comparative study of semantic\nsegmentation of abdominal adipose tissue, the segmentation results of adipose\ntissue by each model show different structural characteristics. Among them,\nBiSeNet obtains segmentation results only slightly inferior to U-Net with the\nshortest training time and effectively separates small and isolated adipose\ntissue. In addition, the radiomics study based on AATTCT-IDS reveals three\nadipose distributions in the subject population.\n Conclusion: AATTCT-IDS contains the ground truth of adipose tissue regions in\nabdominal CT slices. This open-source dataset can attract researchers to\nexplore the multi-dimensional characteristics of abdominal adipose tissue and\nthus help physicians and patients in clinical practice. AATCT-IDS is freely\npublished for non-commercial purpose at:\n\\url{https://figshare.com/articles/dataset/AATTCT-IDS/23807256}.\n","authors":["Zhiyu Ma","Chen Li","Tianming Du","Le Zhang","Dechao Tang","Deguo Ma","Shanchuan Huang","Yan Liu","Yihao Sun","Zhihao Chen","Jin Yuan","Qianqing Nie","Marcin Grzegorzek","Hongzan Sun"],"pdf_url":"https://arxiv.org/pdf/2308.08172v1.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.08167v1","updated":"2023-08-16T06:46:37Z","published":"2023-08-16T06:46:37Z","title":"A Quantum Approximation Scheme for k-Means","summary":" We give a quantum approximation scheme (i.e., $(1 +\n\\varepsilon)$-approximation for every $\\varepsilon > 0$) for the classical\n$k$-means clustering problem in the QRAM model with a running time that has\nonly polylogarithmic dependence on the number of data points. More\nspecifically, given a dataset $V$ with $N$ points in $\\mathbb{R}^d$ stored in\nQRAM data structure, our quantum algorithm runs in time $\\tilde{O} \\left(\n2^{\\tilde{O}(\\frac{k}{\\varepsilon})} \\eta^2 d\\right)$ and with high probability\noutputs a set $C$ of $k$ centers such that $cost(V, C) \\leq (1+\\varepsilon)\n\\cdot cost(V, C_{OPT})$. Here $C_{OPT}$ denotes the optimal $k$-centers,\n$cost(.)$ denotes the standard $k$-means cost function (i.e., the sum of the\nsquared distance of points to the closest center), and $\\eta$ is the aspect\nratio (i.e., the ratio of maximum distance to minimum distance). This is the\nfirst quantum algorithm with a polylogarithmic running time that gives a\nprovable approximation guarantee of $(1+\\varepsilon)$ for the $k$-means\nproblem. 
Also, unlike previous works on unsupervised learning, our quantum\nalgorithm does not require quantum linear algebra subroutines and has a running\ntime independent of parameters (e.g., condition number) that appear in such\nprocedures.\n","authors":["Ragesh Jaiswal"],"pdf_url":"https://arxiv.org/pdf/2308.08167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08163v1","updated":"2023-08-16T06:11:27Z","published":"2023-08-16T06:11:27Z","title":"Characteristics of networks generated by kernel growing neural gas","summary":" This research aims to develop kernel GNG, a kernelized version of the growing\nneural gas (GNG) algorithm, and to investigate the features of the networks\ngenerated by the kernel GNG. The GNG is an unsupervised artificial neural\nnetwork that can transform a dataset into an undirected graph, thereby\nextracting the features of the dataset as a graph. The GNG is widely used in\nvector quantization, clustering, and 3D graphics. Kernel methods are often used\nto map a dataset to feature space, with support vector machines being the most\nprominent application. This paper introduces the kernel GNG approach and\nexplores the characteristics of the networks generated by kernel GNG. Five\nkernels, including Gaussian, Laplacian, Cauchy, inverse multiquadric, and log\nkernels, are used in this study.\n","authors":["Kazuhisa Fujita"],"pdf_url":"https://arxiv.org/pdf/2308.08163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08162v1","updated":"2023-08-16T06:09:51Z","published":"2023-08-16T06:09:51Z","title":"Interpretability Benchmark for Evaluating Spatial Misalignment of\n Prototypical Parts Explanations","summary":" Prototypical parts-based networks are becoming increasingly popular due to\ntheir faithful self-explanations. However, their similarity maps are calculated\nin the penultimate network layer. Therefore, the receptive field of the\nprototype activation region often depends on parts of the image outside this\nregion, which can lead to misleading interpretations. We name this undesired\nbehavior a spatial explanation misalignment and introduce an interpretability\nbenchmark with a set of dedicated metrics for quantifying this phenomenon. In\naddition, we propose a method for misalignment compensation and apply it to\nexisting state-of-the-art models. We show the expressiveness of our benchmark\nand the effectiveness of the proposed compensation methodology through\nextensive empirical studies.\n","authors":["Mikołaj Sacha","Bartosz Jura","Dawid Rymarczyk","Łukasz Struski","Jacek Tabor","Bartosz Zieliński"],"pdf_url":"https://arxiv.org/pdf/2308.08162v1.pdf","comment":"Under review. Code will be release upon acceptance"},{"id":"http://arxiv.org/abs/2308.08160v1","updated":"2023-08-16T06:06:56Z","published":"2023-08-16T06:06:56Z","title":"Benchmarking Adversarial Robustness of Compressed Deep Learning Models","summary":" The increasing size of Deep Neural Networks (DNNs) poses a pressing need for\nmodel compression, particularly when employed on resource constrained devices.\nConcurrently, the susceptibility of DNNs to adversarial attacks presents\nanother significant hurdle. Despite substantial research on both model\ncompression and adversarial robustness, their joint examination remains\nunderexplored. Our study bridges this gap, seeking to understand the effect of\nadversarial inputs crafted for base models on their pruned versions. 
To examine\nthis relationship, we have developed a comprehensive benchmark across diverse\nadversarial attacks and popular DNN models. We uniquely focus on models not\npreviously exposed to adversarial training and apply pruning schemes optimized\nfor accuracy and performance. Our findings reveal that while the benefits of\npruning (enhanced generalizability, compression, and faster inference times) are\npreserved, adversarial robustness remains comparable to that of the base model. This\nsuggests that model compression, while offering its unique advantages, does not\nundermine adversarial robustness.\n","authors":["Brijesh Vora","Kartik Patwari","Syed Mahbub Hafiz","Zubair Shafiq","Chen-Nee Chuah"],"pdf_url":"https://arxiv.org/pdf/2308.08160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08158v1","updated":"2023-08-16T06:01:12Z","published":"2023-08-16T06:01:12Z","title":"Deep Generative Imputation Model for Missing Not At Random Data","summary":" Data analysis usually suffers from the Missing Not At Random (MNAR) problem,\nwhere the cause of the missing values is not fully observed. Compared to the\nnaive Missing Completely At Random (MCAR) problem, it is more in line with\nrealistic scenarios, while being more complex and challenging. Existing statistical\nmethods model the MNAR mechanism by different decompositions of the joint\ndistribution of the complete data and the missing mask. But we empirically find\nthat directly incorporating these statistical methods into deep generative\nmodels is sub-optimal. Specifically, it would neglect the confidence of the\nreconstructed mask during the MNAR imputation process, which leads to\ninsufficient information extraction and less-guaranteed imputation quality. In\nthis paper, we revisit the MNAR problem from a novel perspective that the\ncomplete data and missing mask are two modalities of incomplete data on an\nequal footing. Along with this line, we put forward a generative-model-specific\njoint probability decomposition method, the conjunction model, to represent the\ndistributions of two modalities in parallel and extract sufficient information\nfrom both the complete data and the missing mask. Taking a step further, we exploit a\ndeep generative imputation model, namely GNR, to process the real-world missing\nmechanism in the latent space and concurrently impute the incomplete data and\nreconstruct the missing mask. The experimental results show that our GNR\nsurpasses state-of-the-art MNAR baselines by significant margins (average\nimprovements in RMSE from 9.9% to 18.8%) and always gives better mask\nreconstruction accuracy, which makes the imputation more principled.\n","authors":["Jialei Chen","Yuanbo Xu","Pengyang Wang","Yongjian Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08156v1","updated":"2023-08-16T05:58:12Z","published":"2023-08-16T05:58:12Z","title":"Sarcasm Detection in a Disaster Context","summary":" During natural disasters, people often use social media platforms such as\nTwitter to ask for help, to provide information about the disaster situation,\nor to express contempt about the unfolding event or public policies and\nguidelines. This contempt is in some cases expressed as sarcasm or irony.\nUnderstanding this form of speech in a disaster-centric context is essential to\nimproving natural language understanding of disaster-related tweets. 
In this\npaper, we introduce HurricaneSARC, a dataset of 15,000 tweets annotated for\nintended sarcasm, and provide a comprehensive investigation of sarcasm\ndetection using pre-trained language models. Our best model is able to obtain\nas much as 0.70 F1 on our dataset. We also demonstrate that the performance on\nHurricaneSARC can be improved by leveraging intermediate task transfer\nlearning. We release our data and code at\nhttps://github.com/tsosea2/HurricaneSarc.\n","authors":["Tiberiu Sosea","Junyi Jessy Li","Cornelia Caragea"],"pdf_url":"https://arxiv.org/pdf/2308.08156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.11434v3","updated":"2023-08-16T05:39:43Z","published":"2022-03-22T03:13:39Z","title":"Non-linear Embeddings in Hilbert Simplex Geometry","summary":" A key technique of machine learning and computer vision is to embed discrete\nweighted graphs into continuous spaces for further downstream processing.\nEmbedding discrete hierarchical structures in hyperbolic geometry has proven\nvery successful since it was shown that any weighted tree can be embedded in\nthat geometry with arbitrary low distortion. Various optimization methods for\nhyperbolic embeddings based on common models of hyperbolic geometry have been\nstudied. In this paper, we consider Hilbert geometry for the standard simplex\nwhich is isometric to a vector space equipped with the variation polytope norm.\nWe study the representation power of this Hilbert simplex geometry by embedding\ndistance matrices of graphs. Our findings demonstrate that Hilbert simplex\ngeometry is competitive to alternative geometries such as the Poincar\\'e\nhyperbolic ball or the Euclidean geometry for embedding tasks while being fast\nand numerically robust.\n","authors":["Frank Nielsen","Ke Sun"],"pdf_url":"https://arxiv.org/pdf/2203.11434v3.pdf","comment":"21 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.07687v2","updated":"2023-08-16T05:24:46Z","published":"2023-08-15T10:37:04Z","title":"DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using\n Pre-trained Diffusion Models","summary":" Given a classifier, the inherent property of semantic Out-of-Distribution\n(OOD) samples is that their contents differ from all legal classes in terms of\nsemantics, namely semantic mismatch. There is a recent work that directly\napplies it to OOD detection, which employs a conditional Generative Adversarial\nNetwork (cGAN) to enlarge semantic mismatch in the image space. While achieving\nremarkable OOD detection performance on small datasets, it is not applicable to\nImageNet-scale datasets due to the difficulty in training cGANs with both input\nimages and labels as conditions. As diffusion models are much easier to train\nand amenable to various conditions compared to cGANs, in this work, we propose\nto directly use pre-trained diffusion models for semantic mismatch-guided OOD\ndetection, named DiffGuard. Specifically, given an OOD input image and the\npredicted label from the classifier, we try to enlarge the semantic difference\nbetween the reconstructed OOD image under these conditions and the original\ninput image. We also present several test-time techniques to further strengthen\nsuch differences. 
Experimental results show that DiffGuard is effective on both\nCifar-10 and hard cases of the large-scale ImageNet, and it can be easily\ncombined with existing OOD detection techniques to achieve state-of-the-art OOD\ndetection results.\n","authors":["Ruiyuan Gao","Chenchen Zhao","Lanqing Hong","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.07687v2.pdf","comment":"Accepted by ICCV2023, with supplementary materials"},{"id":"http://arxiv.org/abs/2210.14184v2","updated":"2023-08-16T05:08:00Z","published":"2022-10-25T17:22:31Z","title":"Learning Ability of Interpolating Deep Convolutional Neural Networks","summary":" It is frequently observed that overparameterized neural networks generalize\nwell. Regarding such phenomena, existing theoretical work mainly devotes to\nlinear settings or fully-connected neural networks. This paper studies the\nlearning ability of an important family of deep neural networks, deep\nconvolutional neural networks (DCNNs), under both underparameterized and\noverparameterized settings. We establish the first learning rates of\nunderparameterized DCNNs without parameter or function variable structure\nrestrictions presented in the literature. We also show that by adding\nwell-defined layers to a non-interpolating DCNN, we can obtain some\ninterpolating DCNNs that maintain the good learning rates of the\nnon-interpolating DCNN. This result is achieved by a novel network deepening\nscheme designed for DCNNs. Our work provides theoretical verification of how\noverfitted DCNNs generalize well.\n","authors":["Tian-Yi Zhou","Xiaoming Huo"],"pdf_url":"https://arxiv.org/pdf/2210.14184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08148v1","updated":"2023-08-16T05:01:33Z","published":"2023-08-16T05:01:33Z","title":"Hierarchical Topological Ordering with Conditional Independence Test for\n Limited Time Series","summary":" Learning directed acyclic graphs (DAGs) to identify causal relations\nunderlying observational data is crucial but also poses significant challenges.\nRecently, topology-based methods have emerged as a two-step approach to\ndiscovering DAGs by first learning the topological ordering of variables and\nthen eliminating redundant edges, while ensuring that the graph remains\nacyclic. However, one limitation is that these methods would generate numerous\nspurious edges that require subsequent pruning. To overcome this limitation, in\nthis paper, we propose an improvement to topology-based methods by introducing\nlimited time series data, consisting of only two cross-sectional records that\nneed not be adjacent in time and are subject to flexible timing. By\nincorporating conditional instrumental variables as exogenous interventions, we\naim to identify descendant nodes for each variable. Following this line, we\npropose a hierarchical topological ordering algorithm with conditional\nindependence test (HT-CIT), which enables the efficient learning of sparse DAGs\nwith a smaller search space compared to other popular approaches. The HT-CIT\nalgorithm greatly reduces the number of edges that need to be pruned. 
Empirical\nresults from synthetic and real-world datasets demonstrate the superiority of\nthe proposed HT-CIT algorithm.\n","authors":["Anpeng Wu","Haoxuan Li","Kun Kuang","Keli Zhang","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2308.08148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08503v3","updated":"2023-08-16T04:59:32Z","published":"2023-04-17T06:48:07Z","title":"A Scalable Test Problem Generator for Sequential Transfer Optimization","summary":" Sequential transfer optimization (STO), which aims to improve the\noptimization performance on a task at hand by exploiting the knowledge captured\nfrom several previously-solved optimization tasks stored in a database, has\nbeen gaining increasing research attention over the years. However, despite\nremarkable advances in algorithm design, the development of a systematic\nbenchmark suite for comprehensive comparisons of STO algorithms received far\nless attention. Existing test problems are either simply generated by\nassembling other benchmark functions or extended from specific practical\nproblems with limited variations. The relationships between the optimal\nsolutions of the source and target tasks in these problems are always manually\nconfigured, limiting their ability to model different relationships presented\nin real-world problems. Consequently, the good performance achieved by an\nalgorithm on these problems might be biased and could not be generalized to\nother problems. In light of the above, in this study, we first introduce four\nrudimentary concepts for characterizing STO problems (STOPs) and present an\nimportant problem feature, namely similarity distribution, which quantitatively\ndelineates the relationship between the optima of the source and target tasks.\nThen, we propose the general design guidelines and a problem generator with\nsuperior scalability. Specifically, the similarity distribution of an STOP can\nbe easily customized, enabling a continuous spectrum of representation of the\ndiverse similarity relationships of real-world problems. Lastly, a benchmark\nsuite with 12 STOPs featured by a variety of customized similarity\nrelationships is developed using the proposed generator, which would serve as\nan arena for STO algorithms and provide more comprehensive evaluation results.\nThe source code of the problem generator is available at\nhttps://github.com/XmingHsueh/STOP-G.\n","authors":["Xiaoming Xue","Cuie Yang","Liang Feng","Kai Zhang","Linqi Song","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2304.08503v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15891v2","updated":"2023-08-16T04:53:15Z","published":"2023-06-28T03:16:45Z","title":"Capturing the Diffusive Behavior of the Multiscale Linear Transport\n Equations by Asymptotic-Preserving Convolutional DeepONets","summary":" In this paper, we introduce two types of novel Asymptotic-Preserving\nConvolutional Deep Operator Networks (APCONs) designed to address the\nmultiscale time-dependent linear transport problem. We observe that the vanilla\nphysics-informed DeepONets with modified MLP may exhibit instability in\nmaintaining the desired limiting macroscopic behavior. Therefore, this\nnecessitates the utilization of an asymptotic-preserving loss function. 
Drawing\ninspiration from the heat kernel in the diffusion equation, we propose a new\narchitecture called Convolutional Deep Operator Networks, which employ multiple\nlocal convolution operations instead of a global heat kernel, along with\npooling and activation operations in each filter layer. Our APCON methods\npossess a parameter count that is independent of the grid size and are capable\nof capturing the diffusive behavior of the linear transport problem. Finally,\nwe validate the effectiveness of our methods through several numerical\nexamples.\n","authors":["Keke Wu","Xiong-bin Yan","Shi Jin","Zheng Ma"],"pdf_url":"https://arxiv.org/pdf/2306.15891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05890v2","updated":"2023-08-16T04:15:14Z","published":"2023-05-10T04:20:36Z","title":"CUTS+: High-dimensional Causal Discovery from Irregular Time-series","summary":" Causal discovery in time-series is a fundamental problem in the machine\nlearning community, enabling causal reasoning and decision-making in complex\nscenarios. Recently, researchers successfully discover causality by combining\nneural networks with Granger causality, but their performances degrade largely\nwhen encountering high-dimensional data because of the highly redundant network\ndesign and huge causal graphs. Moreover, the missing entries in the\nobservations further hamper the causal structural learning. To overcome these\nlimitations, We propose CUTS+, which is built on the Granger-causality-based\ncausal discovery method CUTS and raises the scalability by introducing a\ntechnique called Coarse-to-fine-discovery (C2FD) and leveraging a\nmessage-passing-based graph neural network (MPGNN). Compared to previous\nmethods on simulated, quasi-real, and real datasets, we show that CUTS+ largely\nimproves the causal discovery performance on high-dimensional data with\ndifferent types of irregular sampling.\n","authors":["Yuxiao Cheng","Lianglong Li","Tingxiong Xiao","Zongren Li","Qin Zhong","Jinli Suo","Kunlun He"],"pdf_url":"https://arxiv.org/pdf/2305.05890v2.pdf","comment":"Submit to AAAI-24"},{"id":"http://arxiv.org/abs/2308.08138v1","updated":"2023-08-16T04:05:22Z","published":"2023-08-16T04:05:22Z","title":"Online Control for Linear Dynamics: A Data-Driven Approach","summary":" This paper considers an online control problem over a linear time-invariant\nsystem with unknown dynamics, bounded disturbance, and adversarial cost. We\npropose a data-driven strategy to reduce the regret of the controller. Unlike\nmodel-based methods, our algorithm does not identify the system model, instead,\nit leverages a single noise-free trajectory to calculate the accumulation of\ndisturbance and makes decisions using the accumulated disturbance action\ncontroller we design, whose parameters are updated by online gradient descent.\nWe prove that the regret of our algorithm is $\\mathcal{O}(\\sqrt{T})$ under mild\nassumptions, suggesting that its performance is on par with model-based\nmethods.\n","authors":["Zishun Liu","Yongxin Chen"],"pdf_url":"https://arxiv.org/pdf/2308.08138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09043v2","updated":"2023-08-16T04:01:14Z","published":"2022-10-14T01:51:33Z","title":"ST-former for short-term passenger flow prediction during COVID-19 in\n urban rail transit system","summary":" Accurate passenger flow prediction of urban rail transit is essential for\nimproving the performance of intelligent transportation systems, especially\nduring the epidemic. 
How to dynamically model the complex spatiotemporal\ndependencies of passenger flow is the main issue in achieving accurate\npassenger flow prediction during the epidemic. To solve this issue, this paper\nproposes a brand-new transformer-based architecture called STformer under the\nencoder-decoder framework specifically for COVID-19. Concretely, we develop a\nmodified self-attention mechanism named Causal-Convolution ProbSparse\nSelf-Attention (CPSA) to model the multiple temporal dependencies of passenger\nflow with low computational costs. To capture the complex and dynamic spatial\ndependencies, we introduce a novel Adaptive Multi-Graph Convolution Network\n(AMGCN) by leveraging multiple graphs in a self-adaptive manner. Additionally,\nthe Multi-source Data Fusion block fuses the passenger flow data, COVID-19\nconfirmed case data, and the relevant social media data to study the impact of\nCOVID-19 to passenger flow. Experiments on real-world passenger flow datasets\ndemonstrate the superiority of ST-former over the other eleven state-of-the-art\nmethods. Several ablation studies are carried out to verify the effectiveness\nand reliability of our model structure. Results can provide critical insights\nfor the operation of URT systems.\n","authors":["Shuxin Zhang","Jinlei Zhang","Lixing Yang","Chengcheng Wang","Ziyou Gao"],"pdf_url":"https://arxiv.org/pdf/2210.09043v2.pdf","comment":"There are some errors that might mislead readers for this version.\n There is no new version right now"},{"id":"http://arxiv.org/abs/2203.00007v4","updated":"2023-08-16T04:00:48Z","published":"2022-02-27T01:06:24Z","title":"Spatial-Temporal Attention Fusion Network for short-term passenger flow\n prediction on holidays in urban rail transit systems","summary":" The short term passenger flow prediction of the urban rail transit system is\nof great significance for traffic operation and management. The emerging deep\nlearning-based models provide effective methods to improve prediction accuracy.\nHowever, most of the existing models mainly predict the passenger flow on\ngeneral weekdays or weekends. There are only few studies focusing on predicting\nthe passenger flow on holidays, which is a significantly challenging task for\ntraffic management because of its suddenness and irregularity. To this end, we\npropose a deep learning-based model named Spatial Temporal Attention Fusion\nNetwork comprising a novel Multi-Graph Attention Network, a Conv-Attention\nBlock, and Feature Fusion Block for short-term passenger flow prediction on\nholidays. The multi-graph attention network is applied to extract the complex\nspatial dependencies of passenger flow dynamically and the conv-attention block\nis applied to extract the temporal dependencies of passenger flow from global\nand local perspectives. Moreover, in addition to the historical passenger flow\ndata, the social media data, which has been proven that they can effectively\nreflect the evolution trend of passenger flow under events, are also fused into\nthe feature fusion block of STAFN. The STAFN is tested on two large-scale urban\nrail transit AFC datasets from China on the New Year holiday, and the\nprediction performance of the model are compared with that of several\nconventional prediction models. 
Results demonstrate its better robustness and\nadvantages among benchmark methods, which can provide overwhelming support for\npractical applications of short term passenger flow prediction on holidays.\n","authors":["Shuxin Zhang","Jinlei Zhang","Lixing Yang","Jiateng Yin","Ziyou Gao"],"pdf_url":"https://arxiv.org/pdf/2203.00007v4.pdf","comment":"There are some errors that might mislead readers for this version.\n There is no new version right now"},{"id":"http://arxiv.org/abs/2202.06727v3","updated":"2023-08-16T04:00:33Z","published":"2022-02-10T13:18:11Z","title":"STG-GAN: A spatiotemporal graph generative adversarial networks for\n short-term passenger flow prediction in urban rail transit systems","summary":" Short-term passenger flow prediction is an important but challenging task for\nbetter managing urban rail transit (URT) systems. Some emerging deep learning\nmodels provide good insights to improve short-term prediction accuracy.\nHowever, there exist many complex spatiotemporal dependencies in URT systems.\nMost previous methods only consider the absolute error between ground truth and\npredictions as the optimization objective, which fails to account for spatial\nand temporal constraints on the predictions. Furthermore, a large number of\nexisting prediction models introduce complex neural network layers to improve\naccuracy while ignoring their training efficiency and memory occupancy,\ndecreasing the chances to be applied to the real world. To overcome these\nlimitations, we propose a novel deep learning-based spatiotemporal graph\ngenerative adversarial network (STG-GAN) model with higher prediction accuracy,\nhigher efficiency, and lower memory occupancy to predict short-term passenger\nflows of the URT network. Our model consists of two major parts, which are\noptimized in an adversarial learning manner: (1) a generator network including\ngated temporal conventional networks (TCN) and weight sharing graph convolution\nnetworks (GCN) to capture structural spatiotemporal dependencies and generate\npredictions with a relatively small computational burden; (2) a discriminator\nnetwork including a spatial discriminator and a temporal discriminator to\nenhance the spatial and temporal constraints of the predictions. The STG-GAN is\nevaluated on two large-scale real-world datasets from Beijing Subway. A\ncomparison with those of several state-of-the-art models illustrates its\nsuperiority and robustness. This study can provide critical experience in\nconducting short-term passenger flow predictions, especially from the\nperspective of real-world applications.\n","authors":["Jinlei Zhang","Hua Li","Lixing Yang","Guangyin Jin","Jianguo Qi","Ziyou Gao"],"pdf_url":"https://arxiv.org/pdf/2202.06727v3.pdf","comment":"There are some errors that might mislead readers for this version.\n There is no new version right now"},{"id":"http://arxiv.org/abs/2308.08135v1","updated":"2023-08-16T03:56:58Z","published":"2023-08-16T03:56:58Z","title":"Microstructure-Empowered Stock Factor Extraction and Utilization","summary":" High-frequency quantitative investment is a crucial aspect of stock\ninvestment. Notably, order flow data plays a critical role as it provides the\nmost detailed level of information among high-frequency trading data, including\ncomprehensive data from the order book and transaction records at the tick\nlevel. The order flow data is extremely valuable for market analysis as it\nequips traders with essential insights for making informed decisions. 
However,\nextracting and effectively utilizing order flow data present challenges due to\nthe large volume of data involved and the limitations of traditional factor\nmining techniques, which are primarily designed for coarser-level stock data.\nTo address these challenges, we propose a novel framework that aims to\neffectively extract essential factors from order flow data for diverse\ndownstream tasks across different granularities and scenarios. Our method\nconsists of a Context Encoder and an Factor Extractor. The Context Encoder\nlearns an embedding for the current order flow data segment's context by\nconsidering both the expected and actual market state. In addition, the Factor\nExtractor uses unsupervised learning methods to select such important signals\nthat are most distinct from the majority within the given context. The\nextracted factors are then utilized for downstream tasks. In empirical studies,\nour proposed framework efficiently handles an entire year of stock order flow\ndata across diverse scenarios, offering a broader range of applications\ncompared to existing tick-level approaches that are limited to only a few days\nof stock data. We demonstrate that our method extracts superior factors from\norder flow data, enabling significant improvement for stock trend prediction\nand order execution tasks at the second and minute level.\n","authors":["Xianfeng Jiao","Zizhong Li","Chang Xu","Yang Liu","Weiqing Liu","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2308.08135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08129v1","updated":"2023-08-16T03:38:43Z","published":"2023-08-16T03:38:43Z","title":"Is Self-Supervised Pretraining Good for Extrapolation in Molecular\n Property Prediction?","summary":" The prediction of material properties plays a crucial role in the development\nand discovery of materials in diverse applications, such as batteries,\nsemiconductors, catalysts, and pharmaceuticals. Recently, there has been a\ngrowing interest in employing data-driven approaches by using machine learning\ntechnologies, in combination with conventional theoretical calculations. In\nmaterial science, the prediction of unobserved values, commonly referred to as\nextrapolation, is particularly critical for property prediction as it enables\nresearchers to gain insight into materials beyond the limits of available data.\nHowever, even with the recent advancements in powerful machine learning models,\naccurate extrapolation is still widely recognized as a significantly\nchallenging problem. On the other hand, self-supervised pretraining is a\nmachine learning technique where a model is first trained on unlabeled data\nusing relatively simple pretext tasks before being trained on labeled data for\ntarget tasks. As self-supervised pretraining can effectively utilize material\ndata without observed property values, it has the potential to improve the\nmodel's extrapolation ability. 
In this paper, we clarify how such\nself-supervised pretraining can enhance extrapolation performance.We propose an\nexperimental framework for the demonstration and empirically reveal that while\nmodels were unable to accurately extrapolate absolute property values,\nself-supervised pretraining enables them to learn relative tendencies of\nunobserved property values and improve extrapolation performance.\n","authors":["Shun Takashige","Masatoshi Hanai","Toyotaro Suzumura","Limin Wang","Kenjiro Taura"],"pdf_url":"https://arxiv.org/pdf/2308.08129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08128v1","updated":"2023-08-16T03:35:52Z","published":"2023-08-16T03:35:52Z","title":"How to Mask in Error Correction Code Transformer: Systematic and Double\n Masking","summary":" In communication and storage systems, error correction codes (ECCs) are\npivotal in ensuring data reliability. As deep learning's applicability has\nbroadened across diverse domains, there is a growing research focus on neural\nnetwork-based decoders that outperform traditional decoding algorithms. Among\nthese neural decoders, Error Correction Code Transformer (ECCT) has achieved\nthe state-of-the-art performance, outperforming other methods by large margins.\nTo further enhance the performance of ECCT, we propose two novel methods.\nFirst, leveraging the systematic encoding technique of ECCs, we introduce a new\nmasking matrix for ECCT, aiming to improve the performance and reduce the\ncomputational complexity. Second, we propose a novel transformer architecture\nof ECCT called a double-masked ECCT. This architecture employs two different\nmask matrices in a parallel manner to learn more diverse features of the\nrelationship between codeword bits in the masked self-attention blocks.\nExtensive simulation results show that the proposed double-masked ECCT\noutperforms the conventional ECCT, achieving the state-of-the-art decoding\nperformance with significant margins.\n","authors":["Seong-Joon Park","Hee-Youl Kwak","Sang-Hyo Kim","Sunghwan Kim","Yongjune Kim","Jong-Seon No"],"pdf_url":"https://arxiv.org/pdf/2308.08128v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2306.10698v4","updated":"2023-08-16T02:11:16Z","published":"2023-06-19T04:48:36Z","title":"Deep Reinforcement Learning with Multitask Episodic Memory Based on\n Task-Conditioned Hypernetwork","summary":" Deep reinforcement learning algorithms are usually impeded by sampling\ninefficiency, heavily depending on multiple interactions with the environment\nto acquire accurate decision-making capabilities. In contrast, humans rely on\ntheir hippocampus to retrieve relevant information from past experiences of\nrelevant tasks, which guides their decision-making when learning a new task,\nrather than exclusively depending on environmental interactions. Nevertheless,\ndesigning a hippocampus-like module for an agent to incorporate past\nexperiences into established reinforcement learning algorithms presents two\nchallenges. The first challenge involves selecting the most relevant past\nexperiences for the current task, and the second challenge is integrating such\nexperiences into the decision network. To address these challenges, we propose\na novel method that utilizes a retrieval network based on task-conditioned\nhypernetwork, which adapts the retrieval network's parameters depending on the\ntask. At the same time, a dynamic modification mechanism enhances the\ncollaborative efforts between the retrieval and decision networks. 
We evaluate\nthe proposed method on the MiniGrid environment.The experimental results\ndemonstrate that our proposed method significantly outperforms strong\nbaselines.\n","authors":["Yonggang Jin","Chenxu Wang","Liuyu Xiang","Yaodong Yang","Junge Zhang","Jie Fu","Zhaofeng He"],"pdf_url":"https://arxiv.org/pdf/2306.10698v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08097v1","updated":"2023-08-16T02:08:46Z","published":"2023-08-16T02:08:46Z","title":"S-Mixup: Structural Mixup for Graph Neural Networks","summary":" Existing studies for applying the mixup technique on graphs mainly focus on\ngraph classification tasks, while the research in node classification is still\nunder-explored. In this paper, we propose a novel mixup augmentation for node\nclassification called Structural Mixup (S-Mixup). The core idea is to take into\naccount the structural information while mixing nodes. Specifically, S-Mixup\nobtains pseudo-labels for unlabeled nodes in a graph along with their\nprediction confidence via a Graph Neural Network (GNN) classifier. These serve\nas the criteria for the composition of the mixup pool for both inter and\nintra-class mixups. Furthermore, we utilize the edge gradient obtained from the\nGNN training and propose a gradient-based edge selection strategy for selecting\nedges to be attached to the nodes generated by the mixup. Through extensive\nexperiments on real-world benchmark datasets, we demonstrate the effectiveness\nof S-Mixup evaluated on the node classification task. We observe that S-Mixup\nenhances the robustness and generalization performance of GNNs, especially in\nheterophilous situations. The source code of S-Mixup can be found at\n\\url{https://github.com/SukwonYun/S-Mixup}\n","authors":["Junghurn Kim","Sukwon Yun","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2308.08097v1.pdf","comment":"CIKM 2023 (Short Paper)"},{"id":"http://arxiv.org/abs/2308.08086v1","updated":"2023-08-16T01:30:13Z","published":"2023-08-16T01:30:13Z","title":"Safety Filter Design for Neural Network Systems via Convex Optimization","summary":" With the increase in data availability, it has been widely demonstrated that\nneural networks (NN) can capture complex system dynamics precisely in a\ndata-driven manner. However, the architectural complexity and nonlinearity of\nthe NNs make it challenging to synthesize a provably safe controller. In this\nwork, we propose a novel safety filter that relies on convex optimization to\nensure safety for a NN system, subject to additive disturbances that are\ncapable of capturing modeling errors. Our approach leverages tools from NN\nverification to over-approximate NN dynamics with a set of linear bounds,\nfollowed by an application of robust linear MPC to search for controllers that\ncan guarantee robust constraint satisfaction. We demonstrate the efficacy of\nthe proposed framework numerically on a nonlinear pendulum system.\n","authors":["Shaoru Chen","Kong Yao Chee","Nikolai Matni","M. Ani Hsieh","George J. Pappas"],"pdf_url":"https://arxiv.org/pdf/2308.08086v1.pdf","comment":"This paper has been accepted to the 2023 62nd IEEE Conference on\n Decision and Control (CDC)"},{"id":"http://arxiv.org/abs/2308.07439v2","updated":"2023-08-16T01:29:39Z","published":"2023-08-14T20:20:26Z","title":"Interaction-Aware Personalized Vehicle Trajectory Prediction Using\n Temporal Graph Neural Networks","summary":" Accurate prediction of vehicle trajectories is vital for advanced driver\nassistance systems and autonomous vehicles. 
Existing methods mainly rely on\ngeneric trajectory predictions derived from large datasets, overlooking the\npersonalized driving patterns of individual drivers. To address this gap, we\npropose an approach for interaction-aware personalized vehicle trajectory\nprediction that incorporates temporal graph neural networks. Our method\nutilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to\nmodel the spatio-temporal interactions between target vehicles and their\nsurrounding traffic. To personalize the predictions, we establish a pipeline\nthat leverages transfer learning: the model is initially pre-trained on a\nlarge-scale trajectory dataset and then fine-tuned for each driver using their\nspecific driving data. We employ human-in-the-loop simulation to collect\npersonalized naturalistic driving trajectories and corresponding surrounding\nvehicle trajectories. Experimental results demonstrate the superior performance\nof our personalized GCN-LSTM model, particularly for longer prediction\nhorizons, compared to its generic counterpart. Moreover, the personalized model\noutperforms individual models created without pre-training, emphasizing the\nsignificance of pre-training on a large dataset to avoid overfitting. By\nincorporating personalization, our approach enhances trajectory prediction\naccuracy.\n","authors":["Amr Abdelraouf","Rohit Gupta","Kyungtae Han"],"pdf_url":"https://arxiv.org/pdf/2308.07439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.12288v2","updated":"2023-08-16T01:05:13Z","published":"2022-08-25T18:25:27Z","title":"Neuro-Dynamic State Estimation for Networked Microgrids","summary":" We devise neuro-dynamic state estimation (Neuro-DSE), a learning-based\ndynamic state estimation (DSE) algorithm for networked microgrids (NMs) under\nunknown subsystems. Our contributions include: 1) a data-driven Neuro-DSE\nalgorithm for NMs DSE with partially unidentified dynamic models, which\nincorporates the neural-ordinary-differential-equations (ODE-Net) into Kalman\nfilters; 2) a self-refining Neuro-DSE algorithm (Neuro-DSE+) which enables\ndata-driven DSE under limited and noisy measurements by establishing an\nautomatic filtering, augmenting and correcting framework; 3) a\nNeuro-KalmanNet-DSE algorithm which further integrates KalmanNet with Neuro-DSE\nto relieve the model mismatch of both neural- and physics-based dynamic models;\nand 4) an augmented Neuro-DSE for joint estimation of NMs states and unknown\nparameters (e.g., inertia). Extensive case studies demonstrate the efficacy of\nNeuro-DSE and its variants under different noise levels, control modes, power\nsources, observabilities and model knowledge, respectively.\n","authors":["Fei Feng","Yifan Zhou","Peng Zhang"],"pdf_url":"https://arxiv.org/pdf/2208.12288v2.pdf","comment":"This paper needs to be withdrawn by the author. In Section II, Part\n C, there is lack of procedure to achieve parameter estimation using the\n proposed model. In Section V, Part E, experiment parameter setting is missed.\n Noise for estimating inertia case needs to be reset for simulation.\n Additional tests need to be added. These two parts need to be rewritten"},{"id":"http://arxiv.org/abs/2210.10592v2","updated":"2023-08-16T01:01:50Z","published":"2022-10-19T14:34:12Z","title":"DyTed: Disentangled Representation Learning for Discrete-time Dynamic\n Graph","summary":" Unsupervised representation learning for dynamic graphs has attracted a lot\nof research attention in recent years. 
Compared with static graph, the dynamic\ngraph is a comprehensive embodiment of both the intrinsic stable\ncharacteristics of nodes and the time-related dynamic preference. However,\nexisting methods generally mix these two types of information into a single\nrepresentation space, which may lead to poor explanation, less robustness, and\na limited ability when applied to different downstream tasks. To solve the\nabove problems, in this paper, we propose a novel disenTangled representation\nlearning framework for discrete-time Dynamic graphs, namely DyTed. We specially\ndesign a temporal-clips contrastive learning task together with a structure\ncontrastive learning to effectively identify the time-invariant and\ntime-varying representations respectively. To further enhance the\ndisentanglement of these two types of representation, we propose a\ndisentanglement-aware discriminator under an adversarial learning framework\nfrom the perspective of information theory. Extensive experiments on Tencent\nand five commonly used public datasets demonstrate that DyTed, as a general\nframework that can be applied to existing methods, achieves state-of-the-art\nperformance on various downstream tasks, as well as be more robust against\nnoise.\n","authors":["Kaike Zhang","Qi Cao","Gaolin Fang","Bingbing Xu","Hongjian Zou","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2210.10592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18437v2","updated":"2023-08-16T00:58:55Z","published":"2023-05-29T00:41:32Z","title":"Explainable Machine Learning for Categorical and Mixed Data with\n Lossless Visualization","summary":" Building accurate and interpretable Machine Learning (ML) models for\nheterogeneous/mixed data is a long-standing challenge for algorithms designed\nfor numeric data. This work focuses on developing numeric coding schemes for\nnon-numeric attributes for ML algorithms to support accurate and explainable ML\nmodels, methods for lossless visualization of n-D non-numeric categorical data\nwith visual rule discovery in these visualizations, and accurate and\nexplainable ML models for categorical data. This study proposes a\nclassification of mixed data types and analyzes their important role in Machine\nLearning. It presents a toolkit for enforcing interpretability of all internal\noperations of ML algorithms on mixed data with a visual data exploration on\nmixed data. A new Sequential Rule Generation (SRG) algorithm for explainable\nrule generation with categorical data is proposed and successfully evaluated in\nmultiple computational experiments. This work is one of the steps to the full\nscope ML algorithms for mixed data supported by lossless visualization of n-D\ndata in General Line Coordinates beyond Parallel Coordinates.\n","authors":["Boris Kovalerchuk","Elijah McCoy"],"pdf_url":"https://arxiv.org/pdf/2305.18437v2.pdf","comment":"46 pages, 32 figures, 29 tables. arXiv admin note: substantial text\n overlap with arXiv:2206.06476"},{"id":"http://arxiv.org/abs/2305.11095v3","updated":"2023-08-16T00:57:34Z","published":"2023-05-18T16:32:58Z","title":"Prompting the Hidden Talent of Web-Scale Speech Models for Zero-Shot\n Task Generalization","summary":" We investigate the emergent abilities of the recently proposed web-scale\nspeech model Whisper, by adapting it to unseen tasks with prompt engineering.\nWe selected three tasks: audio-visual speech recognition (AVSR), code-switched\nspeech recognition (CS-ASR), and speech translation (ST) on unseen language\npairs. 
We design task-specific prompts, by either leveraging another\nlarge-scale model, or simply manipulating the special tokens in the default\nprompts. Experiments show that compared to the default prompts, our proposed\nprompts improve performance by 10% to 45% on the three zero-shot tasks, and\neven outperform SotA supervised models on some datasets. In addition, our\nexperiments reveal many interesting properties of Whisper, including its\nrobustness to prompts, bias on accents, and the multilingual understanding in\nits latent space. Code is available at\nhttps://github.com/jasonppy/PromptingWhisper\n","authors":["Puyuan Peng","Brian Yan","Shinji Watanabe","David Harwath"],"pdf_url":"https://arxiv.org/pdf/2305.11095v3.pdf","comment":"Interspeech 2023"},{"id":"http://arxiv.org/abs/2308.08079v1","updated":"2023-08-16T00:19:52Z","published":"2023-08-16T00:19:52Z","title":"Rigid Transformations for Stabilized Lower Dimensional Space to Support\n Subsurface Uncertainty Quantification and Interpretation","summary":" Subsurface datasets inherently possess big data characteristics such as vast\nvolume, diverse features, and high sampling speeds, further compounded by the\ncurse of dimensionality from various physical, engineering, and geological\ninputs. Among the existing dimensionality reduction (DR) methods, nonlinear\ndimensionality reduction (NDR) methods, especially Metric-multidimensional\nscaling (MDS), are preferred for subsurface datasets due to their inherent\ncomplexity. While MDS retains intrinsic data structure and quantifies\nuncertainty, its limitations include unstabilized unique solutions invariant to\nEuclidean transformations and an absence of out-of-sample points (OOSP)\nextension. To enhance subsurface inferential and machine learning workflows,\ndatasets must be transformed into stable, reduced-dimension representations\nthat accommodate OOSP.\n Our solution employs rigid transformations for a stabilized Euclidean\ninvariant representation for LDS. By computing an MDS input dissimilarity\nmatrix, and applying rigid transformations on multiple realizations, we ensure\ntransformation invariance and integrate OOSP. This process leverages a convex\nhull algorithm and incorporates loss function and normalized stress for\ndistortion quantification. We validate our approach with synthetic data,\nvarying distance metrics, and real-world wells from the Duvernay Formation.\nResults confirm our method's efficacy in achieving consistent LDS\nrepresentations. Furthermore, our proposed \"stress ratio\" (SR) metric provides\ninsight into uncertainty, beneficial for model adjustments and inferential\nanalysis. Consequently, our workflow promises enhanced repeatability and\ncomparability in NDR for subsurface energy resource engineering and associated\nbig data workflows.\n","authors":["Ademide O. Mabadeje","Michael J. Pyrcz"],"pdf_url":"https://arxiv.org/pdf/2308.08079v1.pdf","comment":"30 pages, 17 figures, Submitted to Computational Geosciences Journal"},{"id":"http://arxiv.org/abs/2303.13516v3","updated":"2023-08-16T00:00:47Z","published":"2023-03-23T17:59:42Z","title":"Ablating Concepts in Text-to-Image Diffusion Models","summary":" Large-scale text-to-image diffusion models can generate high-fidelity images\nwith powerful compositional ability. However, these models are typically\ntrained on an enormous amount of Internet data, often containing copyrighted\nmaterial, licensed images, and personal photos. 
Furthermore, they have been\nfound to replicate the style of various living artists or memorize exact\ntraining samples. How can we remove such copyrighted concepts or images without\nretraining the model from scratch? To achieve this goal, we propose an\nefficient method of ablating concepts in the pretrained model, i.e., preventing\nthe generation of a target concept. Our algorithm learns to match the image\ndistribution for a target style, instance, or text prompt we wish to ablate to\nthe distribution corresponding to an anchor concept. This prevents the model\nfrom generating target concepts given its text condition. Extensive experiments\nshow that our method can successfully prevent the generation of the ablated\nconcept while preserving closely related concepts in the model.\n","authors":["Nupur Kumari","Bingliang Zhang","Sheng-Yu Wang","Eli Shechtman","Richard Zhang","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.13516v3.pdf","comment":"ICCV 2023. Project website: https://www.cs.cmu.edu/~concept-ablation/"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.07578v2","updated":"2023-08-16T14:12:43Z","published":"2023-08-15T05:33:48Z","title":"Understanding User Behavior in Volumetric Video Watching: Dataset,\n Analysis and Prediction","summary":" Volumetric video emerges as a new attractive video paradigm in recent years\nsince it provides an immersive and interactive 3D viewing experience with six\ndegree-of-freedom (DoF). Unlike traditional 2D or panoramic videos, volumetric\nvideos require dense point clouds, voxels, meshes, or huge neural models to\ndepict volumetric scenes, which results in a prohibitively high bandwidth\nburden for video delivery. Users' behavior analysis, especially the viewport\nand gaze analysis, then plays a significant role in prioritizing the content\nstreaming within users' viewport and degrading the remaining content to\nmaximize user QoE with limited bandwidth. Although understanding user behavior\nis crucial, to the best of our best knowledge, there are no available 3D\nvolumetric video viewing datasets containing fine-grained user interactivity\nfeatures, not to mention further analysis and behavior prediction. In this\npaper, we for the first time release a volumetric video viewing behavior\ndataset, with a large scale, multiple dimensions, and diverse conditions. We\nconduct an in-depth analysis to understand user behaviors when viewing\nvolumetric videos. Interesting findings on user viewport, gaze, and motion\npreference related to different videos and users are revealed. We finally\ndesign a transformer-based viewport prediction model that fuses the features of\nboth gaze and motion, which is able to achieve high accuracy at various\nconditions. Our prediction model is expected to further benefit volumetric\nvideo streaming optimization. Our dataset, along with the corresponding\nvisualization tools is accessible at\nhttps://cuhksz-inml.github.io/user-behavior-in-vv-watching/\n","authors":["Kaiyuan Hu","Haowen Yang","Yili Jin","Junhua Liu","Yongting Chen","Miao Zhang","Fangxin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07578v2.pdf","comment":"Accepted by ACM MM'23"},{"id":"http://arxiv.org/abs/2308.02510v2","updated":"2023-08-16T09:59:40Z","published":"2023-07-27T12:54:16Z","title":"Seeing through the Brain: Image Reconstruction of Visual Perception from\n Human Brain Signals","summary":" Seeing is believing, however, the underlying mechanism of how human visual\nperceptions are intertwined with our cognitions is still a mystery. 
Thanks to\nthe recent advances in both neuroscience and artificial intelligence, we have\nbeen able to record the visually evoked brain activities and mimic the visual\nperception ability through computational approaches. In this paper, we pay\nattention to visual stimuli reconstruction by reconstructing the observed\nimages based on portably accessible brain signals, i.e., electroencephalography\n(EEG) data. Since EEG signals are dynamic in the time-series format and are\nnotorious to be noisy, processing and extracting useful information requires\nmore dedicated efforts; In this paper, we propose a comprehensive pipeline,\nnamed NeuroImagen, for reconstructing visual stimuli images from EEG signals.\nSpecifically, we incorporate a novel multi-level perceptual information\ndecoding to draw multi-grained outputs from the given EEG data. A latent\ndiffusion model will then leverage the extracted information to reconstruct the\nhigh-resolution visual stimuli images. The experimental results have\nillustrated the effectiveness of image reconstruction and superior quantitative\nperformance of our proposed method.\n","authors":["Yu-Ting Lan","Kan Ren","Yansen Wang","Wei-Long Zheng","Dongsheng Li","Bao-Liang Lu","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.02510v2.pdf","comment":"A preprint version of an ongoing work"},{"id":"http://arxiv.org/abs/2308.08143v1","updated":"2023-08-16T04:31:33Z","published":"2023-08-16T04:31:33Z","title":"SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech\n Separation","summary":" The integration of different modalities, such as audio and visual\ninformation, plays a crucial role in human perception of the surrounding\nenvironment. Recent research has made significant progress in designing fusion\nmodules for audio-visual speech separation. However, they predominantly focus\non multi-modal fusion architectures situated either at the top or bottom\npositions, rather than comprehensively considering multi-modal fusion at\nvarious hierarchical positions within the network. In this paper, we propose a\nnovel model called self- and cross-attention network (SCANet), which leverages\nthe attention mechanism for efficient audio-visual feature fusion. SCANet\nconsists of two types of attention blocks: self-attention (SA) and\ncross-attention (CA) blocks, where the CA blocks are distributed at the top\n(TCA), middle (MCA) and bottom (BCA) of SCANet. These blocks maintain the\nability to learn modality-specific features and enable the extraction of\ndifferent semantics from audio-visual features. Comprehensive experiments on\nthree standard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2)\ndemonstrate the effectiveness of SCANet, outperforming existing\nstate-of-the-art (SOTA) methods while maintaining comparable inference time.\n","authors":["Kai Li","Runxuan Yang","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.08143v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.07056v2","updated":"2023-08-16T01:58:26Z","published":"2023-08-14T10:31:29Z","title":"VoxBlink: X-Large Speaker Verification Dataset on Camera","summary":" In this paper, we contribute a novel and extensive dataset for speaker\nverification, which contains noisy 38k identities/1.45M utterances (VoxBlink)\nand relatively cleaned 18k identities/1.02M (VoxBlink-Clean) utterances for\ntraining. Firstly, we collect a 60K+ users' list as well as their avatar and\ndownload their SHORT videos on the YouTube. 
Then, an automatically pipeline is\ndevised to extract target user's speech segments and videos, which is efficient\nand scalable. To the best of our knowledge, the VoxBlink dataset is the largest\nspeaker recognition dataset. Secondly, we develop a series of experiments based\non VoxBlink-clean together with VoxCeleb2. Our findings highlight a notable\nimprovement in performance, ranging from 15% to 30%, across different backbone\narchitectures, upon integrating our dataset for training. The dataset will be\nreleased SOON~.\n","authors":["Yuke Lin","Xiaoyi Qin","Ming Cheng","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2308.07056v2.pdf","comment":"submit to ICASSP2023"},{"id":"http://arxiv.org/abs/2308.08088v1","updated":"2023-08-16T01:38:49Z","published":"2023-08-16T01:38:49Z","title":"Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme\n Detection","summary":" Hateful meme detection is a challenging multimodal task that requires\ncomprehension of both vision and language, as well as cross-modal interactions.\nRecent studies have tried to fine-tune pre-trained vision-language models\n(PVLMs) for this task. However, with increasing model sizes, it becomes\nimportant to leverage powerful PVLMs more efficiently, rather than simply\nfine-tuning them. Recently, researchers have attempted to convert meme images\ninto textual captions and prompt language models for predictions. This approach\nhas shown good performance but suffers from non-informative image captions.\nConsidering the two factors mentioned above, we propose a probing-based\ncaptioning approach to leverage PVLMs in a zero-shot visual question answering\n(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful\ncontent-related questions and use the answers as image captions (which we call\nPro-Cap), so that the captions contain information critical for hateful content\ndetection. 
The good performance of models with Pro-Cap on three benchmarks\nvalidates the effectiveness and generalization of the proposed method.\n","authors":["Rui Cao","Ming Shan Hee","Adriel Kuek","Wen-Haw Chong","Roy Ka-Wei Lee","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.08088v1.pdf","comment":"Camera-ready for 23, ACM MM"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: 
var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + 
font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..f47b875c --- /dev/null +++ b/index.html @@ -0,0 +1,65986 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 43 + +
+
+
+ + ☆ Time Travel in LLMs: Tracing Data Contamination in Large Language Models + + +
+ Data contamination, i.e., the presence of test data from downstream tasks in +the training data of large language models (LLMs), is a potential major issue +in understanding LLMs' effectiveness on other tasks. We propose a +straightforward yet effective method for identifying data contamination within +LLMs. At its core, our approach starts by identifying potential contamination +in individual instances that are drawn from a small random sample; using this +information, our approach then assesses if an entire dataset partition is +contaminated. To estimate contamination of individual instances, we employ +"guided instruction:" a prompt consisting of the dataset name, partition type, +and the initial segment of a reference instance, asking the LLM to complete it. +An instance is flagged as contaminated if the LLM's output either exactly or +closely matches the latter segment of the reference. To understand if an entire +partition is contaminated, we propose two ideas. The first idea marks a dataset +partition as contaminated if the average overlap score with the reference +instances (as measured by ROUGE or BLEURT) is statistically significantly +better with the guided instruction vs. a general instruction that does not +include the dataset and partition name. The second idea marks a dataset as +contaminated if a classifier based on GPT-4 with in-context learning prompting +marks multiple instances as contaminated. Our best method achieves an accuracy +between 92% and 100% in detecting if an LLM is contaminated with seven +datasets, containing train and test/validation partitions, when contrasted with +manual evaluation by human expert. Further, our findings indicate that GPT-4 is +contaminated with AG News, WNLI, and XSum datasets. + +
+
+ comment: v1 preprint +
+
+
+
+
+ + ☆ Mitigating the Exposure Bias in Sentence-Level Grapheme-to-Phoneme (G2P) + Transduction INTERSPEECH 2023 + + +
+ Text-to-Text Transfer Transformer (T5) has recently been considered for the +Grapheme-to-Phoneme (G2P) transduction. As a follow-up, a tokenizer-free +byte-level model based on T5 referred to as ByT5, recently gave promising +results on word-level G2P conversion by representing each input character with +its corresponding UTF-8 encoding. Although it is generally understood that +sentence-level or paragraph-level G2P can improve usability in real-world +applications as it is better suited to perform on heteronyms and linking sounds +between words, we find that using ByT5 for these scenarios is nontrivial. Since +ByT5 operates on the character level, it requires longer decoding steps, which +deteriorates the performance due to the exposure bias commonly observed in +auto-regressive generation models. This paper shows that the performance of +sentence-level and paragraph-level G2P can be improved by mitigating such +exposure bias using our proposed loss-based sampling method. + +
+
+ comment: INTERSPEECH 2023 +
+
+
+
+
+ + ☆ Knowledge-Enhanced Multi-Label Few-Shot Product Attribute-Value + Extraction CIKM 2023 + + +
+ Existing attribute-value extraction (AVE) models require large quantities of +labeled data for training. However, new products with new attribute-value pairs +enter the market every day in real-world e-Commerce. Thus, we formulate AVE in +multi-label few-shot learning (FSL), aiming to extract unseen attribute value +pairs based on a small number of training examples. We propose a +Knowledge-Enhanced Attentive Framework (KEAF) based on prototypical networks, +leveraging the generated label description and category information to learn +more discriminative prototypes. Besides, KEAF integrates with hybrid attention +to reduce noise and capture more informative semantics for each class by +calculating the label-relevant and query-related weights. To achieve +multi-label inference, KEAF further learns a dynamic threshold by integrating +the semantic information from both the support set and the query set. Extensive +experiments with ablation studies conducted on two datasets demonstrate that +KEAF outperforms other SOTA models for information extraction in FSL. The code +can be found at: https://github.com/gjiaying/KEAF + +
+
+ comment: 6 pages, 2 figures, published in CIKM 2023 +
+
+
+
+
+ + ☆ Advancing continual lifelong learning in neural information retrieval: + definition, dataset, framework, and empirical evaluation + + +
+ Continual learning refers to the capability of a machine learning model to +learn and adapt to new information, without compromising its performance on +previously learned tasks. Although several studies have investigated continual +learning methods for information retrieval tasks, a well-defined task +formulation is still lacking, and it is unclear how typical learning strategies +perform in this context. To address this challenge, a systematic task +formulation of continual neural information retrieval is presented, along with +a multiple-topic dataset that simulates continuous information retrieval. A +comprehensive continual neural information retrieval framework consisting of +typical retrieval models and continual learning strategies is then proposed. +Empirical evaluations illustrate that the proposed framework can successfully +prevent catastrophic forgetting in neural information retrieval and enhance +performance on previously learned tasks. The results indicate that +embedding-based retrieval models experience a decline in their continual +learning performance as the topic shift distance and dataset volume of new +tasks increase. In contrast, pretraining-based models do not show any such +correlation. Adopting suitable learning strategies can mitigate the effects of +topic shift and data augmentation. + +
+
+ comment: Submitted to Information Sciences +
+
+
+
+
+ + ☆ SummHelper: Collaborative Human-Computer Summarization + + +
+ Current approaches for text summarization are predominantly automatic, with +rather limited space for human intervention and control over the process. In +this paper, we introduce SummHelper, a 2-phase summarization assistant designed +to foster human-machine collaboration. The initial phase involves content +selection, where the system recommends potential content, allowing users to +accept, modify, or introduce additional selections. The subsequent phase, +content consolidation, involves SummHelper generating a coherent summary from +these selections, which users can then refine using visual mappings between the +summary and the source text. Small-scale user studies reveal the effectiveness +of our application, with participants being especially appreciative of the +balance between automated guidance and opportunities for personal input. + +
+
+ comment: Demo paper +
+
+
+
+
+ + ☆ Detoxify Language Model Step-by-Step + + +
+ Detoxification for LLMs is challenging since it requires models to avoid +generating harmful content while maintaining the generation capability. To +ensure the safety of generations, previous detoxification methods detoxify the +models by changing the data distributions or constraining the generations from +different aspects in a single-step manner. However, these approaches will +dramatically affect the generation quality of LLMs, e.g., discourse coherence +and semantic consistency, since language models tend to generate along the +toxic prompt while detoxification methods work in the opposite direction. To +handle such a conflict, we decompose the detoxification process into different +sub-steps, where the detoxification is concentrated in the input stage and the +subsequent continual generation is based on the non-toxic prompt. Besides, we +also calibrate the strong reasoning ability of LLMs by designing a Detox-Chain +to connect the above sub-steps in an orderly manner, which allows LLMs to +detoxify the text step-by-step. Automatic and human evaluation on two +benchmarks reveals that by training with Detox-Chain, six LLMs scaling from 1B +to 33B can obtain significant detoxification and generation improvement. Our +code and data are available at https://github.com/CODINNLG/Detox-CoT. Warning: +examples in the paper may contain uncensored offensive content. + +
+
+
+
+
+ + ☆ Pre-training with Large Language Model-based Document Expansion for + Dense Passage Retrieval + + +
+ In this paper, we systematically study the potential of pre-training with +Large Language Model(LLM)-based document expansion for dense passage retrieval. +Concretely, we leverage the capabilities of LLMs for document expansion, i.e. +query generation, and effectively transfer expanded knowledge to retrievers +using pre-training strategies tailored for passage retrieval. These strategies +include contrastive learning and bottlenecked query generation. Furthermore, we +incorporate a curriculum learning strategy to reduce the reliance on LLM +inferences. Experimental results demonstrate that pre-training with LLM-based +document expansion significantly boosts the retrieval performance on +large-scale web-search tasks. Our work shows strong zero-shot and out-of-domain +retrieval abilities, making it more widely applicable for retrieval when +initializing with no human-labeled data. + +
+
+ comment: 10 pages, 3 tables, 4 figures, under review +
+
+
+
+
+ + ☆ Benchmarking Neural Network Generalization for Grammar Induction + + +
+ How well do neural networks generalize? Even for grammar induction tasks, +where the target generalization is fully known, previous works have left the +question open, testing very limited ranges beyond the training set and using +different success criteria. We provide a measure of neural network +generalization based on fully specified formal languages. Given a model and a +formal grammar, the method assigns a generalization score representing how well +a model generalizes to unseen samples in inverse relation to the amount of data +it was trained on. The benchmark includes languages such as $a^nb^n$, +$a^nb^nc^n$, $a^nb^mc^{n+m}$, and Dyck-1 and 2. We evaluate selected +architectures using the benchmark and find that networks trained with a Minimum +Description Length objective (MDL) generalize better and using less data than +networks trained using standard loss functions. The benchmark is available at +https://github.com/taucompling/bliss. + +
+
+ comment: 10 pages, 4 figures, 2 tables. Conference: Learning with Small Data + 2023 +
+
+
+
+
+ + ☆ TEST: Text Prototype Aligned Embedding to Activate LLM's Ability for + Time Series + + +
+ This work summarizes two strategies for completing time-series (TS) tasks +using today's language model (LLM): LLM-for-TS, design and train a fundamental +large model for TS data; TS-for-LLM, enable the pre-trained LLM to handle TS +data. Considering the insufficient data accumulation, limited resources, and +semantic context requirements, this work focuses on TS-for-LLM methods, where +we aim to activate LLM's ability for TS data by designing a TS embedding method +suitable for LLM. The proposed method is named TEST. It first tokenizes TS, +builds an encoder to embed them by instance-wise, feature-wise, and +text-prototype-aligned contrast, and then creates prompts to make LLM more open +to embeddings, and finally implements TS tasks. Experiments are carried out on +TS classification and forecasting tasks using 8 LLMs with different structures +and sizes. Although its results cannot significantly outperform the current +SOTA models customized for TS tasks, by treating LLM as the pattern machine, it +can endow LLM's ability to process TS data without compromising the language +ability. This paper is intended to serve as a foundational work that will +inspire further research. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ MemoChat: Tuning LLMs to Use Memos for Consistent Long-Range Open-Domain + Conversation + + +
+ We propose MemoChat, a pipeline for refining instructions that enables large +language models (LLMs) to effectively employ self-composed memos for +maintaining consistent long-range open-domain conversations. We demonstrate a +long-range open-domain conversation through iterative +"memorization-retrieval-response" cycles. This requires us to carefully design +tailored tuning instructions for each distinct stage. The instructions are +reconstructed from a collection of public datasets to teach the LLMs to +memorize and retrieve past dialogues with structured memos, leading to enhanced +consistency when participating in future conversations. We invite experts to +manually annotate a test set designed to evaluate the consistency of long-range +conversations questions. Experiments on three testing scenarios involving both +open-source and API-accessible chatbots at scale verify the efficacy of +MemoChat, which outperforms strong baselines. + +
+
+ comment: Codes, data and models will be available soon +
+
+
+
+
+ + ☆ Challenges and Opportunities of Using Transformer-Based Multi-Task + Learning in NLP Through ML Lifecycle: A Survey + + +
+ The increasing adoption of natural language processing (NLP) models across
+industries has led to practitioners' need for machine learning systems to
+handle these models efficiently, from training to serving them in production.
+However, training, deploying, and updating multiple models can be complex,
+costly, and time-consuming, particularly when using transformer-based
+pre-trained language models. Multi-Task Learning (MTL) has emerged as a
+promising approach to improve efficiency and performance through joint
+training, rather than training separate models. Motivated by this, we first
+provide an overview of transformer-based MTL approaches in NLP. Then, we
+discuss the challenges and opportunities of using MTL approaches throughout
+typical ML lifecycle phases, specifically focusing on the challenges related to
+data engineering, model development, deployment, and monitoring phases. This
+survey focuses on transformer-based MTL architectures and, to the best of our
+knowledge, is novel in that it systematically analyses how transformer-based
+MTL in NLP fits into ML lifecycle phases. Furthermore, we motivate research on
+the connection between MTL and continual learning (CL), as this area remains
+unexplored. We believe it would be practical to have a model that can handle
+both MTL and CL, as this would make it easier to periodically re-train the
+model, update it due to distribution shifts, and add new capabilities to meet
+real-world requirements.
+
+
+
+
+ + ☆ MoCoSA: Momentum Contrast for Knowledge Graph Completion with + Structure-Augmented Pre-trained Language Models + + +
+ Knowledge Graph Completion (KGC) aims to conduct reasoning on the facts
+within knowledge graphs and automatically infer missing links. Existing methods
+can mainly be categorized into structure-based or description-based. On the one
+hand, structure-based methods effectively represent relational facts in
+knowledge graphs using entity embeddings. However, they struggle with
+semantically rich real-world entities due to limited structural information and
+fail to generalize to unseen entities. On the other hand, description-based
+methods leverage pre-trained language models (PLMs) to understand textual
+information. They exhibit strong robustness towards unseen entities. However,
+they have difficulty with larger negative sampling and often lag behind
+structure-based methods. To address these issues, in this paper, we propose
+Momentum Contrast for knowledge graph completion with Structure-Augmented
+pre-trained language models (MoCoSA), which allows the PLM to perceive
+structural information through an adaptable structure encoder. To improve
+learning efficiency, we propose momentum hard negative sampling and
+intra-relation negative sampling. Experimental results demonstrate that our
+approach achieves state-of-the-art performance in terms of mean reciprocal rank
+(MRR), with improvements of 2.5% on WN18RR and 21% on OpenBG500.
+
+
+
+
+ + ☆ ChinaTelecom System Description to VoxCeleb Speaker Recognition + Challenge 2023 + + +
+ This technical report describes the ChinaTelecom system for Track 1 (closed)
+of the VoxCeleb2023 Speaker Recognition Challenge (VoxSRC 2023). Our system
+consists of several ResNet variants trained only on VoxCeleb2, which were later
+fused for better performance. Score calibration was also applied to each
+variant and to the fused system. The final submission achieved a minDCF of
+0.1066 and an EER of 1.980%.
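+
+ A minimal sketch of score fusion plus calibration of the kind described
+(illustrative only, not the team's actual recipe; the data here is dummy):
+
+   import numpy as np
+   from sklearn.linear_model import LogisticRegression
+
+   scores = np.random.rand(1000, 3)        # trials x ResNet-variant scores (dummy)
+   labels = np.random.randint(0, 2, 1000)  # 1 = same-speaker trial (dummy)
+
+   fused = scores.mean(axis=1, keepdims=True)            # simple equal-weight fusion
+   calibrator = LogisticRegression().fit(fused, labels)  # learn a calibration mapping
+   calibrated = calibrator.decision_function(fused)      # calibrated LLR-like scores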
+
+ comment: System description of VoxSRC 2023 +
+
+
+
+
+ + ☆ RSpell: Retrieval-augmented Framework for Domain Adaptive Chinese + Spelling Check + + +
+ Chinese Spelling Check (CSC) refers to the detection and correction of
+spelling errors in Chinese texts. In practical application scenarios, it is
+important for CSC models to be able to correct errors across different domains.
+In this paper, we propose a retrieval-augmented spelling check framework called
+RSpell, which searches corresponding domain terms and incorporates them into
+CSC models. Specifically, we employ pinyin fuzzy matching to search for terms,
+which are combined with the input and fed into the CSC model. Then, we
+introduce an adaptive process control mechanism to dynamically adjust the
+impact of external knowledge on the model. Additionally, we develop an
+iterative strategy for the RSpell framework to enhance reasoning capabilities.
+We conducted experiments on CSC datasets in three domains: law, medicine, and
+official document writing. The results show that RSpell achieves
+state-of-the-art performance in both zero-shot and fine-tuning scenarios,
+demonstrating the effectiveness of the retrieval-augmented CSC framework. Our
+code is available at https://github.com/47777777/Rspell.
+
+
+
+
+ + ☆ Enhancing Performance on Seen and Unseen Dialogue Scenarios using + Retrieval-Augmented End-to-End Task-Oriented System SIGDIAL 2023 + + +
+ End-to-end task-oriented dialogue (TOD) systems have achieved promising +performance by leveraging sophisticated natural language understanding and +natural language generation capabilities of pre-trained models. This work +enables the TOD systems with more flexibility through a simple cache. The cache +provides the flexibility to dynamically update the TOD systems and handle both +existing and unseen dialogue scenarios. Towards this end, we first fine-tune a +retrieval module to effectively retrieve the most relevant information entries +from the cache. We then train end-to-end TOD models that can refer to and +ground on both dialogue history and retrieved information during TOD +generation. The cache is straightforward to construct, and the backbone models +of TOD systems are compatible with existing pre-trained generative models. +Extensive experiments demonstrate the superior performance of our framework, +with a notable improvement in non-empty joint goal accuracy by 6.7% compared to +strong baselines. + +
+
+ comment: Accepted by SIGDIAL 2023 as a long paper +
+
+
+
+
+ + ☆ Sarcasm Detection in a Disaster Context + + +
+ During natural disasters, people often use social media platforms such as +Twitter to ask for help, to provide information about the disaster situation, +or to express contempt about the unfolding event or public policies and +guidelines. This contempt is in some cases expressed as sarcasm or irony. +Understanding this form of speech in a disaster-centric context is essential to +improving natural language understanding of disaster-related tweets. In this +paper, we introduce HurricaneSARC, a dataset of 15,000 tweets annotated for +intended sarcasm, and provide a comprehensive investigation of sarcasm +detection using pre-trained language models. Our best model is able to obtain +as much as 0.70 F1 on our dataset. We also demonstrate that the performance on +HurricaneSARC can be improved by leveraging intermediate task transfer +learning. We release our data and code at +https://github.com/tsosea2/HurricaneSarc. + +
+
+
+
+
+ + ☆ AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation + Framework + + +
+ This technical report presents AutoGen, a new framework that enables
+development of LLM applications using multiple agents that can converse with
+each other to solve tasks. AutoGen agents are customizable, conversable, and
+seamlessly allow human participation. They can operate in various modes that
+employ combinations of LLMs, human inputs, and tools. AutoGen's design offers
+multiple advantages: a) it gracefully navigates the strong but imperfect
+generation and reasoning abilities of LLMs; b) it leverages human understanding
+and intelligence, while providing valuable automation through conversations
+between agents; c) it simplifies and unifies the implementation of complex LLM
+workflows as automated agent chats. We provide many diverse examples of how
+developers can easily use AutoGen to effectively solve tasks or build
+applications in areas ranging from coding, mathematics, operations research,
+and entertainment to online decision-making and question answering.
+
+ comment: 28 pages +
+
+
+
+
+ + ☆ Fast Training of NMT Model with Data Sorting + + +
+ The Transformer model has revolutionized Natural Language Processing tasks
+such as Neural Machine Translation, and many efforts have been made to study
+the Transformer architecture, which increased its efficiency and accuracy. One
+potential area for improvement is to address the computation of empty tokens
+that the Transformer computes only to discard later, leading to an unnecessary
+computational burden. To tackle this, we propose an algorithm that sorts
+translation sentence pairs based on their length before batching, minimizing
+the waste of computing power. Since excessive sorting could violate the
+independent and identically distributed (i.i.d.) data assumption, we sort the
+data only partially. In experiments, we apply the proposed method to
+English-Korean and English-Luganda language pairs for machine translation and
+show that there are gains in computational time while maintaining the
+performance. Our method is architecture-independent, so it can be easily
+integrated into any training process with variable-length data.
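+
+ A small sketch of the partial-sorting idea (sizes are illustrative, not the
+paper's settings): shuffle globally, sort by source length only inside
+fixed-size chunks, then batch, so batches remain close to i.i.d. while padding
+waste shrinks.
+
+   import random
+
+   def partial_sort_batches(pairs, chunk_size=4096, batch_size=64, seed=0):
+       rng = random.Random(seed)
+       pairs = pairs[:]
+       rng.shuffle(pairs)                               # keep data roughly i.i.d.
+       batches = []
+       for i in range(0, len(pairs), chunk_size):
+           chunk = sorted(pairs[i:i + chunk_size], key=lambda p: len(p[0]))
+           batches += [chunk[j:j + batch_size]
+                       for j in range(0, len(chunk), batch_size)]
+       rng.shuffle(batches)                             # shuffle batch order
+       return batches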
+
+
+
+
+ + ☆ MDDial: A Multi-turn Differential Diagnosis Dialogue Dataset with + Reliability Evaluation + + +
+ Dialogue systems for Automatic Differential Diagnosis (ADD) have a wide range
+of real-life applications. These dialogue systems are promising for providing
+easy access and reducing medical costs. Building end-to-end ADD dialogue
+systems requires dialogue training datasets. However, to the best of our
+knowledge, there is no publicly available ADD dialogue dataset in English
+(although non-English datasets exist). Driven by this, we introduce MDDial, the
+first differential diagnosis dialogue dataset in English, which can aid in
+building and evaluating end-to-end ADD dialogue systems. Additionally, earlier
+studies present the accuracy of diagnosis and symptoms either individually or
+as a combined weighted score. This method overlooks the connection between the
+symptoms and the diagnosis. We introduce a unified score for the ADD system
+that takes into account the interplay between symptoms and diagnosis. This
+score also indicates the system's reliability. To this end, we train two
+moderately sized language models on MDDial. Our experiments suggest that while
+these language models can perform well on many natural language understanding
+tasks, including dialogue tasks in the general domain, they struggle to relate
+relevant symptoms to diseases and thus perform poorly on MDDial. MDDial will be
+released publicly to aid ADD dialogue research.
+
+
+
+
+ + ☆ Radio2Text: Streaming Speech Recognition Using mmWave Radio Signals + + +
+ Millimeter wave (mmWave) based speech recognition opens up new possibilities
+for audio-related applications, such as conference speech transcription and
+eavesdropping. However, considering the practicality in real scenarios, latency
+and recognizable vocabulary size are two critical factors that cannot be
+overlooked. In this paper, we propose Radio2Text, the first mmWave-based system
+for streaming automatic speech recognition (ASR) with a vocabulary size
+exceeding 13,000 words. Radio2Text is based on a tailored streaming Transformer
+that is capable of effectively learning representations of speech-related
+features, paving the way for streaming ASR with a large vocabulary. To
+alleviate the deficiency of streaming networks, which cannot access the entire
+future input, we propose the Guidance Initialization that facilitates the
+transfer of feature knowledge related to the global context from the
+non-streaming Transformer to the tailored streaming Transformer through weight
+inheritance. Further, we propose a cross-modal structure based on knowledge
+distillation (KD), named cross-modal KD, to mitigate the negative effect of
+low-quality mmWave signals on recognition performance. In the cross-modal KD,
+the audio streaming Transformer provides feature and response guidance that
+carries rich and accurate speech information to supervise the training of the
+tailored radio streaming Transformer. The experimental results show that our
+Radio2Text can achieve a character error rate of 5.7% and a word error rate of
+9.4% for the recognition of a vocabulary consisting of over 13,000 words.
+
+ comment: Accepted by Proceedings of the ACM on Interactive, Mobile, Wearable + and Ubiquitous Technologies (ACM IMWUT/UbiComp 2023) +
+
+
+
+
+ + ☆ Separate the Wheat from the Chaff: Model Deficiency Unlearning via + Parameter-Efficient Module Operation + + +
+ Large language models (LLMs) have been widely used in various applications
+but are known to suffer from issues related to untruthfulness and toxicity.
+While parameter-efficient modules (PEMs) have demonstrated their effectiveness
+in equipping models with new skills, leveraging PEMs for deficiency unlearning
+remains underexplored. In this work, we propose a PEMs operation approach,
+namely Extraction-before-Subtraction (Ext-Sub), to enhance the truthfulness and
+detoxification of LLMs through the integration of an ``expert'' PEM and an
+``anti-expert'' PEM. Remarkably, even the anti-expert PEM possesses valuable
+capabilities, since its proficiency in generating fabricated content requires
+language modeling and logical narrative competence. Rather than merely negating
+the parameters, our approach involves extracting and eliminating solely the
+deficiency capability within the anti-expert PEM while preserving its general
+capabilities. To evaluate the effectiveness of our approach in terms of
+truthfulness and detoxification, we conduct extensive experiments on LLMs,
+encompassing additional abilities such as language modeling and mathematical
+reasoning. Our empirical results demonstrate that our approach effectively
+improves truthfulness and detoxification, while largely preserving the
+fundamental abilities of LLMs.
+
+
+
+
+ + ♻ ☆ LLM-Rec: Personalized Recommendation via Prompting Large Language Models + + +
+ We investigate various prompting strategies for enhancing personalized
+recommendation performance with large language models (LLMs) through input
+augmentation. Our proposed approach, termed LLM-Rec, encompasses four distinct
+prompting strategies: (1) basic prompting, (2) recommendation-driven prompting,
+(3) engagement-guided prompting, and (4) recommendation-driven +
+engagement-guided prompting. Our empirical experiments show that incorporating
+the augmented input text generated by the LLM leads to improved recommendation
+performance. Recommendation-driven and engagement-guided prompting strategies
+are found to elicit the LLM's understanding of global and local item
+characteristics. This finding highlights the importance of leveraging diverse
+prompts and input augmentation techniques to enhance the recommendation
+capabilities of LLMs.
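+
+ Hypothetical prompt templates for the four strategies (the paper's exact
+wording differs; this only illustrates how the variants relate to each other):
+
+   def build_prompts(item_description, engaged_items):
+       neighbours = "; ".join(engaged_items)
+       rec = "Describe this item for recommendation purposes: "
+       eng = f"Users who liked this item also engaged with: {neighbours}. "
+       return {
+           "basic": f"Describe this item: {item_description}",
+           "recommendation_driven": rec + item_description,
+           "engagement_guided": eng + f"Summarize this item: {item_description}",
+           "rec_plus_engagement": eng + rec + item_description,
+       }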
+
+
+
+
+ + ♻ ☆ Allophant: Cross-lingual Phoneme Recognition with Articulatory + Attributes INTERSPEECH 2023 + + +
+ This paper proposes Allophant, a multilingual phoneme recognizer. It requires +only a phoneme inventory for cross-lingual transfer to a target language, +allowing for low-resource recognition. The architecture combines a +compositional phone embedding approach with individually supervised phonetic +attribute classifiers in a multi-task architecture. We also introduce +Allophoible, an extension of the PHOIBLE database. When combined with a +distance based mapping approach for grapheme-to-phoneme outputs, it allows us +to train on PHOIBLE inventories directly. By training and evaluating on 34 +languages, we found that the addition of multi-task learning improves the +model's capability of being applied to unseen phonemes and phoneme inventories. +On supervised languages we achieve phoneme error rate improvements of 11 +percentage points (pp.) compared to a baseline without multi-task learning. +Evaluation of zero-shot transfer on 84 languages yielded a decrease in PER of +2.63 pp. over the baseline. + +
+
+ comment: 5 pages, 2 figures, 2 tables, accepted to INTERSPEECH 2023; published + version +
+
+
+
+
+ + ♻ ☆ Black Box Few-Shot Adaptation for Vision-Language models ICCV 2023 + + +
+ Vision-Language (V-L) models trained with contrastive learning to align the
+visual and language modalities have been shown to be strong few-shot learners.
+Soft prompt learning is the method of choice for few-shot downstream
+adaptation, aiming to bridge the modality gap caused by the distribution shift
+induced by the new domain. While parameter-efficient, prompt learning still
+requires access to the model weights and can be computationally infeasible for
+large models with billions of parameters. To address these shortcomings, in
+this work, we describe a black-box method for V-L few-shot adaptation that (a)
+operates on pre-computed image and text features and hence works without access
+to the model's weights, (b) is orders of magnitude faster at training time, (c)
+is amenable to both supervised and unsupervised training, and (d) can even be
+used to align image and text features computed from uni-modal models. To
+achieve this, we propose Linear Feature Alignment (LFA), a simple linear
+approach for V-L re-alignment in the target domain. LFA is initialized from a
+closed-form solution to a least-squares problem and then iteratively updated by
+minimizing a re-ranking loss. Despite its simplicity, our approach can even
+surpass soft-prompt learning methods, as shown by extensive experiments on 11
+image and 2 video datasets.
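+
+ The closed-form initialization can be sketched in a few lines (dummy data;
+the subsequent re-ranking refinement and any feature normalization used in the
+paper are omitted):
+
+   import numpy as np
+
+   X = np.random.randn(200, 512)   # pre-computed image features (dummy)
+   Y = np.random.randn(200, 512)   # paired text features (dummy)
+
+   W, *_ = np.linalg.lstsq(X, Y, rcond=None)   # argmin_W ||X W - Y||_F
+   aligned_images = X @ W                      # image features re-aligned to the text space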
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ LLMatic: Neural Architecture Search via Large Language Models and + Quality-Diversity Optimization + + +
+ Large Language Models (LLMs) have emerged as powerful tools capable of +accomplishing a broad spectrum of tasks. Their abilities span numerous areas, +and one area where they have made a significant impact is in the domain of code +generation. In this context, we view LLMs as mutation and crossover tools. +Meanwhile, Quality-Diversity (QD) algorithms are known to discover diverse and +robust solutions. By merging the code-generating abilities of LLMs with the +diversity and robustness of QD solutions, we introduce LLMatic, a Neural +Architecture Search (NAS) algorithm. While LLMs struggle to conduct NAS +directly through prompts, LLMatic uses a procedural approach, leveraging QD for +prompts and network architecture to create diverse and highly performant +networks. We test LLMatic on the CIFAR-10 image classification benchmark, +demonstrating that it can produce competitive networks with just $2,000$ +searches, even without prior knowledge of the benchmark domain or exposure to +any previous top-performing models for the benchmark. + +
+
+
+
+
+ + ♻ ☆ LLM Comparative Assessment: Zero-shot NLG Evaluation through Pairwise + Comparisons using Large Language Models + + +
+ Current developments in large language models (LLMs) have enabled impressive +zero-shot capabilities across various natural language tasks. An interesting +application of these systems is in the automated assessment of natural language +generation (NLG), a highly challenging area with great practical benefit. In +this paper, we explore two options for exploiting the emergent abilities of +LLMs for zero-shot NLG assessment: absolute score prediction, and comparative +assessment which uses relative comparisons between pairs of candidates. Though +comparative assessment has not been extensively studied in NLG assessment, we +note that humans often find it more intuitive to compare two options rather +than scoring each one independently. This work examines comparative assessment +from multiple perspectives: performance compared to absolute grading; +positional biases in the prompt; and efficient ranking in terms of the number +of comparisons. We illustrate that LLM comparative assessment is a simple, +general and effective approach for NLG assessment. For moderate-sized +open-source LLMs, such as FlanT5 and Llama2-chat, comparative assessment is +superior to prompt scoring, and in many cases can achieve performance +competitive with state-of-the-art methods. Additionally, we demonstrate that +LLMs often exhibit strong positional biases when making pairwise comparisons, +and we propose debiasing methods that can further improve performance. + +
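+
+ A toy version of ranking from pairwise judgements (the `judge` callable
+stands in for an LLM comparison prompt; querying both orders of each pair
+averages out positional bias):
+
+   from itertools import permutations
+
+   def rank_by_win_rate(candidates, judge):
+       wins = {c: 0 for c in candidates}
+       counts = {c: 0 for c in candidates}
+       for a, b in permutations(candidates, 2):   # both (a, b) and (b, a)
+           winner = a if judge(a, b) else b
+           wins[winner] += 1
+           counts[a] += 1
+           counts[b] += 1
+       return sorted(candidates, key=lambda c: wins[c] / counts[c], reverse=True)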
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ LLM Cognitive Judgements Differ From Human + + +
+ Large Language Models (LLMs) have lately been in the spotlight of
+researchers, businesses, and consumers alike. While the linguistic capabilities
+of such models have been studied extensively, there is growing interest in
+investigating them as cognitive subjects. In the present work I examine GPT-3
+and ChatGPT capabilities on a limited-data inductive reasoning task from the
+cognitive science literature. The results suggest that these models' cognitive
+judgements are not human-like.
+
+ comment: 7 pages, 1 figure. License changed to CC BY-NC-SA +
+
+
+
+
+ + ♻ ☆ SpecInfer: Accelerating Generative Large Language Model Serving with + Speculative Inference and Token Tree Verification + + +
+ The high computational and memory requirements of generative large language
+models (LLMs) make it challenging to serve them quickly and cheaply. This paper
+introduces SpecInfer, an LLM serving system that accelerates generative LLM
+inference with speculative inference and token tree verification. A key insight
+behind SpecInfer is to combine various collectively boost-tuned small language
+models to jointly predict the LLM's outputs; the predictions are organized as a
+token tree, whose nodes each represent a candidate token sequence. The
+correctness of all candidate token sequences represented by a token tree is
+verified against the LLM in parallel using a novel tree-based parallel decoding
+mechanism. SpecInfer uses an LLM as a token tree verifier instead of an
+incremental decoder, which significantly reduces the end-to-end latency and
+computational requirement for serving generative LLMs while provably preserving
+model quality. Our evaluation shows that SpecInfer outperforms existing LLM
+serving systems by 1.3-2.4x for distributed LLM inference and by 2.6-3.5x for
+offloading-based LLM inference, while preserving the same generative
+performance. SpecInfer is publicly available at
+https://github.com/flexflow/FlexFlow/tree/inference.
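+
+ A heavily simplified, sequential illustration of the verification idea (the
+real system scores the whole token tree in one parallel pass; `llm_next_token`
+is a stand-in for the verifier LLM under greedy decoding):
+
+   def verify(prefix, tree, llm_next_token):
+       # tree: nested dict {token: subtree} of speculated continuations
+       accepted, node = [], tree
+       while node:
+           target = llm_next_token(prefix + accepted)
+           if target in node:            # speculation matches the verifier
+               accepted.append(target)
+               node = node[target]
+           else:                         # mismatch: keep verifier's token, stop
+               accepted.append(target)
+               break
+       return accepted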
+
+
+
+
+ + ♻ ☆ T-SciQ: Teaching Multimodal Chain-of-Thought Reasoning via Large + Language Model Signals for Science Question Answering + + +
+ Large Language Models (LLMs) have recently demonstrated exceptional
+performance in various Natural Language Processing (NLP) tasks. They have also
+shown the ability to perform chain-of-thought (CoT) reasoning to solve complex
+problems. Recent studies have explored CoT reasoning in complex multimodal
+scenarios, such as the science question answering task, by fine-tuning
+multimodal models with high-quality human-annotated CoT rationales. However,
+collecting high-quality CoT rationales is usually time-consuming and costly.
+Moreover, the annotated rationales are often inaccurate because essential
+external information is missing. To address these issues, we propose a novel
+method termed \emph{T-SciQ} that aims at teaching science question answering
+with LLM signals. The T-SciQ approach generates high-quality CoT rationales as
+teaching signals and uses them to train much smaller models to perform CoT
+reasoning in complex modalities. Additionally, we introduce a novel data mixing
+strategy to produce more effective teaching data samples via a policy for
+simple and complex science question answering problems. Extensive experimental
+results show that our T-SciQ method achieves a new state-of-the-art performance
+on the ScienceQA benchmark, with an accuracy of 96.18\%. Moreover, our approach
+outperforms the most powerful fine-tuned baseline by 4.5\%.
+
+
+
+
+ + ♻ ☆ Text-only domain adaptation for end-to-end ASR using integrated + text-to-mel-spectrogram generator INTERSPEECH 2023 + + +
+ We propose an end-to-end Automatic Speech Recognition (ASR) system that can
+be trained on transcribed speech data, text-only data, or a mixture of both.
+The proposed model uses an integrated auxiliary block for text-based training.
+This block combines a non-autoregressive multi-speaker text-to-mel-spectrogram
+generator with a GAN-based enhancer to improve the spectrogram quality. The
+proposed system can generate a mel-spectrogram dynamically during training. It
+can be used to adapt the ASR model to a new domain by using text-only data from
+this domain. We demonstrate that the proposed training method significantly
+improves ASR accuracy compared to the system trained on transcribed speech
+only. It also surpasses cascaded TTS systems with a vocoder in both adaptation
+quality and training speed.
+
+ comment: Accepted to INTERSPEECH 2023 +
+
+
+
+
+ + ♻ ☆ An interpretability framework for Similar case matching + + +
+ Similar Case Matching (SCM) plays a pivotal role in the legal system by +facilitating the efficient identification of similar cases for legal +professionals. While previous research has primarily concentrated on enhancing +the performance of SCM models, the aspect of interpretability has been +neglected. To bridge the gap, this study proposes an integrated pipeline +framework for interpretable SCM. The framework comprises four modules: judicial +feature sentence identification, case matching, feature sentence alignment, and +conflict resolution. In contrast to current SCM methods, our framework first +extracts feature sentences within a legal case that contain essential +information. Then it conducts case matching based on these extracted features. +Subsequently, our framework aligns the corresponding sentences in two legal +cases to provide evidence of similarity. In instances where the results of case +matching and feature sentence alignment exhibit conflicts, the conflict +resolution module resolves these inconsistencies. The experimental results show +the effectiveness of our proposed framework, establishing a new benchmark for +interpretable SCM. + +
+
+
+
+
+ + ♻ ☆ Editing Language Model-based Knowledge Graph Embeddings + + +
+ Recent decades have witnessed the empirical success of framing Knowledge
+Graph (KG) embeddings via language models. However, language model-based KG
+embeddings are usually deployed as static artifacts, making them difficult to
+modify after deployment without re-training. To address this issue, in this
+paper we propose a new task of editing language model-based KG embeddings. This
+task is designed to facilitate rapid, data-efficient updates to KG embeddings
+without compromising the performance of other aspects. We build four new
+datasets: E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and evaluate several
+knowledge editing baselines, demonstrating the limited ability of previous
+models to handle the proposed challenging task. We further propose a simple yet
+strong baseline dubbed KGEditor, which utilizes additional parametric layers of
+a hypernetwork to edit/add facts. Our comprehensive experimental results reveal
+that KGEditor excels in updating specific facts without impacting the overall
+performance, even when faced with limited training resources. Code and datasets
+are available at https://github.com/zjunlp/PromptKG/tree/main/deltaKG.
+
+ comment: Work in progress and the project website is + https://zjunlp.github.io/project/KGE_Editing/ +
+
+
+
+
+ + ♻ ☆ Approximate Nearest Neighbour Phrase Mining for Contextual Speech + Recognition + + +
+ This paper presents an extension to train end-to-end Context-Aware
+Transformer Transducer (CATT) models by using a simple yet efficient method of
+mining hard negative phrases from the latent space of the context encoder.
+During training, given a reference query, we mine a number of similar phrases
+using approximate nearest neighbour search. These sampled phrases are then used
+as negative examples in the context list alongside random and ground truth
+contextual information. By including approximate nearest neighbour phrases
+(ANN-P) in the context list, we encourage the learned representation to
+disambiguate between similar, but not identical, biasing phrases. This improves
+biasing accuracy when there are several similar phrases in the biasing
+inventory. We carry out experiments in a large-scale data regime, obtaining up
+to 7% relative word error rate reductions for the contextual portion of test
+data. We also extend and evaluate the CATT approach in streaming applications.
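+
+ A rough sketch of the mining step (sklearn's exact nearest-neighbour search
+is used here for brevity; the paper mines in the CATT context encoder's latent
+space with approximate search):
+
+   import numpy as np
+   from sklearn.neighbors import NearestNeighbors
+
+   phrase_embs = np.random.randn(10000, 256)  # biasing-phrase embeddings (dummy)
+   query_emb = phrase_embs[0:1]               # reference phrase
+
+   nn = NearestNeighbors(n_neighbors=6, metric="cosine").fit(phrase_embs)
+   _, idx = nn.kneighbors(query_emb)
+   hard_negatives = idx[0][1:]                # nearest phrases, excluding the query itself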
+
+ comment: Accepted to Interspeech 2023. 5 pages, 2 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Analyzing the Limits of Self-Supervision in Handling Bias in Language EMNLP + + +
+ Prompting inputs with natural language task descriptions has emerged as a +popular mechanism to elicit reasonably accurate outputs from large-scale +generative language models with little to no in-context supervision. This also +helps gain insight into how well language models capture the semantics of a +wide range of downstream tasks purely from self-supervised pre-training on +massive corpora of unlabeled text. Such models have naturally also been exposed +to a lot of undesirable content like racist and sexist language and there is +limited work on awareness of models along these dimensions. In this paper, we +define and comprehensively evaluate how well such language models capture the +semantics of four tasks for bias: diagnosis, identification, extraction and +rephrasing. We define three broad classes of task descriptions for these tasks: +statement, question, and completion, with numerous lexical variants within each +class. We study the efficacy of prompting for each task using these classes and +the null task description across several decoding methods and few-shot +examples. Our analyses indicate that language models are capable of performing +these tasks to widely varying degrees across different bias dimensions, such as +gender and political affiliation. We believe our work is an important step +towards unbiased language models by quantifying the limits of current +self-supervision objectives at accomplishing such sociologically challenging +tasks. + +
+
+ comment: Accepted at Findings of the Conference on Empirical Methods in + Natural Language Processing (EMNLP) 2022 +
+
+
+
+
+ + ♻ ☆ Rethinking the Role of Scale for In-Context Learning: An + Interpretability-based Case Study at 66 Billion Scale ACL + + +
+ Language models have been shown to perform better with an increase in scale
+on a wide variety of tasks via the in-context learning paradigm. In this paper,
+we investigate the hypothesis that the ability of a large language model to
+perform a task via in-context learning is not uniformly spread across all of
+its underlying components. Using a 66 billion parameter language model
+(OPT-66B) across a diverse set of 14 downstream tasks, we find this is indeed
+the case: $\sim$70% of attention heads and $\sim$20% of feed-forward networks
+can be removed with minimal decline in task performance. We find substantial
+overlap in the set of attention heads (un)important for in-context learning
+across tasks and number of in-context examples. We also address our hypothesis
+through a task-agnostic lens, finding that a small set of attention heads in
+OPT-66B score highly on their ability to perform primitive induction operations
+associated with in-context learning, namely, prefix matching and copying. These
+induction heads overlap with task-specific important heads, reinforcing
+arguments by Olsson et al. (arXiv:2209.11895) regarding induction head
+generality to more sophisticated behaviors associated with in-context learning.
+Overall, our study provides several insights that indicate large language
+models may be under-trained for in-context learning and opens up questions on
+how to pre-train language models to more effectively perform in-context
+learning.
+
+ comment: Accepted at Annual Meeting of the Association for Computational + Linguistics (ACL) 2023, Main Proceedings +
+
+
+
+
+ + ♻ ☆ EnrichEvent: Enriching Social Data with Contextual Information for + Emerging Event Extraction + + +
+ Social platforms have emerged as crucial channels for disseminating
+information and discussing real-life social events, offering researchers an
+excellent opportunity to design and implement novel event detection frameworks.
+However, most existing approaches merely exploit keyword burstiness or network
+structures to detect unspecified events. Thus, they often fail to identify
+unspecified events, given the challenging nature of events and social data.
+Social data, e.g., tweets, is characterized by misspellings, incompleteness,
+word-sense ambiguity, and irregular language, as well as variation in aspects
+of opinions. Moreover, extracting discriminative features and patterns for
+evolving events by exploiting the limited structural knowledge is almost
+infeasible. To address these challenges, in this thesis, we propose a novel
+framework, namely EnrichEvent, that leverages the lexical and contextual
+representations of streaming social data. In particular, we leverage contextual
+knowledge, as well as lexical knowledge, to detect semantically related tweets
+and enhance the effectiveness of the event detection approaches. Eventually,
+our proposed framework produces cluster chains for each event to show the
+evolving variation of the event through time. We conducted extensive
+experiments to evaluate our framework, validating its high performance and
+effectiveness in detecting and distinguishing unspecified social events.
+
+
+
+
+ + ♻ ☆ Towards the extraction of robust sign embeddings for low resource sign + language recognition + + +
+ Isolated Sign Language Recognition (SLR) has mostly been applied on datasets +containing signs executed slowly and clearly by a limited group of signers. In +real-world scenarios, however, we are met with challenging visual conditions, +coarticulated signing, small datasets, and the need for signer independent +models. To tackle this difficult problem, we require a robust feature extractor +to process the sign language videos. One could expect human pose estimators to +be ideal candidates. However, due to a domain mismatch with their training sets +and challenging poses in sign language, they lack robustness on sign language +data and image-based models often still outperform keypoint-based models. +Furthermore, whereas the common practice of transfer learning with image-based +models yields even higher accuracy, keypoint-based models are typically trained +from scratch on every SLR dataset. These factors limit their usefulness for +SLR. From the existing literature, it is also not clear which, if any, pose +estimator performs best for SLR. We compare the three most popular pose +estimators for SLR: OpenPose, MMPose and MediaPipe. We show that through +keypoint normalization, missing keypoint imputation, and learning a pose +embedding, we can obtain significantly better results and enable transfer +learning. We show that keypoint-based embeddings contain cross-lingual +features: they can transfer between sign languages and achieve competitive +performance even when fine-tuning only the classifier layer of an SLR model on +a target sign language. We furthermore achieve better performance using +fine-tuned transferred embeddings than models trained only on the target sign +language. The embeddings can also be learned in a multilingual fashion. The +application of these embeddings could prove particularly useful for low +resource sign languages in the future. + +
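+
+ An illustrative version of the keypoint pre-processing (the paper's exact
+normalization and imputation choices differ; the joint indices here are
+assumptions):
+
+   import numpy as np
+
+   def preprocess(frames, ref=0, l_sh=5, r_sh=6):
+       # frames: (T, K, 2) keypoints, NaN where a detection is missing
+       out = frames.copy()
+       for t in range(len(out)):
+           if t > 0:                                  # impute from the previous frame
+               missing = np.isnan(out[t]).any(axis=1)
+               out[t][missing] = out[t - 1][missing]
+           scale = np.linalg.norm(out[t][l_sh] - out[t][r_sh]) + 1e-6
+           out[t] = (out[t] - out[t][ref]) / scale    # centre and scale the pose
+       return out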
+
+
+
+
+ + ♻ ☆ Towards Automatic Boundary Detection for Human-AI Collaborative Hybrid + Essay in Education + + +
+ The recent large language models (LLMs), e.g., ChatGPT, have been able to +generate human-like and fluent responses when provided with specific +instructions. While admitting the convenience brought by technological +advancement, educators also have concerns that students might leverage LLMs to +complete their writing assignments and pass them off as their original work. +Although many AI content detection studies have been conducted as a result of +such concerns, most of these prior studies modeled AI content detection as a +classification problem, assuming that a text is either entirely human-written +or entirely AI-generated. In this study, we investigated AI content detection +in a rarely explored yet realistic setting where the text to be detected is +collaboratively written by human and generative LLMs (i.e., hybrid text). We +first formalized the detection task as identifying the transition points +between human-written content and AI-generated content from a given hybrid text +(boundary detection). Then we proposed a two-step approach where we (1) +separated AI-generated content from human-written content during the encoder +training process; and (2) calculated the distances between every two adjacent +prototypes and assumed that the boundaries exist between the two adjacent +prototypes that have the furthest distance from each other. Through extensive +experiments, we observed the following main findings: (1) the proposed approach +consistently outperformed the baseline methods across different experiment +settings; (2) the encoder training process can significantly boost the +performance of the proposed approach; (3) when detecting boundaries for +single-boundary hybrid essays, the proposed approach could be enhanced by +adopting a relatively large prototype size, leading to a 22% improvement in the +In-Domain evaluation and an 18% improvement in the Out-of-Domain evaluation. + +
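+
+ The boundary rule in step (2) can be illustrated in a few lines (prototype
+computation is stubbed out; `sentence_embs` stands in for the per-sentence
+prototypes produced by the trained encoder):
+
+   import numpy as np
+
+   def find_boundary(sentence_embs):
+       # sentence_embs: (N, D) one prototype per sentence, in document order
+       gaps = np.linalg.norm(np.diff(sentence_embs, axis=0), axis=1)
+       i = int(np.argmax(gaps))
+       return i, i + 1   # boundary assumed between sentences i and i+1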
+
+ comment: 9 pages including references, 2 figures +
+
+
+
+
+ + ♻ ☆ CHAMPAGNE: Learning Real-world Conversation from Large-Scale Web Videos ICCV 2023 + + +
+ Visual information is central to conversation: body gestures and physical +behaviour, for example, contribute to meaning that transcends words alone. To +date, however, most neural conversational models are limited to just text. We +introduce CHAMPAGNE, a generative model of conversations that can account for +visual contexts. To train CHAMPAGNE, we collect and release YTD-18M, a +large-scale corpus of 18M video-based dialogues. YTD-18M is constructed from +web videos: crucial to our data collection pipeline is a pretrained language +model that converts error-prone automatic transcripts to a cleaner dialogue +format while maintaining meaning. Human evaluation reveals that YTD-18M is more +sensible and specific than prior resources (MMDialog, 1M dialogues), while +maintaining visual-groundedness. Experiments demonstrate that 1) CHAMPAGNE +learns to conduct conversation from YTD-18M; and 2) when fine-tuned, it +achieves state-of-the-art results on four vision-language tasks focused on +real-world conversations. We release data, models, and code. + +
+
+ comment: ICCV 2023, Project page: https://seungjuhan.me/champagne +
+
+
+
+
+ + ♻ ☆ SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and + Effective Hotword Customization Ability + + +
+ Hotword customization is one of the important issues remaining in the ASR
+field: it is valuable to enable users of ASR systems to customize the names of
+entities, persons, and other phrases. The past few years have seen both
+implicit and explicit modeling strategies for ASR contextualization developed.
+While these approaches have performed adequately, they still exhibit certain
+shortcomings such as instability in effectiveness. In this paper we propose
+Semantic-augmented Contextual-Paraformer (SeACo-Paraformer), a novel NAR-based
+ASR system with flexible and effective hotword customization ability. It
+combines the accuracy of the AED-based model, the efficiency of the NAR model,
+and the excellent performance in contextualization. In experiments on 50,000
+hours of industrial big data, our proposed model outperforms strong baselines
+in customization and in general ASR tasks. Besides, we explore an efficient way
+to filter large-scale incoming hotwords for further improvement. The source
+code and the industrial models proposed and compared are all open-sourced,
+along with two hotword test sets.
+
+ comment: updated draft +
+
+
+
+
+ + ♻ ☆ Mental-LLM: Leveraging Large Language Models for Mental Health + Prediction via Online Text Data + + +
+ Advances in large language models (LLMs) have empowered a variety of +applications. However, there is still a significant gap in research when it +comes to understanding and enhancing the capabilities of LLMs in the field of +mental health. In this work, we present the first comprehensive evaluation of +multiple LLMs, including Alpaca, Alpaca-LoRA, FLAN-T5, GPT-3.5, and GPT-4, on +various mental health prediction tasks via online text data. We conduct a broad +range of experiments, covering zero-shot prompting, few-shot prompting, and +instruction fine-tuning. The results indicate a promising yet limited +performance of LLMs with zero-shot and few-shot prompt designs for the mental +health tasks. More importantly, our experiments show that instruction +finetuning can significantly boost the performance of LLMs for all tasks +simultaneously. Our best-finetuned models, Mental-Alpaca and Mental-FLAN-T5, +outperform the best prompt design of GPT-3.5 (25 and 15 times bigger) by 10.9% +on balanced accuracy and the best of GPT-4 (250 and 150 times bigger) by 4.8%. +They further perform on par with the state-of-the-art task-specific language +model. We also conduct an exploratory case study on LLMs' capability on the +mental health reasoning tasks, illustrating the promising capability of certain +models such as GPT-4. We summarize our findings into a set of action guidelines +for potential methods to enhance LLMs' capability for mental health tasks. +Meanwhile, we also emphasize the important limitations before achieving +deployability in real-world mental health settings, such as known racial and +gender bias. We highlight the important ethical risks accompanying this line of +research. + +
+
+
+
+
+ + ♻ ☆ SPM: Structured Pretraining and Matching Architectures for Relevance + Modeling in Meituan Search CIKM '23 + + +
+ In e-commerce search, relevance between query and documents is an essential
+requirement for a satisfying user experience. Different from traditional
+e-commerce platforms that offer products, users search on life service
+platforms such as Meituan mainly for product providers, which usually have
+abundant structured information, e.g. name, address, category, and thousands of
+products. Modeling search relevance with these rich structured contents is
+challenging due to the following issues: (1) there is a language distribution
+discrepancy among different fields of a structured document, making it
+difficult to directly adopt off-the-shelf pretrained language model based
+methods like BERT; (2) different fields usually have different importance, and
+their lengths vary greatly, making it difficult to extract document information
+helpful for relevance matching.
+ To tackle these issues, in this paper we propose a novel two-stage
+pretraining and matching architecture for relevance matching with rich
+structured documents. At the pretraining stage, we propose an effective
+pretraining method that employs both the query and multiple fields of the
+document as inputs, including an effective information compression method for
+lengthy fields. At the relevance matching stage, a novel matching method is
+proposed by leveraging domain knowledge in the search query to generate more
+effective document representations for relevance scoring. Extensive offline
+experiments and online A/B tests on millions of users verify that the proposed
+architectures effectively improve the performance of relevance modeling. The
+model has already been deployed online, serving the search traffic of Meituan
+for over a year.
+
+ comment: Accepted by CIKM '23 +
+
+
+
+
+ + ♻ ☆ Prompting the Hidden Talent of Web-Scale Speech Models for Zero-Shot + Task Generalization + + +
+ We investigate the emergent abilities of the recently proposed web-scale +speech model Whisper, by adapting it to unseen tasks with prompt engineering. +We selected three tasks: audio-visual speech recognition (AVSR), code-switched +speech recognition (CS-ASR), and speech translation (ST) on unseen language +pairs. We design task-specific prompts, by either leveraging another +large-scale model, or simply manipulating the special tokens in the default +prompts. Experiments show that compared to the default prompts, our proposed +prompts improve performance by 10% to 45% on the three zero-shot tasks, and +even outperform SotA supervised models on some datasets. In addition, our +experiments reveal many interesting properties of Whisper, including its +robustness to prompts, bias on accents, and the multilingual understanding in +its latent space. Code is available at +https://github.com/jasonppy/PromptingWhisper + +
+
+ comment: Interspeech 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 111 + +
+
+
+ + ☆ TeCH: Text-guided Reconstruction of Lifelike Clothed Humans + + +
+ Despite recent research advancements in reconstructing clothed humans from a +single image, accurately restoring the "unseen regions" with high-level details +remains an unsolved challenge that lacks attention. Existing methods often +generate overly smooth back-side surfaces with a blurry texture. But how to +effectively capture all visual attributes of an individual from a single image, +which are sufficient to reconstruct unseen areas (e.g., the back view)? +Motivated by the power of foundation models, TeCH reconstructs the 3D human by +leveraging 1) descriptive text prompts (e.g., garments, colors, hairstyles) +which are automatically generated via a garment parsing model and Visual +Question Answering (VQA), 2) a personalized fine-tuned Text-to-Image diffusion +model (T2I) which learns the "indescribable" appearance. To represent +high-resolution 3D clothed humans at an affordable cost, we propose a hybrid 3D +representation based on DMTet, which consists of an explicit body shape grid +and an implicit distance field. Guided by the descriptive prompts + +personalized T2I diffusion model, the geometry and texture of the 3D humans are +optimized through multi-view Score Distillation Sampling (SDS) and +reconstruction losses based on the original observation. TeCH produces +high-fidelity 3D clothed humans with consistent & delicate texture, and +detailed full-body geometry. Quantitative and qualitative experiments +demonstrate that TeCH outperforms the state-of-the-art methods in terms of +reconstruction accuracy and rendering quality. The code will be publicly +available for research purposes at https://huangyangyi.github.io/tech + +
+
+ comment: Project: https://huangyangyi.github.io/tech +
+
+
+
+
+ + ☆ MeViS: A Large-scale Benchmark for Video Segmentation with Motion + Expressions ICCV 2023 + + +
+ This paper strives for motion expressions guided video segmentation, which +focuses on segmenting objects in video content based on a sentence describing +the motion of the objects. Existing referring video object datasets typically +focus on salient objects and use language expressions that contain excessive +static attributes that could potentially enable the target object to be +identified in a single frame. These datasets downplay the importance of motion +in video content for language-guided video object segmentation. To investigate +the feasibility of using motion expressions to ground and segment objects in +videos, we propose a large-scale dataset called MeViS, which contains numerous +motion expressions to indicate target objects in complex environments. We +benchmarked 5 existing referring video object segmentation (RVOS) methods and +conducted a comprehensive comparison on the MeViS dataset. The results show +that current RVOS methods cannot effectively address motion expression-guided +video segmentation. We further analyze the challenges and propose a baseline +approach for the proposed MeViS dataset. The goal of our benchmark is to +provide a platform that enables the development of effective language-guided +video segmentation algorithms that leverage motion expressions as a primary cue +for object segmentation in complex video scenes. The proposed MeViS dataset has +been released at https://henghuiding.github.io/MeViS. + +
+
+ comment: ICCV 2023, Project Page: https://henghuiding.github.io/MeViS/ +
+
+
+
+
+ + ☆ InsightMapper: A Closer Look at Inner-instance Information for + Vectorized High-Definition Mapping + + +
+ Vectorized high-definition (HD) maps contain detailed information about +surrounding road elements, which are crucial for various downstream tasks in +modern autonomous driving vehicles, such as vehicle planning and control. +Recent works have attempted to directly detect the vectorized HD map as a point +set prediction task, resulting in significant improvements in detection +performance. However, these approaches fail to analyze and exploit the +inner-instance correlations between predicted points, impeding further +advancements. To address these challenges, we investigate the utilization of +inner-$\textbf{INS}$tance information for vectorized h$\textbf{IGH}$-definition +mapping through $\textbf{T}$ransformers and introduce InsightMapper. This paper +presents three novel designs within InsightMapper that leverage inner-instance +information in distinct ways, including hybrid query generation, inner-instance +query fusion, and inner-instance feature aggregation. Comparative experiments +are conducted on the NuScenes dataset, showcasing the superiority of our +proposed method. InsightMapper surpasses previous state-of-the-art (SOTA) +methods by 5.78 mAP and 5.12 TOPO, which assess topology correctness. +Simultaneously, InsightMapper maintains high efficiency during both training +and inference phases, resulting in remarkable comprehensive performance. The +project page for this work is available at +https://tonyxuqaq.github.io/projects/InsightMapper . + +
+
+ comment: Code and demo will be available at + https://tonyxuqaq.github.io/projects/InsightMapper +
+
+
+
+
+ + ☆ Ref-DVGO: Reflection-Aware Direct Voxel Grid Optimization for an + Improved Quality-Efficiency Trade-Off in Reflective Scene Reconstruction ICCV + + +
+ Neural Radiance Fields (NeRFs) have revolutionized the field of novel view
+synthesis, demonstrating remarkable performance. However, the modeling and
+rendering of reflective objects remain challenging problems. Recent methods
+have shown significant improvements over the baselines in handling reflective
+scenes, albeit at the expense of efficiency. In this work, we aim to strike a
+balance between efficiency and quality. To this end, we investigate an
+implicit-explicit approach based on conventional volume rendering to enhance
+the reconstruction quality and accelerate the training and rendering processes.
+We adopt an efficient density-based grid representation and reparameterize the
+reflected radiance in our pipeline. Our proposed reflection-aware approach
+achieves a competitive quality-efficiency trade-off compared to competing
+methods. Based on our experimental results, we propose and discuss hypotheses
+regarding the factors influencing the results of density-based methods for
+reconstructing reflective objects. The source code is available at:
+https://github.com/gkouros/ref-dvgo
+
+ comment: 5 pages, 4 figures, 3 tables, ICCV TRICKY 2023 Workshop +
+
+
+
+
+ + ☆ Diagnosing Human-object Interaction Detectors + + +
+ Although we have witnessed significant progress in human-object interaction +(HOI) detection with increasingly high mAP (mean Average Precision), a single +mAP score is too concise to obtain an informative summary of a model's +performance and to understand why one approach is better than another. In this +paper, we introduce a diagnosis toolbox for analyzing the error sources of the +existing HOI detection models. We first conduct holistic investigations in the +pipeline of HOI detection, consisting of human-object pair detection and then +interaction classification. We define a set of errors and the oracles to fix +each of them. By measuring the mAP improvement obtained from fixing an error +using its oracle, we can have a detailed analysis of the significance of +different errors. We then delve into the human-object detection and interaction +classification, respectively, and check the model's behavior. For the first +detection task, we investigate both recall and precision, measuring the +coverage of ground-truth human-object pairs as well as the noisiness level in +the detections. For the second classification task, we compute mAP for +interaction classification only, without considering the detection scores. We +also measure the performance of the models in differentiating human-object +pairs with and without actual interactions using the AP (Average Precision) +score. Our toolbox is applicable for different methods across different +datasets and available at https://github.com/neu-vi/Diag-HOI. + +
+
+
+
+
+ + ☆ Likelihood-Based Text-to-Image Evaluation with Patch-Level Perceptual + and Semantic Credit Assignment + + +
+ Text-to-image synthesis has made encouraging progress and attracted lots of
+public attention recently. However, popular evaluation metrics in this area,
+like the Inception Score and Fréchet Inception Distance, incur several issues.
+First of all, they cannot explicitly assess the perceptual quality of generated
+images and poorly reflect the semantic alignment of each text-image pair. Also,
+they are inefficient and need to sample thousands of images to stabilise their
+evaluation results. In this paper, we propose to evaluate text-to-image
+generation performance by directly estimating the likelihood of the generated
+images using a pre-trained likelihood-based text-to-image generative model,
+i.e., a higher likelihood indicates better perceptual quality and better
+text-image alignment. To prevent the likelihood from being dominated by the
+non-crucial parts of the generated image, we propose several new designs to
+develop a credit assignment strategy based on the semantic and perceptual
+significance of the image patches. In the experiments, we evaluate the proposed
+metric on multiple popular text-to-image generation models and datasets in
+assessing both the perceptual quality and the text-image alignment. Moreover,
+it can successfully assess the generation ability of these models with as few
+as a hundred samples, making it very efficient in practice.
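+
+ Schematically, the credit-assignment idea amounts to a weighted average of
+per-patch log-likelihoods (both inputs below are stand-ins for what the
+pre-trained model and the significance estimator would produce):
+
+   import numpy as np
+
+   def weighted_image_score(patch_loglik, patch_weights):
+       # patch_loglik, patch_weights: (num_patches,) arrays
+       w = patch_weights / patch_weights.sum()
+       return float((w * patch_loglik).sum())   # higher = better quality/alignment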
+
+
+
+
+ + ☆ Painter: Teaching Auto-regressive Language Models to Draw Sketches + + +
+ Large language models (LLMs) have made tremendous progress in natural +language understanding and they have also been successfully adopted in other +domains such as computer vision, robotics, reinforcement learning, etc. In this +work, we apply LLMs to image generation tasks by directly generating the +virtual brush strokes to paint an image. We present Painter, an LLM that can +convert user prompts in text description format to sketches by generating the +corresponding brush strokes in an auto-regressive way. We construct Painter +based on off-the-shelf LLM that is pre-trained on a large text corpus, by +fine-tuning it on the new task while preserving language understanding +capabilities. We create a dataset of diverse multi-object sketches paired with +textual prompts that covers several object types and tasks. Painter can +generate sketches from text descriptions, remove objects from canvas, and +detect and classify objects in sketches. Although this is an unprecedented +pioneering work in using LLMs for auto-regressive image generation, the results +are very encouraging. + +
+
+
+
+
+ + ☆ Exploiting Point-Wise Attention in 6D Object Pose Estimation Based on + Bidirectional Prediction + + +
+ Traditional geometric-registration-based estimation methods only exploit the
+CAD model implicitly, which makes them dependent on observation quality and
+vulnerable to occlusion. To address this problem, the paper proposes a
+bidirectional correspondence prediction network with a point-wise
+attention-aware mechanism. This network not only requires the model points to
+predict the correspondence but also explicitly models the geometric
+similarities between observations and the model prior. Our key insight is that
+the correlations between each model point and scene point provide essential
+information for learning point-pair matches. To further tackle the correlation
+noise brought by feature distribution divergence, we design a simple but
+effective pseudo-Siamese network to improve feature homogeneity. Experimental
+results on the public datasets of LineMOD, YCB-Video, and Occ-LineMOD show that
+the proposed method achieves better performance than other state-of-the-art
+methods under the same evaluation criteria. Its robustness in estimating poses
+is greatly improved, especially in environments with severe occlusions.
+
+
+
+
+ + ☆ Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse + Problems + + +
+ Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial +technologies in the field of medical imaging. Score-based models have proven to +be effective in addressing different inverse problems encountered in CT and +MRI, such as sparse-view CT and fast MRI reconstruction. However, these models +face challenges in achieving accurate three dimensional (3D) volumetric +reconstruction. The existing score-based models primarily focus on +reconstructing two dimensional (2D) data distribution, leading to +inconsistencies between adjacent slices in the reconstructed 3D volumetric +images. To overcome this limitation, we propose a novel two-and-a-half order +score-based model (TOSM). During the training phase, our TOSM learns data +distributions in 2D space, which reduces the complexity of training compared to +directly working on 3D volumes. However, in the reconstruction phase, the TOSM +updates the data distribution in 3D space, utilizing complementary scores along +three directions (sagittal, coronal, and transaxial) to achieve a more precise +reconstruction. The development of TOSM is built on robust theoretical +principles, ensuring its reliability and efficacy. Through extensive +experimentation on large-scale sparse-view CT and fast MRI datasets, our method +demonstrates remarkable advancements and attains state-of-the-art results in +solving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively +addresses the inter-slice inconsistency issue, resulting in high-quality 3D +volumetric reconstruction. + +
+
+
+
+
+ + ☆ ResBuilder: Automated Learning of Depth with Residual Structures + + +
+ In this work, we develop a neural architecture search algorithm, termed +Resbuilder, that builds ResNet architectures from scratch which achieve high +accuracy at moderate computational cost. It can also be used to modify existing +architectures and has the capability to remove and insert ResNet blocks, in +this way searching for suitable architectures in the space of ResNet +architectures. In our experiments on different image classification datasets, +Resbuilder achieves close to state-of-the-art performance while saving +computational cost compared to off-the-shelf ResNets. Notably, we tune the +parameters once on CIFAR10, which yields a suitable default choice for all other +datasets. We demonstrate that this property generalizes even to industrial +applications by applying our method with default parameters on a proprietary +fraud detection dataset. + 
+
+
+
+
+ + ☆ Self-Supervised Online Camera Calibration for Automated Driving and + Parking Applications + + +
+ Camera-based perception systems play a central role in modern autonomous +vehicles. These camera-based perception algorithms require an accurate +calibration to map real-world distances to image pixels. In practice, +calibration is a laborious procedure requiring specialised data collection and +careful tuning. This process must be repeated whenever the parameters of the +camera change, which can be a frequent occurrence in autonomous vehicles. Hence, +there is a need to calibrate at regular intervals to ensure the camera remains +accurate. We propose a deep learning framework that learns the intrinsic and +extrinsic calibration of the camera in real time. The framework is +self-supervised and doesn't require any labelling or supervision to learn the +calibration parameters. The framework learns calibration without the need for +any physical targets or for driving the car on special planar surfaces. + 
+
+
+
+
+ + ☆ DeDoDe: Detect, Don't Describe -- Describe, Don't Detect for Local + Feature Matching + + +
+ Keypoint detection is a pivotal step in 3D reconstruction, whereby sets of +(up to) K points are detected in each view of a scene. Crucially, the detected +points need to be consistent between views, i.e., correspond to the same 3D +point in the scene. One of the main challenges with keypoint detection is the +formulation of the learning objective. Previous learning-based methods +typically jointly learn descriptors with keypoints, and treat the keypoint +detection as a binary classification task on mutual nearest neighbours. +However, basing keypoint detection on descriptor nearest neighbours is a proxy +task, which is not guaranteed to produce 3D-consistent keypoints. Furthermore, +this ties the keypoints to a specific descriptor, complicating downstream +usage. In this work, we instead learn keypoints directly from 3D consistency. +To this end, we train the detector to detect tracks from large-scale SfM. As +these points are often overly sparse, we derive a semi-supervised two-view +detection objective to expand this set to a desired number of detections. To +train a descriptor, we maximize the mutual nearest neighbour objective over the +keypoints with a separate network. Results show that our approach, DeDoDe, +achieves significant gains on multiple geometry benchmarks. Code is provided at +https://github.com/Parskatt/DeDoDe . + +
+
+
+
+
+ + ☆ Classification Committee for Active Deep Object Detection + + +
+ In object detection, the cost of labeling is very high because it requires not +only confirming the categories of multiple objects in an image but also +accurately determining the bounding box of each object. Thus, integrating +active learning into object detection is of considerable practical value. +In this paper, we propose a classification-committee method for active deep object +detection by introducing a discrepancy mechanism of multiple classifiers +for sample selection when training object detectors. The model contains a +main detector and a classification committee. The main detector denotes the +target object detector trained from a labeled pool composed of the selected +informative images. The role of the classification committee is to select the +most informative images according to their uncertainty values from the view of +classification, which is expected to focus more on the discrepancy and +representativeness of instances. Specifically, the committee computes the uncertainty for a +specified instance within the image by measuring the discrepancy of the outputs of the +committee, which is pre-trained via the proposed Maximum Classifiers Discrepancy Group +Loss (MCDGL). The most informative images are finally determined by selecting +the ones with many high-uncertainty instances. Besides, to mitigate the impact +of interference instances, we design a Focus on Positive Instances Loss (FPIL) +to give the committee the ability to automatically focus on the representative +instances as well as precisely encode their discrepancies for the same +instance. Experiments are conducted on the Pascal VOC and COCO datasets with several +popular object detectors. Results show that our method outperforms the +state-of-the-art active learning methods, which verifies the effectiveness of +the proposed method. + 
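+ The selection criterion described above (per-instance committee discrepancy, then ranking images by how many high-uncertainty instances they contain) can be sketched as follows; the discrepancy measure and threshold are illustrative stand-ins, not MCDGL/FPIL:
+
+ import numpy as np
+
+ def instance_discrepancy(probs):
+     """probs: (K, C) softmax outputs of K committee members for one detected
+     instance over C classes; discrepancy = mean pairwise L1 distance."""
+     K = probs.shape[0]
+     d = [np.abs(probs[i] - probs[j]).sum() for i in range(K) for j in range(i + 1, K)]
+     return float(np.mean(d))
+
+ def image_informativeness(instance_probs, threshold=0.5):
+     """Count of high-uncertainty instances in an image; the images with the
+     largest counts would be sent to annotators next."""
+     return sum(instance_discrepancy(p) > threshold for p in instance_probs)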
+
+
+
+
+ + ☆ Hierarchical Uncertainty Estimation for Medical Image Segmentation + Networks + + +
+ Learning a medical image segmentation model is an inherently ambiguous task, +as uncertainties exist in both images (noise) and manual annotations (human +errors and bias) used for model training. To build a trustworthy image +segmentation model, it is important to not just evaluate its performance but +also estimate the uncertainty of the model prediction. Most state-of-the-art +image segmentation networks adopt a hierarchical encoder architecture, +extracting image features at multiple resolution levels from fine to coarse. In +this work, we leverage this hierarchical image representation and propose a +simple yet effective method for estimating uncertainties at multiple levels. +The multi-level uncertainties are modelled via the skip-connection module and +then sampled to generate an uncertainty map for the predicted image +segmentation. We demonstrate that a deep learning segmentation network such as +U-net, when implemented with such hierarchical uncertainty estimation module, +can achieve a high segmentation performance, while at the same time provide +meaningful uncertainty maps that can be used for out-of-distribution detection. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Learning to Distill Global Representation for Sparse-View CT ICCV 2023 + + +
+ Sparse-view computed tomography (CT) -- using a small number of projections +for tomographic reconstruction -- enables a much lower radiation dose to patients +and accelerated data acquisition. The reconstructed images, however, suffer +from strong artifacts, greatly limiting their diagnostic value. Current trends +for sparse-view CT turn to the raw data for better information recovery. The +resultant dual-domain methods, nonetheless, suffer from secondary artifacts, +especially in ultra-sparse view scenarios, and their generalization to other +scanners/protocols is greatly limited. A crucial question arises: have +image post-processing methods reached their limit? Our answer is not yet. In this +paper, we stick to image post-processing methods due to their great flexibility and +propose a global representation (GloRe) distillation framework for sparse-view +CT, termed GloReDi. First, we propose to learn GloRe with Fourier convolution, +so each element in GloRe has an image-wide receptive field. Second, unlike +methods that only use the full-view images for supervision, we propose to +distill GloRe from intermediate-view reconstructed images that are readily +available but not explored in previous literature. The success of GloRe +distillation is attributed to two key components: representation directional +distillation to align the GloRe directions, and band-pass-specific contrastive +distillation to gain clinically important details. Extensive experiments +demonstrate the superiority of the proposed GloReDi over the state-of-the-art +methods, including dual-domain ones. The source code is available at +https://github.com/longzilicart/GloReDi. + 
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ High-Fidelity Lake Extraction via Two-Stage Prompt Enhancement: + Establishing a Novel Baseline and Benchmark + + +
+ The extraction of lakes from remote sensing images is a complex challenge due +to the varied lake shapes and data noise. Current methods rely on multispectral +image datasets, making it challenging to learn lake features accurately from +pixel arrangements. This, in turn, affects model learning and the creation of +accurate segmentation masks. This paper introduces a unified prompt-based +dataset construction approach that provides approximate lake locations using +point, box, and mask prompts. We also propose a two-stage prompt enhancement +framework, LEPrompter, which involves prompt-based and prompt-free stages +during training. The prompt-based stage employs a prompt encoder to extract +prior information, integrating prompt tokens and image embeddings through self- +and cross-attention in the prompt decoder. Prompts are deactivated once the +model is trained to ensure independence during inference, enabling automated +lake extraction. Evaluations on Surface Water and Qinghai-Tibet Plateau Lake +datasets show consistent performance improvements compared to the previous +state-of-the-art method. LEPrompter achieves mIoU scores of 91.48% and 97.43% +on the respective datasets without introducing additional parameters or GFLOPs. +Supplementary materials provide the source code, pre-trained models, and +detailed user studies. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Integrating Visual and Semantic Similarity Using Hierarchies for Image + Retrieval + + +
+ Most of the research in content-based image retrieval (CBIR) focuses on +developing robust feature representations that can effectively retrieve +instances from a database of images that are visually similar to a query. +However, the retrieved images sometimes contain results that are not +semantically related to the query. To address this, we propose a method for +CBIR that captures both visual and semantic similarity using a visual +hierarchy. The hierarchy is constructed by merging classes with overlapping +features in the latent space of a deep neural network trained for +classification, assuming that overlapping classes share high visual and +semantic similarities. Finally, the constructed hierarchy is integrated into +the distance calculation metric for similarity search. Experiments on the standard +datasets CUB-200-2011 and CIFAR100, and on a real-life use case using diatom +microscopy images, show that our method achieves superior image retrieval performance +compared to existing methods. + 
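+ One plausible way to picture the integration of the hierarchy into the distance metric is to blend a visual (cosine) distance with a semantic distance derived from the merged-class tree; the toy hierarchy and the blending weight below are assumptions for illustration, not the paper's learned hierarchy:
+
+ import numpy as np
+
+ # toy child -> parent map standing in for the hierarchy built from merged classes
+ PARENT = {"sparrow": "bird", "eagle": "bird", "bird": "animal",
+           "car": "vehicle", "truck": "vehicle", "vehicle": "object", "animal": "object"}
+
+ def ancestors(c):
+     out = [c]
+     while c in PARENT:
+         c = PARENT[c]
+         out.append(c)
+     return out
+
+ def semantic_distance(c1, c2):
+     """Normalised hops to the lowest common ancestor in the toy tree."""
+     a1, a2 = ancestors(c1), ancestors(c2)
+     common = next(a for a in a1 if a in a2)
+     return (a1.index(common) + a2.index(common)) / (len(a1) + len(a2))
+
+ def retrieval_distance(query_feat, db_feat, query_cls, db_cls, alpha=0.5):
+     visual = 1.0 - np.dot(query_feat, db_feat) / (
+         np.linalg.norm(query_feat) * np.linalg.norm(db_feat) + 1e-8)
+     return alpha * visual + (1 - alpha) * semantic_distance(query_cls, db_cls)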
+
+ comment: Accepted in ICVS 2023 +
+
+
+
+
+ + ☆ ALIP: Adaptive Language-Image Pre-training with Synthetic Caption ICCV2023 + + +
+ Contrastive Language-Image Pre-training (CLIP) has significantly boosted the +performance of various vision-language tasks by scaling up the dataset with +image-text pairs collected from the web. However, the presence of intrinsic +noise and unmatched image-text pairs in web data can potentially affect the +performance of representation learning. To address this issue, we first utilize +the OFA model to generate synthetic captions that focus on the image content. +The generated captions contain complementary information that is beneficial for +pre-training. Then, we propose Adaptive Language-Image Pre-training (ALIP), +a bi-path model that integrates supervision from both the raw text and the synthetic +caption. As the core components of ALIP, the Language Consistency Gate (LCG) +and Description Consistency Gate (DCG) dynamically adjust the weights of +samples and image-text/caption pairs during the training process. Meanwhile, +the adaptive contrastive loss can effectively reduce the impact of noisy data +and enhance the efficiency of the pre-training data. We validate ALIP with +experiments on different scales of models and pre-training datasets. +Experimental results show that ALIP achieves state-of-the-art performance on +multiple downstream tasks including zero-shot image-text retrieval and linear +probe. To facilitate future research, the code and pre-trained models are +released at https://github.com/deepglint/ALIP. + 
+
+ comment: 15pages, 10figures, ICCV2023 +
+
+
+
+
+ + ☆ Tem-adapter: Adapting Image-Text Pretraining for Video Question Answer ICCV 2023 + + +
+ Video-language pre-trained models have shown remarkable success in guiding +video question-answering (VideoQA) tasks. However, due to the length of video +sequences, training large-scale video-based models incurs considerably higher +costs than training image-based ones. This motivates us to leverage the +knowledge from image-based pretraining, despite the obvious gaps between image +and video domains. To bridge these gaps, in this paper, we propose Tem-Adapter, +which enables the learning of temporal dynamics and complex semantics by a +visual Temporal Aligner and a textual Semantic Aligner. Unlike conventional +pretrained knowledge adaptation methods that only concentrate on the downstream +task objective, the Temporal Aligner introduces an extra language-guided +autoregressive task aimed at facilitating the learning of temporal +dependencies, with the objective of predicting future states based on +historical clues and language guidance that describes event progression. +Besides, to reduce the semantic gap and adapt the textual representation for +better event description, we introduce a Semantic Aligner that first designs a +template to fuse question and answer pairs as event descriptions and then +learns a Transformer decoder with the whole video sequence as guidance for +refinement. We evaluate Tem-Adapter and different pre-train transferring +methods on two VideoQA benchmarks, and the significant performance improvement +demonstrates the effectiveness of our method. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Prediction of post-radiotherapy recurrence volumes in head and neck + squamous cell carcinoma using 3D U-Net segmentation + + +
+ Locoregional recurrences (LRR) are still a frequent site of treatment failure +for head and neck squamous cell carcinoma (HNSCC) patients. + Identification of high risk subvolumes based on pretreatment imaging is key +to biologically targeted radiation therapy. We investigated the extent to which +a Convolutional neural network (CNN) is able to predict LRR volumes based on +pre-treatment 18F-fluorodeoxyglucose positron emission tomography +(FDG-PET)/computed tomography (CT) scans in HNSCC patients and thus the +potential to identify biological high risk volumes using CNNs. + For 37 patients who had undergone primary radiotherapy for oropharyngeal +squamous cell carcinoma, five oncologists contoured the relapse volumes on +recurrence CT scans. Datasets of pre-treatment FDG-PET/CT, gross tumour volume +(GTV) and contoured relapse for each of the patients were randomly divided into +training (n=23), validation (n=7) and test (n=7) datasets. We compared a CNN +trained from scratch, a pre-trained CNN, a SUVmax threshold approach, and using +the GTV directly. + The SUVmax threshold method included 5 out of the 7 relapse origin points +within a volume of median 4.6 cubic centimetres (cc). Both the GTV contour and +best CNN segmentations included the relapse origin 6 out of 7 times with median +volumes of 28 and 18 cc respectively. + The CNN included the same or greater number of relapse volume POs, with +significantly smaller relapse volumes. Our novel findings indicate that CNNs +may predict LRR, yet further work on dataset development is required to attain +clinically useful prediction accuracy. + +
+
+
+
+
+ + ☆ SIGMA: Scale-Invariant Global Sparse Shape Matching + + +
+ We propose a novel mixed-integer programming (MIP) formulation for generating +precise sparse correspondences for highly non-rigid shapes. To this end, we +introduce a projected Laplace-Beltrami operator (PLBO) which combines intrinsic +and extrinsic geometric information to measure the deformation quality induced +by predicted correspondences. We integrate the PLBO, together with an +orientation-aware regulariser, into a novel MIP formulation that can be solved +to global optimality for many practical problems. In contrast to previous +methods, our approach is provably invariant to rigid transformations and global +scaling, initialisation-free, has optimality guarantees, and scales to high +resolution meshes with (empirically observed) linear time. We show +state-of-the-art results for sparse non-rigid matching on several challenging +3D datasets, including data with inconsistent meshing, as well as applications +in mesh-to-point-cloud matching. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Robust Autonomous Vehicle Pursuit without Expert Steering Labels + + +
+ In this work, we present a learning method for lateral and longitudinal +motion control of an ego-vehicle for vehicle pursuit. The car being controlled +does not have a pre-defined route, rather it reactively adapts to follow a +target vehicle while maintaining a safety distance. To train our model, we do +not rely on steering labels recorded from an expert driver but effectively +leverage a classical controller as an offline label generation tool. In +addition, we account for the errors in the predicted control values, which can +lead to a loss of tracking and catastrophic crashes of the controlled vehicle. +To this end, we propose an effective data augmentation approach, which allows +to train a network capable of handling different views of the target vehicle. +During the pursuit, the target vehicle is firstly localized using a +Convolutional Neural Network. The network takes a single RGB image along with +cars' velocities and estimates the target vehicle's pose with respect to the +ego-vehicle. This information is then fed to a Multi-Layer Perceptron, which +regresses the control commands for the ego-vehicle, namely throttle and +steering angle. We extensively validate our approach using the CARLA simulator +on a wide range of terrains. Our method demonstrates real-time performance and +robustness to different scenarios including unseen trajectories and high route +completion. The project page containing code and multimedia can be publicly +accessed here: https://changyaozhou.github.io/Autonomous-Vehicle-Pursuit/. + +
+
+ comment: 9 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Automated Semiconductor Defect Inspection in Scanning Electron + Microscope Images: a Systematic Review + + +
+ A growing need exists for efficient and accurate methods for detecting +defects in semiconductor materials and devices. These defects can have a +detrimental impact on the efficiency of the manufacturing process, because they +cause critical failures and wafer-yield limitations. As nodes and patterns get +smaller, even high-resolution imaging techniques such as Scanning Electron +Microscopy (SEM) produce noisy images due to operating close to sensitivity +levels and due to varying physical properties of different underlayers or +resist materials. This inherent noise is one of the main challenges for defect +inspection. One promising approach is the use of machine learning algorithms, +which can be trained to accurately classify and locate defects in semiconductor +samples. Recently, convolutional neural networks have proved to be particularly +useful in this regard. This systematic review provides a comprehensive overview +of the state of automated semiconductor defect inspection on SEM images, +including the most recent innovations and developments. 38 publications were +selected on this topic, indexed in IEEE Xplore and SPIE databases. For each of +these, the application, methodology, dataset, results, limitations and future +work were summarized. A comprehensive overview and analysis of their methods is +provided. Finally, promising avenues for future work in the field of SEM-based +defect inspection are suggested. + +
+
+ comment: 16 pages, 12 figures, 3 tables +
+
+
+
+
+ + ☆ Agglomerative Transformer for Human-Object Interaction Detection ICCV'23 + + +
+ We propose an agglomerative Transformer (AGER) that enables Transformer-based +human-object interaction (HOI) detectors to flexibly exploit extra +instance-level cues in a single-stage and end-to-end manner for the first time. +AGER acquires instance tokens by dynamically clustering patch tokens and +aligning cluster centers to instances with textual guidance, thus enjoying two +benefits: 1) Integrality: each instance token is encouraged to contain all +discriminative feature regions of an instance, which demonstrates a significant +improvement in the extraction of different instance-level cues and subsequently +leads to a new state-of-the-art performance of HOI detection with 36.75 mAP on +HICO-Det. 2) Efficiency: the dynamical clustering mechanism allows AGER to +generate instance tokens jointly with the feature learning of the Transformer +encoder, eliminating the need of an additional object detector or instance +decoder in prior methods, thus allowing the extraction of desirable extra cues +for HOI detection in a single-stage and end-to-end pipeline. Concretely, AGER +reduces GFLOPs by 8.5% and improves FPS by 36%, even compared to a vanilla +DETR-like pipeline without extra cue extraction. + +
+
+ comment: Accepted by ICCV'23 +
+
+
+
+
+ + ☆ Diff-CAPTCHA: An Image-based CAPTCHA with Security Enhanced by Denoising + Diffusion Model + + +
+ To enhance the security of text CAPTCHAs, various methods have been employed, +such as adding the interference lines on the text, randomly distorting the +characters, and overlapping multiple characters. These methods partly increase +the difficulty of automated segmentation and recognition attacks. However, +facing the rapid development of the end-to-end breaking algorithms, their +security has been greatly weakened. The diffusion model is a novel image +generation model that can generate the text images with deep fusion of +characters and background images. In this paper, an image-click CAPTCHA scheme +called Diff-CAPTCHA is proposed based on denoising diffusion models. The +background image and characters of the CAPTCHA are treated as a whole to guide +the generation process of a diffusion model, thus weakening the character +features available for machine learning, enhancing the diversity of character +features in the CAPTCHA, and increasing the difficulty of breaking algorithms. +To evaluate the security of Diff-CAPTCHA, this paper develops several attack +methods, including end-to-end attacks based on Faster R-CNN and two-stage +attacks, and Diff-CAPTCHA is compared with three baseline schemes, including +commercial CAPTCHA scheme and security-enhanced CAPTCHA scheme based on style +transfer. The experimental results show that diffusion models can effectively +enhance CAPTCHA security while maintaining good usability in human testing. + +
+
+
+
+
+ + ☆ DeepContrast: Deep Tissue Contrast Enhancement using Synthetic Data + Degradations and OOD Model Predictions + + +
+ Microscopy images are crucial for life science research, allowing detailed +inspection and characterization of cellular and tissue-level structures and +functions. However, microscopy data are unavoidably affected by image +degradations, such as noise, blur, or others. Many such degradations also +contribute to a loss of image contrast, which becomes especially pronounced in +deeper regions of thick samples. Today, best performing methods to increase the +quality of images are based on Deep Learning approaches, which typically +require ground truth (GT) data during training. Our inability to counteract +blurring and contrast loss when imaging deep into samples prevents the +acquisition of such clean GT data. The fact that the forward process of +blurring and contrast loss deep into tissue can be modeled, allowed us to +propose a new method that can circumvent the problem of unobtainable GT data. +To this end, we first synthetically degraded the quality of microscopy images +even further by using an approximate forward model for deep tissue image +degradations. Then we trained a neural network that learned the inverse of this +degradation function from our generated pairs of raw and degraded images. We +demonstrated that networks trained in this way can be used out-of-distribution +(OOD) to improve the quality of less severely degraded images, e.g. the raw +data imaged in a microscope. Since the absolute level of degradation in such +microscopy images can be stronger than the additional degradation introduced by +our forward model, we also explored the effect of iterative predictions. Here, +we observed that in each iteration the measured image contrast kept improving +while detailed structures in the images got increasingly removed. Therefore, +dependent on the desired downstream analysis, a balance between contrast +improvement and retention of image details has to be found. + +
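+ The training-data trick described above (further degrading already-degraded microscopy images with an approximate forward model, then learning the inverse mapping) can be sketched in a few lines; the blur-plus-contrast-compression model and its parameters are illustrative assumptions, not the authors' calibrated forward model:
+
+ import numpy as np
+ from scipy.ndimage import gaussian_filter
+
+ def forward_degrade(img, sigma=2.0, contrast=0.6):
+     """Approximate deep-tissue degradation: blur, then compress contrast
+     around the mean intensity."""
+     blurred = gaussian_filter(img.astype(np.float32), sigma=sigma)
+     mean = blurred.mean()
+     return mean + contrast * (blurred - mean)
+
+ def make_training_pairs(raw_images):
+     """The network learns degraded -> raw; at test time it is applied
+     out-of-distribution to the raw microscope data itself."""
+     return [(forward_degrade(x), x) for x in raw_images]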
+
+ comment: 8 pages, 7 figures, 1 table +
+
+
+
+
+ + ☆ KernelWarehouse: Towards Parameter-Efficient Dynamic Convolution + + +
+ Dynamic convolution learns a linear mixture of $n$ static kernels weighted +with their sample-dependent attentions, demonstrating superior performance +compared to normal convolution. However, existing designs are +parameter-inefficient: they increase the number of convolutional parameters by +$n$ times. This, together with the optimization difficulty, has prevented research progress in +dynamic convolution that would allow us to use a significantly larger value of $n$ +(e.g., $n>100$ instead of the typical setting $n<10$) to push forward the +performance boundary. In this paper, we propose $KernelWarehouse$, a more +general form of dynamic convolution, which can strike a favorable trade-off +between parameter efficiency and representation power. Its key idea is to +redefine the basic concepts of "$kernels$" and "$assembling$ $kernels$" in +dynamic convolution from the perspective of reducing kernel dimension and +increasing kernel number significantly. In principle, KernelWarehouse enhances +convolutional parameter dependencies within the same layer and across +successive layers via tactful kernel partition and warehouse sharing, yielding +a high degree of freedom to fit a desired parameter budget. We validate our +method on the ImageNet and MS-COCO datasets with different ConvNet architectures, +and show that it attains state-of-the-art results. For instance, the +ResNet18|ResNet50|MobileNetV2|ConvNeXt-Tiny model trained with KernelWarehouse +on ImageNet reaches 76.05%|81.05%|75.52%|82.51% top-1 accuracy. Thanks to its +flexible design, KernelWarehouse can even reduce the model size of a ConvNet +while improving the accuracy, e.g., our ResNet18 model with a 36.45%|65.10% +parameter reduction relative to the baseline shows a 2.89%|2.29% absolute improvement in +top-1 accuracy. + 
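+ For reference, the vanilla dynamic convolution being generalised here (a linear mixture of $n$ static kernels weighted by sample-dependent attention) can be sketched in PyTorch as follows; this is the baseline formulation only, not KernelWarehouse's kernel partition and warehouse sharing:
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class DynamicConv2d(nn.Module):
+     """y = conv(x, sum_i a_i(x) * W_i) with sample-dependent attention a(x)."""
+     def __init__(self, in_ch, out_ch, k=3, n_kernels=4):
+         super().__init__()
+         self.weight = nn.Parameter(torch.randn(n_kernels, out_ch, in_ch, k, k) * 0.02)
+         self.attn = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+                                   nn.Linear(in_ch, n_kernels))
+         self.pad = k // 2
+
+     def forward(self, x):
+         b, c, h, wd = x.shape
+         a = self.attn(x).softmax(dim=-1)                      # (B, n) attention over kernels
+         w = torch.einsum("bn,noihw->boihw", a, self.weight)   # per-sample mixed kernels
+         out_ch = w.shape[1]
+         x = x.reshape(1, b * c, h, wd)                        # grouped-conv trick for per-sample kernels
+         w = w.reshape(b * out_ch, c, *w.shape[-2:])
+         y = F.conv2d(x, w, padding=self.pad, groups=b)
+         return y.reshape(b, out_ch, h, wd)
+
+ layer = DynamicConv2d(16, 32, n_kernels=8)
+ print(layer(torch.randn(2, 16, 24, 24)).shape)   # torch.Size([2, 32, 24, 24])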
+
+ comment: This research work was completed and submitted in early May 2023. + Code and pre-trained models are available at + https://github.com/OSVAI/KernelWarehouse +
+
+
+
+
+ + ☆ Membrane Potential Batch Normalization for Spiking Neural Networks ICCV2023 + + +
+ As an energy-efficient alternative to conventional neural networks +(CNNs), spiking neural networks (SNNs) have gained more and more interest +recently. To train deep models, several effective batch normalization (BN) +techniques have been proposed for SNNs. All of these BNs are applied after +the convolution layer, as is usually done in CNNs. However, the spiking neuron is +much more complex due to its spatio-temporal dynamics. The regulated data flow +after the BN layer will be disturbed again by the membrane potential updating +operation before the firing function, i.e., the nonlinear activation. +Therefore, we advocate adding another BN layer before the firing function to +normalize the membrane potential again, called MPBN. To eliminate the induced +time cost of MPBN, we also propose a training-inference-decoupled +re-parameterization technique to fold the trained MPBN into the firing +threshold. With the re-parameterization technique, the MPBN will not introduce +any extra time burden at inference. Furthermore, the MPBN can also adopt +an element-wise form, while the BNs after the convolution layer can only +use the channel-wise form. Experimental results show that the proposed MPBN +performs well on both popular non-spiking static and neuromorphic datasets. Our +code is open-sourced at \href{https://github.com/yfguo91/MPBN}{MPBN}. + 
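+ The folding step can be illustrated directly from the BN formula: firing on the normalised membrane potential, gamma*(u - mu)/sigma + beta >= theta, is equivalent to firing on u against an adjusted threshold (here assuming a positive BN scale; this is a worked illustration, not the released implementation):
+
+ def fold_bn_into_threshold(theta, gamma, beta, mu, sigma):
+     """gamma*(u - mu)/sigma + beta >= theta  <=>  u >= mu + sigma*(theta - beta)/gamma
+     for gamma > 0, so the folded MPBN adds no cost at inference time."""
+     assert gamma > 0, "a sign flip of the inequality is needed for gamma < 0"
+     return mu + sigma * (theta - beta) / gamma
+
+ theta_folded = fold_bn_into_threshold(theta=1.0, gamma=0.9, beta=0.1, mu=0.4, sigma=0.5)
+ print(theta_folded)            # 0.9
+ print(1.2 >= theta_folded)     # True  -> neuron fires
+ print(0.5 >= theta_folded)     # False -> no spike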
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ GAEI-UNet: Global Attention and Elastic Interaction U-Net for Vessel + Image Segmentation + + +
+ Vessel image segmentation plays a pivotal role in medical diagnostics, aiding +in the early detection and treatment of vascular diseases. While segmentation +based on deep learning has shown promising results, effectively segmenting +small structures and maintaining connectivity between them remains challenging. +To address these limitations, we propose GAEI-UNet, a novel model that combines +global attention and elastic interaction-based techniques. GAEI-UNet leverages +global spatial and channel context information to enhance high-level semantic +understanding within the U-Net architecture, enabling precise segmentation of +small vessels. Additionally, we adopt an elastic interaction-based loss +function to improve connectivity among these fine structures. By capturing the +forces generated by misalignment between target and predicted shapes, our model +effectively learns to preserve the correct topology of vessel networks. +Evaluation on retinal vessel dataset -- DRIVE demonstrates the superior +performance of GAEI-UNet in terms of SE and connectivity of small structures, +without significantly increasing computational complexity. This research aims +to advance the field of vessel image segmentation, providing more accurate and +reliable diagnostic tools for the medical community. The implementation code is +available on Code. + +
+
+ comment: BIBM 2023 Under Review +
+
+
+
+
+ + ☆ Denoising Diffusion Probabilistic Model for Retinal Image Generation and + Segmentation + + +
+ Experts use retinal images and vessel trees to detect and diagnose various +eye, blood circulation, and brain-related diseases. However, manual +segmentation of retinal images is a time-consuming process that requires high +expertise and is difficult due to privacy issues. Many methods have been +proposed to segment images, but the need for large retinal image datasets +limits the performance of these methods. Several methods synthesize deep +learning models based on Generative Adversarial Networks (GAN) to generate +limited sample varieties. This paper proposes a novel Denoising Diffusion +Probabilistic Model (DDPM) that outperformed GANs in image synthesis. We +developed a Retinal Trees (ReTree) dataset consisting of retinal images, +corresponding vessel trees, and a segmentation network based on DDPM trained +with images from the ReTree dataset. In the first stage, we develop a two-stage +DDPM that generates vessel trees from random numbers belonging to a standard +normal distribution. Later, the model is guided to generate fundus images from +given vessel trees and random distribution. The proposed dataset has been +evaluated quantitatively and qualitatively. Quantitative evaluation metrics +include Frechet Inception Distance (FID) score, Jaccard similarity coefficient, +Cohen's kappa, Matthew's Correlation Coefficient (MCC), precision, recall, +F1-score, and accuracy. We trained the vessel segmentation model with synthetic +data to validate our dataset's efficiency and tested it on authentic data. Our +developed dataset and source code is available at +https://github.com/AAleka/retree. + +
+
+ comment: International Conference on Computational Photography 2023 (ICCP + 2023) +
+
+
+
+
+ + ☆ Improving Depth Gradient Continuity in Transformers: A Comparative Study + on Monocular Depth Estimation with CNN + + +
+ Monocular depth estimation is an ongoing challenge in computer vision. Recent +progress with Transformer models has demonstrated notable advantages over +conventional CNNs in this area. However, there's still a gap in understanding +how these models prioritize different regions in 2D images and how these +regions affect depth estimation performance. To explore the differences between +Transformers and CNNs, we employ a sparse pixel approach to contrastively +analyze the distinctions between the two. Our findings suggest that while +Transformers excel in handling global context and intricate textures, they lag +behind CNNs in preserving depth gradient continuity. To further enhance the +performance of Transformer models in monocular depth estimation, we propose the +Depth Gradient Refinement (DGR) module that refines depth estimation through +high-order differentiation, feature fusion, and recalibration. Additionally, we +leverage optimal transport theory, treating depth maps as spatial probability +distributions, and employ the optimal transport distance as a loss function to +optimize our model. Experimental results demonstrate that models integrated +with the plug-and-play Depth Gradient Refinement (DGR) module and the proposed +loss function enhance performance without increasing complexity and +computational costs. This research not only offers fresh insights into the +distinctions between Transformers and CNNs in depth estimation but also paves +the way for novel depth estimation methodologies. + +
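+ The optimal-transport loss mentioned above can be illustrated on a toy 1-D example: two depth profiles are normalised into probability distributions and compared with an entropic (Sinkhorn) OT distance; this is a didactic stand-in, not the paper's exact loss formulation:
+
+ import numpy as np
+
+ def sinkhorn_distance(p, q, cost, eps=0.05, iters=200):
+     """Entropic OT between discrete distributions p, q (each summing to 1)
+     under ground-cost matrix `cost`."""
+     K = np.exp(-cost / eps)
+     u = np.ones_like(p)
+     for _ in range(iters):
+         v = q / (K.T @ u)
+         u = p / (K @ v)
+     plan = u[:, None] * K * v[None, :]
+     return float((plan * cost).sum())
+
+ n = 32
+ pos = np.arange(n, dtype=np.float64)
+ cost = np.abs(pos[:, None] - pos[None, :])              # 1-D ground cost in pixels
+ pred = np.exp(-(pos - 10) ** 2 / 20); pred /= pred.sum()
+ gt = np.exp(-(pos - 14) ** 2 / 20); gt /= gt.sum()
+ print(sinkhorn_distance(pred, gt, cost))                 # ~4: mass must move about 4 pixels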
+
+
+
+
+ + ☆ AdaBrowse: Adaptive Video Browser for Efficient Continuous Sign Language + Recognition + + +
+ Raw videos have been shown to contain considerable feature redundancy: in +many cases only a portion of the frames is already sufficient for +accurate recognition. In this paper, we are interested in whether such +redundancy can be effectively leveraged to facilitate efficient inference in +continuous sign language recognition (CSLR). We propose a novel adaptive model +(AdaBrowse) to dynamically select the most informative subsequence from input +video sequences by modelling this problem as a sequential decision task. +Specifically, we first utilize a lightweight network to quickly scan input videos +to extract coarse features. Then these features are fed into a policy network +to intelligently select a subsequence to process. The corresponding subsequence +is finally inferred by a normal CSLR model for sentence prediction. As only a +portion of the frames is processed in this procedure, the total computation can +be considerably reduced. Besides temporal redundancy, we are also interested in +whether the inherent spatial redundancy can be seamlessly integrated +to achieve further efficiency, i.e., by dynamically selecting the lowest feasible input +resolution for each sample; this model is referred to as AdaBrowse+. Extensive +experimental results on four large-scale CSLR datasets, i.e., PHOENIX14, +PHOENIX14-T, CSL-Daily and CSL, demonstrate the effectiveness of AdaBrowse and +AdaBrowse+ by achieving accuracy comparable to state-of-the-art methods with +1.44$\times$ the throughput and 2.12$\times$ fewer FLOPs. Comparisons with other +commonly-used 2D CNNs and adaptive efficient methods verify the effectiveness +of AdaBrowse. Code is available at +\url{https://github.com/hulianyuyy/AdaBrowse}. + 
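+ The scan-then-select idea can be sketched as follows, with a trivial top-k frame selector standing in for the learned policy network (the frame-scoring function is a hypothetical proxy, not AdaBrowse's lightweight scanner):
+
+ import numpy as np
+
+ def cheap_scan(frames):
+     """Placeholder for the lightweight network: one coarse score per frame."""
+     return np.asarray([float(f.mean()) for f in frames])
+
+ def select_subsequence(frames, keep_ratio=0.5):
+     """Keep the highest-scoring frames in temporal order, so the heavy CSLR
+     model only processes a fraction of the video."""
+     scores = cheap_scan(frames)
+     k = max(1, int(len(frames) * keep_ratio))
+     keep = np.sort(np.argsort(scores)[::-1][:k])
+     return [frames[i] for i in keep]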
+
+ comment: ACMMM2023 +
+
+
+
+
+ + ☆ Visually-Aware Context Modeling for News Image Captioning + + +
+ The goal of News Image Captioning is to generate an image caption according +to the content of both a news article and an image. To leverage the visual +information effectively, it is important to exploit the connection between the +context in the articles/captions and the images. Psychological studies indicate +that human faces in images draw higher attention priorities. On top of that, +humans often play a central role in news stories, as also proven by the +face-name co-occurrence pattern we discover in existing News Image Captioning +datasets. Therefore, we design a face-naming module for faces in images and +names in captions/articles to learn a better name embedding. Apart from names, +which can be directly linked to an image area (faces), news image captions +mostly contain context information that can only be found in the article. +Humans typically address this by searching for relevant information from the +article based on the image. To emulate this thought process, we design a +retrieval strategy using CLIP to retrieve sentences that are semantically close +to the image. We conduct extensive experiments to demonstrate the efficacy of +our framework. Without using additional paired data, we establish the new +state-of-the-art performance on two News Image Captioning datasets, exceeding +the previous state-of-the-art by 5 CIDEr points. We will release code upon +acceptance. + +
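+ The CLIP-based retrieval step described above amounts to scoring each article sentence against the image and keeping the top-k; a small sketch, where `embed_image` and `embed_text` are assumed stand-ins for a CLIP image/text encoder returning 1-D feature vectors:
+
+ import numpy as np
+
+ def retrieve_context(image, article_sentences, embed_image, embed_text, k=4):
+     """Return the k article sentences most semantically similar to the image."""
+     img = embed_image(image)
+     img = img / (np.linalg.norm(img) + 1e-8)
+     txt = np.stack([embed_text(s) for s in article_sentences])
+     txt = txt / (np.linalg.norm(txt, axis=1, keepdims=True) + 1e-8)
+     scores = txt @ img                       # cosine similarities
+     top = np.argsort(scores)[::-1][:k]
+     return [article_sentences[i] for i in top]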
+
+
+
+
+ + ☆ Stable and Causal Inference for Discriminative Self-supervised Deep + Visual Representations ICCV 2023 + + +
+ In recent years, discriminative self-supervised methods have made significant +strides in advancing various visual tasks. The central idea of learning a data +encoder that is robust to data distortions/augmentations is straightforward yet +highly effective. Although many studies have demonstrated the empirical success +of various learning methods, the resulting learned representations can exhibit +instability and hinder downstream performance. In this study, we analyze +discriminative self-supervised methods from a causal perspective to explain +these unstable behaviors and propose solutions to overcome them. Our approach +draws inspiration from prior works that empirically demonstrate the ability of +discriminative self-supervised methods to demix ground truth causal sources to +some extent. Unlike previous work on causality-empowered representation +learning, we do not apply our solutions during the training process but rather +during the inference process to improve time efficiency. Through experiments on +both controlled image datasets and realistic image datasets, we show that our +proposed solutions, which involve tempering a linear transformation with +controlled synthetic data, are effective in addressing these issues. + +
+
+ comment: ICCV 2023 accepted paper +
+
+
+
+
+ + ☆ Dual-Stream Diffusion Net for Text-to-Video Generation + + +
+ With the emergence of diffusion models, text-to-video generation has recently +attracted increasing attention. However, an important bottleneck is that +generated videos often tend to carry flickers and artifacts. In this +work, we propose a dual-stream diffusion net (DSDN) to improve the consistency +of content variations in generated videos. In particular, the two designed +diffusion streams, the video content and motion branches, not only run +separately in their own spaces to produce personalized video content and variations, +but are also well aligned between the content and motion +domains through our designed cross-transformer interaction module, +which benefits the smoothness of the generated videos. Besides, we also +introduce a motion decomposer and combiner to facilitate the operation on video +motion. Qualitative and quantitative experiments demonstrate that our method +produces smooth, continuous videos with fewer flickers. + 
+
+ comment: 8pages, 7 figures +
+
+
+
+
+ + ☆ ECPC-IDS: A benchmark endometrial cancer PET/CT image dataset for + evaluation of semantic segmentation and detection of hypermetabolic regions + + +
+ Endometrial cancer is one of the most common tumors in the female +reproductive system and is the third most common cause of death among gynecological +malignancies, after ovarian and cervical cancer. Early diagnosis can +significantly improve the 5-year survival rate of patients. With the +development of artificial intelligence, computer-assisted diagnosis plays an +increasingly important role in improving the accuracy and objectivity of +diagnosis, as well as reducing the workload of doctors. However, the absence of +publicly available endometrial cancer image datasets restricts the application +of computer-assisted diagnostic techniques. In this paper, a publicly available +Endometrial Cancer PET/CT Image Dataset for Evaluation of Semantic Segmentation +and Detection of Hypermetabolic Regions (ECPC-IDS) is published. Specifically, +the segmentation section includes PET and CT images, with a total of 7159 +images in multiple formats. In order to prove the effectiveness of segmentation +methods on ECPC-IDS, five classical deep learning semantic segmentation methods +are selected to test the image segmentation task. The object detection section +also includes PET and CT images, with a total of 3579 images and XML files with +annotation information. Six deep learning methods are selected for experiments +on the detection task. This study conducts extensive experiments using deep +learning-based semantic segmentation and object detection methods to +demonstrate the differences between various methods on ECPC-IDS. As far as we +know, this is the first publicly available endometrial cancer dataset with a +large number of images from both modalities, including the information +required for segmentation and target detection. ECPC-IDS can aid researchers in +exploring new algorithms to enhance computer-assisted technology, benefiting +both clinical doctors and patients greatly. + 
+
+ comment: 14 pages,6 figures +
+
+
+
+
+ + ☆ Leveraging Next-Active Objects for Context-Aware Anticipation in + Egocentric Videos WACV'24 + + +
+ Objects are crucial for understanding human-object interactions. By +identifying the relevant objects, one can also predict potential future +interactions or actions that may occur with these objects. In this paper, we +study the problem of Short-Term Object interaction anticipation (STA) and +propose NAOGAT (Next-Active-Object Guided Anticipation Transformer), a +multi-modal end-to-end transformer network, that attends to objects in observed +frames in order to anticipate the next-active-object (NAO) and, eventually, to +guide the model to predict context-aware future actions. The task is +challenging since it requires anticipating future action along with the object +with which the action occurs and the time after which the interaction will +begin, a.k.a. the time to contact (TTC). Compared to existing video modeling +architectures for action anticipation, NAOGAT captures the relationship between +objects and the global scene context in order to predict detections for the +next active object and anticipate relevant future actions given these +detections, leveraging the objects' dynamics to improve accuracy. One of the +key strengths of our approach, in fact, is its ability to exploit the motion +dynamics of objects within a given clip, which is often ignored by other +models, and separately decoding the object-centric and motion-centric +information. Through our experiments, we show that our model outperforms +existing methods on two separate datasets, Ego4D and EpicKitchens-100 ("Unseen +Set"), as measured by several additional metrics, such as time to contact, and +next-active-object localization. The code will be available upon acceptance. + +
+
+ comment: Accepted in WACV'24 +
+
+
+
+
+ + ☆ Improving Audio-Visual Segmentation with Bidirectional Generation + + +
+ The aim of audio-visual segmentation (AVS) is to precisely differentiate +audible objects within videos down to the pixel level. Traditional approaches +often tackle this challenge by combining information from various modalities, +where the contribution of each modality is implicitly or explicitly modeled. +Nevertheless, the interconnections between different modalities tend to be +overlooked in audio-visual modeling. In this paper, inspired by the human +ability to mentally simulate the sound of an object and its visual appearance, +we introduce a bidirectional generation framework. This framework establishes +robust correlations between an object's visual characteristics and its +associated sound, thereby enhancing the performance of AVS. To achieve this, we +employ a visual-to-audio projection component that reconstructs audio features +from object segmentation masks and minimizes reconstruction errors. Moreover, +recognizing that many sounds are linked to object movements, we introduce an +implicit volumetric motion estimation module to handle temporal dynamics that +may be challenging to capture using conventional optical flow methods. To +showcase the effectiveness of our approach, we conduct comprehensive +experiments and analyses on the widely recognized AVSBench benchmark. As a +result, we establish a new state-of-the-art performance level in the AVS +benchmark, particularly excelling in the challenging MS3 subset which involves +segmenting multiple sound sources. To facilitate reproducibility, we plan to +release both the source code and the pre-trained model. + +
+
+ comment: Dawei Hao and Yuxin Mao contribute equality to this paper. Yiran + Zhong is the corresponding author. The code will be released at + https://github.com/OpenNLPLab/AVS-bidirectional +
+
+
+
+
+ + ☆ CARE: A Large Scale CT Image Dataset and Clinical Applicable Benchmark + Model for Rectal Cancer Segmentation + + +
+ Rectal cancer segmentation of CT image plays a crucial role in timely +clinical diagnosis, radiotherapy treatment, and follow-up. Although current +segmentation methods have shown promise in delineating cancerous tissues, they +still encounter challenges in achieving high segmentation precision. These +obstacles arise from the intricate anatomical structures of the rectum and the +difficulties in performing differential diagnosis of rectal cancer. +Additionally, a major obstacle is the lack of a large-scale, finely annotated +CT image dataset for rectal cancer segmentation. To address these issues, this +work introduces a novel large scale rectal cancer CT image dataset CARE with +pixel-level annotations for both normal and cancerous rectum, which serves as a +valuable resource for algorithm research and clinical application development. +Moreover, we propose a novel medical cancer lesion segmentation benchmark model +named U-SAM. The model is specifically designed to tackle the challenges posed +by the intricate anatomical structures of abdominal organs by incorporating +prompt information. U-SAM contains three key components: promptable information +(e.g., points) to aid in target area localization, a convolution module for +capturing low-level lesion details, and skip-connections to preserve and +recover spatial information during the encoding-decoding process. To evaluate +the effectiveness of U-SAM, we systematically compare its performance with +several popular segmentation methods on the CARE dataset. The generalization of +the model is further verified on the WORD dataset. Extensive experiments +demonstrate that the proposed U-SAM outperforms state-of-the-art methods on +these two datasets. These experiments can serve as the baseline for future +research and clinical application development. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Computer vision-enriched discrete choice models, with an application to + residential location choice + + +
+ Visual imagery is indispensable to many multi-attribute decision situations. +Examples of such decision situations in travel behaviour research include +residential location choices, vehicle choices, tourist destination choices, and +various safety-related choices. However, current discrete choice models cannot +handle image data and thus cannot incorporate information embedded in images +into their representations of choice behaviour. This gap between discrete +choice models' capabilities and the real-world behaviour it seeks to model +leads to incomplete and, possibly, misleading outcomes. To solve this gap, this +study proposes "Computer Vision-enriched Discrete Choice Models" (CV-DCMs). +CV-DCMs can handle choice tasks involving numeric attributes and images by +integrating computer vision and traditional discrete choice models. Moreover, +because CV-DCMs are grounded in random utility maximisation principles, they +maintain the solid behavioural foundation of traditional discrete choice +models. We demonstrate the proposed CV-DCM by applying it to data obtained +through a novel stated choice experiment involving residential location +choices. In this experiment, respondents faced choice tasks with trade-offs +between commute time, monthly housing cost and street-level conditions, +presented using images. As such, this research contributes to the growing body +of literature in the travel behaviour field that seeks to integrate discrete +choice modelling and machine learning. + +
+
+
+
+
+ + ☆ Detecting Olives with Synthetic or Real Data? Olive the Above + + +
+ Modern robotics has enabled the advancement in yield estimation for precision +agriculture. However, when applied to the olive industry, the high variation of +olive colors and their similarity to the background leaf canopy presents a +challenge. Labeling several thousands of very dense olive grove images for +segmentation is a labor-intensive task. This paper presents a novel approach to +detecting olives without the need to manually label data. In this work, we +present the world's first olive detection dataset comprised of synthetic and +real olive tree images. This is accomplished by generating an auto-labeled +photorealistic 3D model of an olive tree. Its geometry is then simplified for +lightweight rendering purposes. In addition, experiments are conducted with a +mix of synthetically generated and real images, yielding an improvement of up +to 66% compared to when only using a small sample of real data. When access to +real, human-labeled data is limited, a combination of mostly synthetic data and +a small amount of real data can enhance olive detection. + +
+
+
+
+
+ + ☆ OnUVS: Online Feature Decoupling Framework for High-Fidelity Ultrasound + Video Synthesis + + +
+ Ultrasound (US) imaging is indispensable in clinical practice. To diagnose +certain diseases, sonographers must observe corresponding dynamic anatomic +structures to gather comprehensive information. However, the limited +availability of specific US video cases causes teaching difficulties in +identifying corresponding diseases, which potentially impacts the detection +rate of such cases. The synthesis of US videos may represent a promising +solution to this issue. Nevertheless, it is challenging to accurately animate +the intricate motion of dynamic anatomic structures while preserving image +fidelity. To address this, we present a novel online feature-decoupling +framework called OnUVS for high-fidelity US video synthesis. Our highlights can +be summarized by four aspects. First, we introduced anatomic information into +keypoint learning through a weakly-supervised training strategy, resulting in +improved preservation of anatomical integrity and motion while minimizing the +labeling burden. Second, to better preserve the integrity and textural +information of US images, we implemented a dual-decoder that decouples the +content and textural features in the generator. Third, we adopted a +multiple-feature discriminator to extract a comprehensive range of visual cues, +thereby enhancing the sharpness and fine details of the generated videos. +Fourth, we constrained the motion trajectories of keypoints during online +learning to enhance the fluidity of generated videos. Our validation and user +studies on in-house echocardiographic and pelvic floor US videos showed that +OnUVS synthesizes US videos with high fidelity. + +
+
+ comment: 14 pages, 13 figures and 6 tables +
+
+
+
+
+ + ☆ SceNeRFlow: Time-Consistent Reconstruction of General Dynamic Scenes + + +
+ Existing methods for the 4D reconstruction of general, non-rigidly deforming +objects focus on novel-view synthesis and neglect correspondences. However, +time consistency enables advanced downstream tasks like 3D editing, motion +analysis, or virtual-asset creation. We propose SceNeRFlow to reconstruct a +general, non-rigid scene in a time-consistent manner. Our dynamic-NeRF method +takes multi-view RGB videos and background images from static cameras with +known camera parameters as input. It then reconstructs the deformations of an +estimated canonical model of the geometry and appearance in an online fashion. +Since this canonical model is time-invariant, we obtain correspondences even +for long-term, long-range motions. We employ neural scene representations to +parametrize the components of our method. Like prior dynamic-NeRF methods, we +use a backwards deformation model. We find non-trivial adaptations of this +model necessary to handle larger motions: We decompose the deformations into a +strongly regularized coarse component and a weakly regularized fine component, +where the coarse component also extends the deformation field into the space +surrounding the object, which enables tracking over time. We show +experimentally that, unlike prior work that only handles small motion, our +method enables the reconstruction of studio-scale motions. + +
+
+ comment: Project page: https://vcai.mpi-inf.mpg.de/projects/scenerflow/ +
+
+
+
+
+ + ☆ MultiMediate'23: Engagement Estimation and Bodily Behaviour Recognition + in Social Interactions + + +
+ Automatic analysis of human behaviour is a fundamental prerequisite for the +creation of machines that can effectively interact with- and support humans in +social interactions. In MultiMediate'23, we address two key human social +behaviour analysis tasks for the first time in a controlled challenge: +engagement estimation and bodily behaviour recognition in social interactions. +This paper describes the MultiMediate'23 challenge and presents novel sets of +annotations for both tasks. For engagement estimation we collected novel +annotations on the NOvice eXpert Interaction (NOXI) database. For bodily +behaviour recognition, we annotated test recordings of the MPIIGroupInteraction +corpus with the BBSI annotation scheme. In addition, we present baseline +results for both challenge tasks. + +
+
+ comment: ACM MultiMedia'23 +
+
+
+
+
+ + ☆ Contrastive Learning for Lane Detection via cross-similarity + + +
+ Detecting road lanes is challenging due to intricate markings vulnerable to +unfavorable conditions. Lane markings have strong shape priors, but their +visibility is easily compromised. Factors like lighting, weather, vehicles, +pedestrians, and aging colors challenge the detection. A large amount of data +is required to train a lane detection approach that can withstand natural +variations caused by low visibility. This is because numerous lane +shapes and natural variations exist. Our solution, Contrastive Learning +for Lane Detection via cross-similarity (CLLD), is a self-supervised learning +method that tackles this challenge by enhancing lane detection models' +resilience to real-world conditions that cause low lane visibility. CLLD is a +novel multitask contrastive learning approach that trains lane detection models to +detect lane markings even in low-visibility situations by integrating local +feature contrastive learning (CL) with our newly proposed +cross-similarity operation. Local feature CL focuses on extracting features for small +image parts, which is necessary to localize lane segments, while +cross-similarity captures global features to detect obscured lane segments +using their surroundings. We enhance cross-similarity by randomly masking parts +of input images for augmentation. Evaluated on benchmark datasets, CLLD +outperforms state-of-the-art contrastive learning methods, especially in +visibility-impairing conditions like shadows. Compared to supervised learning, +CLLD excels in scenarios like shadows and crowded scenes. + 
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ DDF-HO: Hand-Held Object Reconstruction via Conditional Directed + Distance Field + + +
+ Reconstructing hand-held objects from a single RGB image is an important and
+ challenging problem. Existing works utilizing Signed Distance Fields (SDF)
+ reveal limitations in comprehensively capturing the complex hand-object
+ interactions, since SDF is only reliable within the proximity of the target
+ and is hence unable to simultaneously encode local hand and object cues. To
+ address this issue, we propose DDF-HO, a novel approach leveraging the
+ Directed Distance Field (DDF) as the shape representation. Unlike SDF, DDF
+ maps a ray in 3D space, consisting of an origin and a direction, to
+ corresponding DDF values, including a binary visibility signal determining
+ whether the ray intersects the object and a distance value measuring the
+ distance from the origin to the target along the given direction. We randomly
+ sample multiple rays and collect local-to-global geometric features for them
+ by introducing a novel 2D ray-based feature aggregation scheme and a 3D
+ intersection-aware hand pose embedding, combining 2D and 3D features to model
+ hand-object interactions. Extensive experiments on synthetic and real-world
+ datasets demonstrate that DDF-HO consistently outperforms all baseline methods
+ by a large margin, especially under Chamfer Distance, with an improvement of
+ about 80%. Code and trained models will be released soon.
+
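+
+ To make the ray-to-value mapping concrete, here is a hedged toy sketch of a
+ directed distance field query (our own illustration; the network shape and the
+ feature conditioning used by DDF-HO are not reproduced):
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ToyDDF(nn.Module):
+     """Illustrative directed distance field: a ray (origin o, unit direction d)
+     is mapped to a hit probability and a travel distance along the ray."""
+     def __init__(self, hidden=128):
+         super().__init__()
+         self.mlp = nn.Sequential(nn.Linear(6, hidden), nn.ReLU(),
+                                  nn.Linear(hidden, hidden), nn.ReLU(),
+                                  nn.Linear(hidden, 2))  # (visibility logit, distance)
+
+     def forward(self, origin, direction):
+         out = self.mlp(torch.cat([origin, direction], dim=-1))
+         visibility = torch.sigmoid(out[..., :1])                 # hit or miss
+         distance = torch.nn.functional.softplus(out[..., 1:])    # distance >= 0
+         return visibility, distance
+
+ rays_o = torch.randn(1024, 3)
+ rays_d = torch.nn.functional.normalize(torch.randn(1024, 3), dim=-1)
+ vis, dist = ToyDDF()(rays_o, rays_d)
+ ```
+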
+
+
+
+
+ + ☆ Inherent Redundancy in Spiking Neural Networks ICCV2023 + + +
+ Spiking Neural Networks (SNNs) are well known as a promising energy-efficient
+ alternative to conventional artificial neural networks. Owing to the
+ preconceived impression that SNNs fire sparsely, the analysis and optimization
+ of inherent redundancy in SNNs have been largely overlooked, and the potential
+ advantages of spike-based neuromorphic computing in accuracy and energy
+ efficiency are thus undermined. In this work, we pose and focus on three key
+ questions regarding the inherent redundancy in SNNs. We argue that the
+ redundancy is induced by the spatio-temporal invariance of SNNs, which
+ enhances the efficiency of parameter utilization but also introduces many
+ noise spikes. Further, we analyze the effect of spatio-temporal invariance on
+ the spatio-temporal dynamics and spike firing of SNNs. Motivated by these
+ analyses, we propose an Advance Spatial Attention (ASA) module to harness
+ SNNs' redundancy, which can adaptively optimize their membrane potential
+ distribution with a pair of individual spatial attention sub-modules. In this
+ way, noise spike features are accurately regulated. Experimental results
+ demonstrate that the proposed method significantly reduces spike firing while
+ achieving better performance than state-of-the-art SNN baselines. Our code is
+ available at \url{https://github.com/BICLab/ASA-SNN}.
+
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ How To Overcome Confirmation Bias in Semi-Supervised Image + Classification By Active Learning ECML + + +
+ Do we need active learning? The rise of strong deep semi-supervised methods
+ raises doubt about the usability of active learning in limited labeled data
+ settings. This is caused by results showing that combining semi-supervised
+ learning (SSL) methods with a random selection for labeling can outperform
+ existing active learning (AL) techniques. However, these results are obtained
+ from experiments on well-established benchmark datasets, which can
+ overestimate external validity. Moreover, the literature lacks sufficient
+ research on the performance of active semi-supervised learning methods in
+ realistic data scenarios, leaving a notable gap in our understanding.
+ Therefore, we present three data challenges common in real-world applications:
+ between-class imbalance, within-class imbalance, and between-class similarity.
+ These challenges can hurt SSL performance due to confirmation bias. We conduct
+ experiments with SSL and AL on simulated data challenges and find that random
+ sampling does not mitigate confirmation bias and, in some cases, leads to
+ worse performance than supervised learning. In contrast, we demonstrate that
+ AL can overcome confirmation bias in SSL in these realistic settings. Our
+ results provide insights into the potential of combining active and
+ semi-supervised learning in the presence of common real-world challenges,
+ which is a promising direction for robust methods when learning with limited
+ labeled data in real-world applications.
+
+
+ comment: Accepted @ ECML PKDD 2023. This is the author's version of the work. + The definitive Version of Record will be published in the Proceedings of ECML + PKDD 2023 +
+
+
+
+
+ + ☆ Low-Light Image Enhancement with Illumination-Aware Gamma Correction and + Complete Image Modelling Network ICCV 2023 + + +
+ This paper presents a novel network structure with illumination-aware gamma
+ correction and complete image modelling to solve the low-light image
+ enhancement problem. Low-light environments usually lead to large, less
+ informative dark areas, so directly learning deep representations from
+ low-light images is insensitive to recovering normal illumination. We propose
+ to integrate the effectiveness of gamma correction with the strong modelling
+ capacities of deep networks, which enables the correction factor gamma to be
+ learned in a coarse-to-elaborate manner by adaptively perceiving the deviated
+ illumination. Because the exponential operation introduces high computational
+ complexity, we propose to use a Taylor series to approximate gamma correction,
+ accelerating both training and inference. Since dark areas usually occupy
+ large portions of low-light images, common local modelling structures, e.g.,
+ CNN and SwinIR, are insufficient to recover accurate illumination across whole
+ low-light images. We therefore propose a novel Transformer block that
+ completely models the dependencies of all pixels across the image via a
+ local-to-global hierarchical attention mechanism, so that dark areas can be
+ inferred by borrowing information from distant, informative regions in a
+ highly effective manner. Extensive experiments on several benchmark datasets
+ demonstrate that our approach outperforms state-of-the-art methods.
+
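+
+ The Taylor-series idea mentioned above can be illustrated with a small
+ standalone sketch (assumed form x**gamma = exp(gamma * log x) expanded as a
+ truncated exponential series; the order and clamping are our choices, not the
+ paper's):
+
+ ```python
+ import torch
+
+ def gamma_taylor(x, gamma, order=6):
+     """Approximate x**gamma = exp(gamma * log x) by a truncated Taylor series
+     of exp(.), avoiding the exponentiation at inference (illustrative only)."""
+     x = x.clamp(min=1e-4)                  # keep the log well defined
+     u = gamma * torch.log(x)
+     out, term = torch.ones_like(u), torch.ones_like(u)
+     for k in range(1, order + 1):
+         term = term * u / k                # accumulates u**k / k!
+         out = out + term
+     return out
+
+ x = torch.rand(2, 3, 8, 8) * 0.9 + 0.1     # pixel intensities in [0.1, 1.0]
+ approx = gamma_taylor(x, gamma=0.45, order=6)
+ exact = x ** 0.45
+ print((approx - exact).abs().max())        # small for moderate |gamma * log x|
+ ```
+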
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ MEDOE: A Multi-Expert Decoder and Output Ensemble Framework for + Long-tailed Semantic Segmentation + + +
+ The long-tailed distribution of semantic categories, which has often been
+ ignored in conventional methods, causes unsatisfactory performance on tail
+ categories in semantic segmentation. In this paper, we focus on the problem of
+ long-tailed semantic segmentation. Although some long-tailed recognition
+ methods (e.g., re-sampling/re-weighting) have been proposed for other
+ problems, they are likely to compromise crucial contextual information and are
+ thus hardly adaptable to long-tailed semantic segmentation. To address this
+ issue, we propose MEDOE, a novel framework for long-tailed semantic
+ segmentation via contextual information ensemble-and-grouping. The proposed
+ two-stage framework comprises a multi-expert decoder (MED) and a multi-expert
+ output ensemble (MOE). Specifically, the MED includes several "experts": based
+ on the pixel frequency distribution, each expert takes the dataset masked
+ according to its specific categories as input and self-adaptively generates
+ contextual information for classification. The MOE then adopts learnable
+ decision weights to ensemble the experts' outputs. As a model-agnostic
+ framework, MEDOE can be flexibly and efficiently coupled with various popular
+ deep neural networks (e.g., DeepLabv3+, OCRNet, and PSPNet) to improve their
+ performance in long-tailed semantic segmentation. Experimental results show
+ that the proposed framework outperforms current methods on both the Cityscapes
+ and ADE20K datasets by up to 1.78% in mIoU and 5.89% in mAcc.
+
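+
+ A hedged sketch of what "learnable decision weights" for combining the
+ experts' outputs could look like (shapes and the softmax normalization are our
+ own assumptions, not the paper's exact design):
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class OutputEnsemble(nn.Module):
+     """Toy multi-expert output ensemble: per-expert learnable decision weights
+     combine the experts' per-pixel class logits."""
+     def __init__(self, num_experts, num_classes):
+         super().__init__()
+         self.weights = nn.Parameter(torch.zeros(num_experts, num_classes))
+
+     def forward(self, expert_logits):               # (E, B, C, H, W)
+         w = torch.softmax(self.weights, dim=0)      # normalize over experts
+         return (expert_logits * w[:, None, :, None, None]).sum(dim=0)
+
+ logits = torch.randn(3, 2, 19, 64, 64)              # 3 experts, 19 classes
+ fused = OutputEnsemble(3, 19)(logits)               # (2, 19, 64, 64)
+ ```
+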
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+ + ☆ Neural Spherical Harmonics for structurally coherent continuous + representation of diffusion MRI signal MICCAI 2023 + + +
+ We present a novel way to model diffusion magnetic resonance imaging (dMRI) +datasets, that benefits from the structural coherence of the human brain while +only using data from a single subject. Current methods model the dMRI signal in +individual voxels, disregarding the intervoxel coherence that is present. We +use a neural network to parameterize a spherical harmonics series (NeSH) to +represent the dMRI signal of a single subject from the Human Connectome Project +dataset, continuous in both the angular and spatial domain. The reconstructed +dMRI signal using this method shows a more structurally coherent representation +of the data. Noise in gradient images is removed and the fiber orientation +distribution functions show a smooth change in direction along a fiber tract. +We showcase how the reconstruction can be used to calculate mean diffusivity, +fractional anisotropy, and total apparent fiber density. These results can be +achieved with a single model architecture, tuning only one hyperparameter. In +this paper we also demonstrate how upsampling in both the angular and spatial +domain yields reconstructions that are on par or better than existing methods. + +
+
+ comment: 12 pages, 6 figures, accepted for cdMRI workshop at MICCAI 2023 +
+
+
+
+
+ + ☆ Explainable Multi-View Deep Networks Methodology for Experimental + Physics + + +
+ Physical experiments often involve multiple imaging representations, such as
+ X-ray scans and microscopic images. Deep learning models have been widely used
+ for supervised analysis in these experiments. Combining different image
+ representations is frequently required to analyze and make a decision
+ properly. Consequently, multi-view data has emerged: datasets where each
+ sample is described by views from different angles, sources, or modalities.
+ These problems are addressed with the concept of multi-view learning.
+ Understanding the decision-making process of deep learning models is essential
+ for reliable and credible analysis; hence, many explainability methods have
+ been devised recently. Nonetheless, there is a lack of proper explainability
+ for multi-view models, which are challenging to explain due to their
+ architectures. In this paper, we suggest different multi-view architectures
+ for the vision domain, each suited to a different problem, and we also present
+ a methodology for explaining these models. To demonstrate the effectiveness of
+ our methodology, we focus on the domain of High Energy Density Physics (HEDP)
+ experiments, where multiple imaging representations are used to assess the
+ quality of foam samples. We apply our methodology to classify foam sample
+ quality using the suggested multi-view architectures. Through experimental
+ results, we showcase the benefit of an appropriate architecture choice on both
+ accuracy (from 78% to 84%) and AUC (from 83% to 93%), and present a trade-off
+ between performance and explainability. Specifically, we demonstrate that our
+ approach enables the explanation of individual single-view models, providing
+ insights into the decision-making process of each view. This understanding
+ enhances the interpretability of the overall multi-view model. The sources of
+ this work are available at:
+ https://github.com/Scientific-Computing-Lab-NRCN/Multi-View-Explainability.
+
+
+
+
+
+ + ☆ Self-Reference Deep Adaptive Curve Estimation for Low-Light Image + Enhancement + + +
+ In this paper, we propose a 2-stage low-light image enhancement method called +Self-Reference Deep Adaptive Curve Estimation (Self-DACE). In the first stage, +we present an intuitive, lightweight, fast, and unsupervised luminance +enhancement algorithm. The algorithm is based on a novel low-light enhancement +curve that can be used to locally boost image brightness. We also propose a new +loss function with a simplified physical model designed to preserve natural +images' color, structure, and fidelity. We use a vanilla CNN to map each pixel +through deep Adaptive Adjustment Curves (AAC) while preserving the local image +structure. Secondly, we introduce the corresponding denoising scheme to remove +the latent noise in the darkness. We approximately model the noise in the dark +and deploy a Denoising-Net to estimate and remove the noise after the first +stage. Exhaustive qualitative and quantitative analysis shows that our method +outperforms existing state-of-the-art algorithms on multiple real-world +datasets. + +
+
+
+
+
+ + ☆ Automatic Vision-Based Parking Slot Detection and Occupancy + Classification + + +
+ Parking guidance information (PGI) systems provide drivers with information
+ about nearby parking lots and the number of vacant parking slots. Recently,
+ vision-based solutions have started to appear as a cost-effective alternative
+ to standard PGI systems based on hardware sensors mounted on each parking
+ slot. Vision-based systems provide information about parking occupancy based
+ on images taken by a camera recording a parking lot. However, such systems are
+ challenging to develop due to the variety of possible viewpoints, weather
+ conditions, and object occlusions. Most notably, they require manual labeling
+ of parking slot locations in the input image, which is sensitive to camera
+ angle change, replacement, or maintenance. In this paper, we propose an
+ algorithm that performs Automatic Parking Slot Detection and Occupancy
+ Classification (APSD-OC) solely on input images. Automatic parking slot
+ detection is based on vehicle detections in a series of parking lot images,
+ upon which clustering is applied in bird's eye view to detect parking slots.
+ Once the parking slot positions are determined in the input image, each
+ detected parking slot is classified as occupied or vacant using a specifically
+ trained ResNet34 deep classifier. The proposed approach is extensively
+ evaluated on well-known publicly available datasets (PKLot and CNRPark+EXT),
+ showing high efficiency in parking slot detection and robustness to the
+ presence of illegally parked or passing vehicles. The trained classifier
+ achieves high accuracy in parking slot occupancy classification.
+
+
+ comment: 39 pages, 8 figures, 9 tables +
+
+
+
+
+ + ☆ Unsupervised Domain Adaptive Detection with Network Stability Analysis + + +
+ Domain adaptive detection aims to improve the generality of a detector,
+ learned from a labeled source domain, on an unlabeled target domain. In this
+ work, drawing inspiration from the concept of stability in control theory,
+ namely that a robust system should remain consistent both externally and
+ internally regardless of disturbances, we propose a novel framework that
+ achieves unsupervised domain adaptive detection through stability analysis.
+ Specifically, we treat discrepancies between images and regions from different
+ domains as disturbances, and introduce a simple but effective Network
+ Stability Analysis (NSA) framework that considers various disturbances for
+ domain adaptation. In particular, we explore three types of perturbations,
+ including heavy and light image-level disturbances and instance-level
+ disturbance. For each type, NSA performs external consistency analysis on the
+ outputs from raw and perturbed images and/or internal consistency analysis on
+ their features, using teacher-student models. By integrating NSA into Faster
+ R-CNN, we immediately achieve state-of-the-art results. In particular, we set
+ a new record of 52.7% mAP on Cityscapes-to-FoggyCityscapes, showing the
+ potential of NSA for domain adaptive detection. It is worth noting that our
+ NSA is designed for general purposes and is thus applicable to one-stage
+ detection models (e.g., FCOS) besides the adopted one, as shown by our
+ experiments. https://github.com/tiankongzhang/NSA.
+
+
+
+
+
+ + ☆ AATCT-IDS: A Benchmark Abdominal Adipose Tissue CT Image Dataset for + Image Denoising, Semantic Segmentation, and Radiomics Evaluation + + +
+ Methods: In this study, a benchmark \emph{Abdominal Adipose Tissue CT Image
+ Dataset} (AATTCT-IDS) containing 300 subjects is prepared and published.
+ AATTCT-IDS releases 13,732 raw CT slices, and the researchers individually
+ annotate the subcutaneous and visceral adipose tissue regions of 3,213 of
+ those slices that have the same slice distance, in order to validate denoising
+ methods, train semantic segmentation models, and study radiomics. For these
+ tasks, this paper compares and analyzes the performance of various methods on
+ AATTCT-IDS by combining the visualization results and evaluation data, thereby
+ verifying the research potential of this dataset for the above three types of
+ tasks.
+  Results: In the comparative study of image denoising, algorithms using a
+ smoothing strategy suppress mixed noise at the expense of image details and
+ obtain better evaluation scores, whereas methods such as BM3D preserve the
+ original image structure better, although their evaluation scores are slightly
+ lower. The results show significant differences among them. In the comparative
+ study of semantic segmentation of abdominal adipose tissue, the segmentation
+ results of each model show different structural characteristics. Among them,
+ BiSeNet obtains segmentation results only slightly inferior to U-Net with the
+ shortest training time and effectively separates small and isolated adipose
+ tissue. In addition, the radiomics study based on AATTCT-IDS reveals three
+ adipose distributions in the subject population.
+  Conclusion: AATTCT-IDS contains the ground truth of adipose tissue regions in
+ abdominal CT slices. This open-source dataset can attract researchers to
+ explore the multi-dimensional characteristics of abdominal adipose tissue and
+ thus help physicians and patients in clinical practice. AATCT-IDS is freely
+ published for non-commercial purposes at:
+ \url{https://figshare.com/articles/dataset/AATTCT-IDS/23807256}.
+
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ☆ Interpretability Benchmark for Evaluating Spatial Misalignment of + Prototypical Parts Explanations + + +
+ Prototypical parts-based networks are becoming increasingly popular due to +their faithful self-explanations. However, their similarity maps are calculated +in the penultimate network layer. Therefore, the receptive field of the +prototype activation region often depends on parts of the image outside this +region, which can lead to misleading interpretations. We name this undesired +behavior a spatial explanation misalignment and introduce an interpretability +benchmark with a set of dedicated metrics for quantifying this phenomenon. In +addition, we propose a method for misalignment compensation and apply it to +existing state-of-the-art models. We show the expressiveness of our benchmark +and the effectiveness of the proposed compensation methodology through +extensive empirical studies. + +
+
+ comment: Under review. Code will be released upon acceptance
+
+
+
+
+ + ☆ Learning to Generate Semantic Layouts for Higher Text-Image + Correspondence in Text-to-Image Synthesis ICCV 2023 + + +
+ Existing text-to-image generation approaches have set high standards for
+ photorealism and text-image correspondence, largely benefiting from web-scale
+ text-image datasets, which can include up to 5 billion pairs. However,
+ text-to-image generation models trained on domain-specific datasets, such as
+ urban scenes, medical images, and faces, still suffer from low text-image
+ correspondence due to the lack of text-image pairs. Additionally, collecting
+ billions of text-image pairs for a specific domain can be time-consuming and
+ costly. Thus, ensuring high text-image correspondence without relying on
+ web-scale text-image datasets remains a challenging task. In this paper, we
+ present a novel approach for enhancing text-image correspondence by leveraging
+ available semantic layouts. Specifically, we propose a Gaussian-categorical
+ diffusion process that simultaneously generates both images and corresponding
+ layout pairs. Our experiments reveal that we can guide text-to-image
+ generation models to be aware of the semantics of different image regions by
+ training the model to generate semantic labels for each pixel. We demonstrate
+ that our approach achieves higher text-image correspondence than existing
+ text-to-image generation approaches on the Multi-Modal CelebA-HQ and
+ Cityscapes datasets, where text-image pairs are scarce. Code is available at
+ https://pmh9960.github.io/research/GCDP
+
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Conditional Perceptual Quality Preserving Image Compression + + +
+ We propose conditional perceptual quality, an extension of the perceptual
+ quality defined in \citet{blau2018perception}, obtained by conditioning it on
+ user-defined information. Specifically, we extend the original perceptual
+ quality $d(p_{X},p_{\hat{X}})$ to the conditional perceptual quality
+ $d(p_{X|Y},p_{\hat{X}|Y})$, where $X$ is the original image, $\hat{X}$ is the
+ reconstructed image, $Y$ is side information defined by the user, and
+ $d(.,.)$ is a divergence. We show that conditional perceptual quality has
+ theoretical properties similar to the rate-distortion-perception trade-off
+ \citep{blau2019rethinking}. Based on these theoretical results, we propose an
+ optimal framework for conditional perceptual quality preserving compression.
+ Experimental results show that our codec successfully maintains high
+ perceptual quality and semantic quality at all bitrates. Besides, by providing
+ a lower bound on the common randomness required, we settle the previous
+ arguments on whether randomness should be incorporated into the generator for
+ (conditional) perceptual quality compression. The source code is provided in
+ the supplementary material.
+
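+
+ In display form, the extension reads as below; taking an expectation over $Y$
+ is one natural way to turn the conditional divergence into a single number and
+ is our own reading, not necessarily the paper's exact definition:
+
+ ```latex
+ d\left(p_{X},\, p_{\hat{X}}\right)
+ \;\longrightarrow\;
+ d\left(p_{X\mid Y},\, p_{\hat{X}\mid Y}\right)
+ \;:=\;
+ \mathbb{E}_{y \sim p_{Y}}\left[\, d\left(p_{X \mid Y=y},\; p_{\hat{X} \mid Y=y}\right) \right]
+ ```
+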
+
+
+
+
+ + ☆ SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech + Separation + + +
+ The integration of different modalities, such as audio and visual +information, plays a crucial role in human perception of the surrounding +environment. Recent research has made significant progress in designing fusion +modules for audio-visual speech separation. However, they predominantly focus +on multi-modal fusion architectures situated either at the top or bottom +positions, rather than comprehensively considering multi-modal fusion at +various hierarchical positions within the network. In this paper, we propose a +novel model called self- and cross-attention network (SCANet), which leverages +the attention mechanism for efficient audio-visual feature fusion. SCANet +consists of two types of attention blocks: self-attention (SA) and +cross-attention (CA) blocks, where the CA blocks are distributed at the top +(TCA), middle (MCA) and bottom (BCA) of SCANet. These blocks maintain the +ability to learn modality-specific features and enable the extraction of +different semantics from audio-visual features. Comprehensive experiments on +three standard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2) +demonstrate the effectiveness of SCANet, outperforming existing +state-of-the-art (SOTA) methods while maintaining comparable inference time. + +
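+
+ A minimal sketch of a single cross-attention (CA) block in the spirit
+ described above (dimensions, normalization, and the residual connection are
+ our assumptions; the paper's TCA/MCA/BCA placement is not shown):
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class CrossAttentionBlock(nn.Module):
+     """Illustrative cross-attention block: audio features attend to visual
+     features (or vice versa if called with the roles swapped)."""
+     def __init__(self, dim=256, heads=4):
+         super().__init__()
+         self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.norm = nn.LayerNorm(dim)
+
+     def forward(self, query_feats, context_feats):
+         attended, _ = self.attn(query_feats, context_feats, context_feats)
+         return self.norm(query_feats + attended)
+
+ audio = torch.randn(2, 100, 256)   # (batch, audio steps, dim)
+ video = torch.randn(2, 25, 256)    # (batch, video frames, dim)
+ fused_audio = CrossAttentionBlock()(audio, video)
+ ```
+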
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ☆ S2R: Exploring a Double-Win Transformer-Based Framework for Ideal and + Blind Super-Resolution + + +
+ Nowadays, deep learning based methods have demonstrated impressive performance
+ on ideal super-resolution (SR) datasets, but most of these methods incur
+ dramatic performance drops when directly applied to real-world SR
+ reconstruction tasks with unpredictable blur kernels. To tackle this issue,
+ blind SR methods have been proposed to improve the visual results on random
+ blur kernels, but they similarly produce unsatisfactory reconstructions on
+ ideal low-resolution images. In this paper, we propose a double-win framework
+ for ideal and blind SR tasks, named S2R, consisting of a lightweight
+ transformer-based SR model (S2R transformer) and a novel coarse-to-fine
+ training strategy, which achieves excellent visual results under both ideal
+ and random fuzzy conditions. On the algorithm level, the S2R transformer
+ smartly combines several efficient and lightweight blocks to enhance the
+ representation ability of extracted features with a relatively low number of
+ parameters. For the training strategy, a coarse-level learning process is
+ first performed to improve the generalization of the network with the help of
+ a large-scale external dataset, and then a fast fine-tuning process is used to
+ transfer the pre-trained model to real-world SR tasks by mining the internal
+ features of the image. Experimental results show that the proposed S2R
+ outperforms other single-image SR models under the ideal SR condition with
+ only 578K parameters. Meanwhile, it achieves better visual results than
+ regular blind SR models under blind fuzzy conditions with only 10 gradient
+ updates, improving convergence speed by 300 times and significantly
+ accelerating the transfer learning process in real-world situations.
+
+
+
+
+
+ + ☆ GPA-3D: Geometry-aware Prototype Alignment for Unsupervised Domain + Adaptive 3D Object Detection from Point Clouds ICCV 2023 + + +
+ LiDAR-based 3D detection has made great progress in recent years. However, +the performance of 3D detectors is considerably limited when deployed in unseen +environments, owing to the severe domain gap problem. Existing domain adaptive +3D detection methods do not adequately consider the problem of the +distributional discrepancy in feature space, thereby hindering generalization +of detectors across domains. In this work, we propose a novel unsupervised +domain adaptive \textbf{3D} detection framework, namely \textbf{G}eometry-aware +\textbf{P}rototype \textbf{A}lignment (\textbf{GPA-3D}), which explicitly +leverages the intrinsic geometric relationship from point cloud objects to +reduce the feature discrepancy, thus facilitating cross-domain transferring. +Specifically, GPA-3D assigns a series of tailored and learnable prototypes to +point cloud objects with distinct geometric structures. Each prototype aligns +BEV (bird's-eye-view) features derived from corresponding point cloud objects +on source and target domains, reducing the distributional discrepancy and +achieving better adaptation. The evaluation results obtained on various +benchmarks, including Waymo, nuScenes and KITTI, demonstrate the superiority of +our GPA-3D over the state-of-the-art approaches for different adaptation +scenarios. The MindSpore version code will be publicly available at +\url{https://github.com/Liz66666/GPA3D}. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ SYENet: A Simple Yet Effective Network for Multiple Low-Level Vision + Tasks with Real-time Performance on Mobile Device + + +
+ With the rapid development of AI hardware accelerators, applying deep
+ learning-based algorithms to solve various low-level vision tasks on mobile
+ devices has gradually become possible. However, two main problems still need
+ to be solved: task-specific algorithms make it difficult to integrate them
+ into a single neural network architecture, and large numbers of parameters
+ make it difficult to achieve real-time inference. To tackle these problems, we
+ propose a novel network, SYENet, with only ~6K parameters, to handle multiple
+ low-level vision tasks on mobile devices in real time. The SYENet consists of
+ two asymmetrical branches with simple building blocks. To effectively connect
+ the results of the asymmetrical branches, a Quadratic Connection Unit (QCU) is
+ proposed. Furthermore, to improve performance, a new Outlier-Aware Loss is
+ proposed for processing the images. The proposed method proves its superior
+ performance, with the best PSNR compared with other networks, in real-time
+ applications such as Image Signal Processing (ISP), Low-Light Enhancement
+ (LLE), and Super-Resolution (SR), with 2K@60FPS throughput on a Qualcomm 8
+ Gen 1 mobile SoC (System-on-Chip). In particular, for the ISP task, SYENet
+ achieved the highest score in the MAI 2022 Learned Smartphone ISP challenge.
+
+
+
+
+
+ + ☆ Ranking-aware Uncertainty for Text-guided Image Retrieval + + +
+ Text-guided image retrieval is to incorporate conditional text to better +capture users' intent. Traditionally, the existing methods focus on minimizing +the embedding distances between the source inputs and the targeted image, using +the provided triplets $\langle$source image, source text, target +image$\rangle$. However, such triplet optimization may limit the learned +retrieval model to capture more detailed ranking information, e.g., the +triplets are one-to-one correspondences and they fail to account for +many-to-many correspondences arising from semantic diversity in feedback +languages and images. To capture more ranking information, we propose a novel +ranking-aware uncertainty approach to model many-to-many correspondences by +only using the provided triplets. We introduce uncertainty learning to learn +the stochastic ranking list of features. Specifically, our approach mainly +comprises three components: (1) In-sample uncertainty, which aims to capture +semantic diversity using a Gaussian distribution derived from both combined and +target features; (2) Cross-sample uncertainty, which further mines the ranking +information from other samples' distributions; and (3) Distribution +regularization, which aligns the distributional representations of source +inputs and targeted image. Compared to the existing state-of-the-art methods, +our proposed method achieves significant results on two public datasets for +composed image retrieval. + +
+
+
+
+
+ + ☆ OmniZoomer: Learning to Move and Zoom in on Sphere at High-Resolution ICCV 2023 + + +
+ Omnidirectional images (ODIs) have become increasingly popular, as their +large field-of-view (FoV) can offer viewers the chance to freely choose the +view directions in immersive environments such as virtual reality. The M\"obius +transformation is typically employed to further provide the opportunity for +movement and zoom on ODIs, but applying it to the image level often results in +blurry effect and aliasing problem. In this paper, we propose a novel deep +learning-based approach, called \textbf{OmniZoomer}, to incorporate the +M\"obius transformation into the network for movement and zoom on ODIs. By +learning various transformed feature maps under different conditions, the +network is enhanced to handle the increasing edge curvatures, which alleviates +the blurry effect. Moreover, to address the aliasing problem, we propose two +key components. Firstly, to compensate for the lack of pixels for describing +curves, we enhance the feature maps in the high-resolution (HR) space and +calculate the transformed index map with a spatial index generation module. +Secondly, considering that ODIs are inherently represented in the spherical +space, we propose a spherical resampling module that combines the index map and +HR feature maps to transform the feature maps for better spherical correlation. +The transformed feature maps are decoded to output a zoomed ODI. Experiments +show that our method can produce HR and high-quality ODIs with the flexibility +to move and zoom in to the object of interest. Project page is available at +http://vlislab22.github.io/OmniZoomer/. + +
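+
+ For reference, the Möbius transformation itself is the simple complex-valued
+ map below (a hedged illustration; how OmniZoomer lifts it to spherical feature
+ maps is described in the paper and not reproduced here):
+
+ ```python
+ import torch
+
+ def mobius(z, a, b, c, d):
+     """Mobius transformation f(z) = (a*z + b) / (c*z + d) on the complex
+     plane; ODIs can be related to the plane by stereographic projection."""
+     return (a * z + b) / (c * z + d)
+
+ # complex grid standing in for (projected) image coordinates
+ z = torch.complex(torch.randn(8, 8), torch.randn(8, 8))
+ a, b, c, d = (torch.tensor(v, dtype=torch.cfloat) for v in (2.0, 0.0, 0.0, 1.0))
+ zoomed = mobius(z, a, b, c, d)   # a/d != 1 acts as a zoom, b as a translation
+ ```
+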
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ View Consistent Purification for Accurate Cross-View Localization ICCV 2023 + + +
+ This paper proposes a fine-grained self-localization method for outdoor +robotics that utilizes a flexible number of onboard cameras and readily +accessible satellite images. The proposed method addresses limitations in +existing cross-view localization methods that struggle to handle noise sources +such as moving objects and seasonal variations. It is the first sparse +visual-only method that enhances perception in dynamic environments by +detecting view-consistent key points and their corresponding deep features from +ground and satellite views, while removing off-the-ground objects and +establishing homography transformation between the two views. Moreover, the +proposed method incorporates a spatial embedding approach that leverages camera +intrinsic and extrinsic information to reduce the ambiguity of purely visual +matching, leading to improved feature matching and overall pose estimation +accuracy. The method exhibits strong generalization and is robust to +environmental changes, requiring only geo-poses as ground truth. Extensive +experiments on the KITTI and Ford Multi-AV Seasonal datasets demonstrate that +our proposed method outperforms existing state-of-the-art methods, achieving +median spatial accuracy errors below $0.5$ meters along the lateral and +longitudinal directions, and a median orientation accuracy error below 2 +degrees. + +
+
+ comment: Accepted for ICCV 2023 +
+
+
+
+
+ + ☆ Snapshot High Dynamic Range Imaging with a Polarization Camera + + +
+ High dynamic range (HDR) images are important for a range of tasks, from +navigation to consumer photography. Accordingly, a host of specialized HDR +sensors have been developed, the most successful of which are based on +capturing variable per-pixel exposures. In essence, these methods capture an +entire exposure bracket sequence at once in a single shot. This paper presents +a straightforward but highly effective approach for turning an off-the-shelf +polarization camera into a high-performance HDR camera. By placing a linear +polarizer in front of the polarization camera, we are able to simultaneously +capture four images with varied exposures, which are determined by the +orientation of the polarizer. We develop an outlier-robust and self-calibrating +algorithm to reconstruct an HDR image (at a single polarity) from these +measurements. Finally, we demonstrate the efficacy of our approach with +extensive real-world experiments. + +
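+
+ A rough sketch of the single-shot idea under our own simplifying assumptions
+ (ideal polarizers following Malus's law, and a naive exposure-weighted merge
+ rather than the paper's outlier-robust, self-calibrating estimator):
+
+ ```python
+ import numpy as np
+
+ # With an added linear polarizer at angle theta_p, the four on-sensor
+ # polarization channels (0/45/90/135 deg) see attenuations following Malus's
+ # law, i.e. four different effective exposures captured in one shot.
+ theta_p = np.deg2rad(20.0)
+ angles = np.deg2rad([0.0, 45.0, 90.0, 135.0])
+ exposures = np.cos(angles - theta_p) ** 2 + 1e-3   # relative exposure per channel
+
+ def merge_hdr(channels, exposures):
+     """Naive exposure-weighted merge of the four channels into radiance."""
+     channels = np.asarray(channels, dtype=np.float64)
+     w = np.ones_like(channels)
+     w[(channels < 0.02) | (channels > 0.98)] = 1e-6  # distrust clipped pixels
+     return (w * channels / exposures[:, None, None]).sum(0) / w.sum(0)
+
+ imgs = np.clip(np.random.rand(4, 64, 64) * exposures[:, None, None], 0, 1)
+ hdr = merge_hdr(imgs, exposures)
+ ```
+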
+
+ comment: 9 pages, 10 figures +
+
+
+
+
+ + ☆ DragNUWA: Fine-grained Control in Video Generation by Integrating Text, + Image, and Trajectory + + +
+ Controllable video generation has gained significant attention in recent +years. However, two main limitations persist: Firstly, most existing works +focus on either text, image, or trajectory-based control, leading to an +inability to achieve fine-grained control in videos. Secondly, trajectory +control research is still in its early stages, with most experiments being +conducted on simple datasets like Human3.6M. This constraint limits the models' +capability to process open-domain images and effectively handle complex curved +trajectories. In this paper, we propose DragNUWA, an open-domain +diffusion-based video generation model. To tackle the issue of insufficient +control granularity in existing works, we simultaneously introduce text, image, +and trajectory information to provide fine-grained control over video content +from semantic, spatial, and temporal perspectives. To resolve the problem of +limited open-domain trajectory control in current research, We propose +trajectory modeling with three aspects: a Trajectory Sampler (TS) to enable +open-domain control of arbitrary trajectories, a Multiscale Fusion (MF) to +control trajectories in different granularities, and an Adaptive Training (AT) +strategy to generate consistent videos following trajectories. Our experiments +validate the effectiveness of DragNUWA, demonstrating its superior performance +in fine-grained control in video generation. The homepage link is +\url{https://www.microsoft.com/en-us/research/project/dragnuwa/} + +
+
+
+
+
+ + ☆ Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme + Detection ACM MM + + +
+ Hateful meme detection is a challenging multimodal task that requires +comprehension of both vision and language, as well as cross-modal interactions. +Recent studies have tried to fine-tune pre-trained vision-language models +(PVLMs) for this task. However, with increasing model sizes, it becomes +important to leverage powerful PVLMs more efficiently, rather than simply +fine-tuning them. Recently, researchers have attempted to convert meme images +into textual captions and prompt language models for predictions. This approach +has shown good performance but suffers from non-informative image captions. +Considering the two factors mentioned above, we propose a probing-based +captioning approach to leverage PVLMs in a zero-shot visual question answering +(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful +content-related questions and use the answers as image captions (which we call +Pro-Cap), so that the captions contain information critical for hateful content +detection. The good performance of models with Pro-Cap on three benchmarks +validates the effectiveness and generalization of the proposed method. + +
+
+ comment: Camera-ready for ACM MM 2023
+
+
+
+
+ + ♻ ☆ SHERF: Generalizable Human NeRF from a Single Image ICCV2023 + + +
+ Existing Human NeRF methods for reconstructing 3D humans typically rely on +multiple 2D images from multi-view cameras or monocular videos captured from +fixed camera views. However, in real-world scenarios, human images are often +captured from random camera angles, presenting challenges for high-quality 3D +human reconstruction. In this paper, we propose SHERF, the first generalizable +Human NeRF model for recovering animatable 3D humans from a single input image. +SHERF extracts and encodes 3D human representations in canonical space, +enabling rendering and animation from free views and poses. To achieve +high-fidelity novel view and pose synthesis, the encoded 3D human +representations should capture both global appearance and local fine-grained +textures. To this end, we propose a bank of 3D-aware hierarchical features, +including global, point-level, and pixel-aligned features, to facilitate +informative encoding. Global features enhance the information extracted from +the single input image and complement the information missing from the partial +2D observation. Point-level features provide strong clues of 3D human +structure, while pixel-aligned features preserve more fine-grained details. To +effectively integrate the 3D-aware hierarchical feature bank, we design a +feature fusion transformer. Extensive experiments on THuman, RenderPeople, +ZJU_MoCap, and HuMMan datasets demonstrate that SHERF achieves state-of-the-art +performance, with better generalizability for novel view and pose synthesis. + +
+
+ comment: Accepted by ICCV2023. Project webpage: + https://skhu101.github.io/SHERF/ +
+
+
+
+
+ + ♻ ☆ Normalizing Flows for Human Pose Anomaly Detection + + +
+ Video anomaly detection is an ill-posed problem because it relies on many +parameters such as appearance, pose, camera angle, background, and more. We +distill the problem to anomaly detection of human pose, thus decreasing the +risk of nuisance parameters such as appearance affecting the result. Focusing +on pose alone also has the side benefit of reducing bias against distinct +minority groups. Our model works directly on human pose graph sequences and is +exceptionally lightweight (~1K parameters), capable of running on any machine +able to run the pose estimation with negligible additional resources. We +leverage the highly compact pose representation in a normalizing flows +framework, which we extend to tackle the unique characteristics of +spatio-temporal pose data and show its advantages in this use case. The +algorithm is quite general and can handle training data of only normal examples +as well as a supervised setting that consists of labeled normal and abnormal +examples. We report state-of-the-art results on two anomaly detection +benchmarks - the unsupervised ShanghaiTech dataset and the recent supervised +UBnormal dataset. + +
+
+
+
+
+ + ♻ ☆ DINAR: Diffusion Inpainting of Neural Textures for One-Shot Human + Avatars + + +
+ We present DINAR, an approach for creating realistic rigged fullbody avatars +from single RGB images. Similarly to previous works, our method uses neural +textures combined with the SMPL-X body model to achieve photo-realistic quality +of avatars while keeping them easy to animate and fast to infer. To restore the +texture, we use a latent diffusion model and show how such model can be trained +in the neural texture space. The use of the diffusion model allows us to +realistically reconstruct large unseen regions such as the back of a person +given the frontal view. The models in our pipeline are trained using 2D images +and videos only. In the experiments, our approach achieves state-of-the-art +rendering quality and good generalization to new poses and viewpoints. In +particular, the approach improves state-of-the-art on the SnapshotPeople public +benchmark. + +
+
+
+
+
+ + ♻ ☆ EndoDepthL: Lightweight Endoscopic Monocular Depth Estimation with + CNN-Transformer + + +
+ In this study, we address the key challenges concerning the accuracy and +effectiveness of depth estimation for endoscopic imaging, with a particular +emphasis on real-time inference and the impact of light reflections. We propose +a novel lightweight solution named EndoDepthL that integrates Convolutional +Neural Networks (CNN) and Transformers to predict multi-scale depth maps. Our +approach includes optimizing the network architecture, incorporating +multi-scale dilated convolution, and a multi-channel attention mechanism. We +also introduce a statistical confidence boundary mask to minimize the impact of +reflective areas. To better evaluate the performance of monocular depth +estimation in endoscopic imaging, we propose a novel complexity evaluation +metric that considers network parameter size, floating-point operations, and +inference frames per second. We comprehensively evaluate our proposed method +and compare it with existing baseline solutions. The results demonstrate that +EndoDepthL ensures depth estimation accuracy with a lightweight structure. + +
+
+
+
+
+ + ♻ ☆ Adaptive Split-Fusion Transformer + + +
+ Neural networks for visual content understanding have recently evolved from
+ convolutional ones (CNNs) to transformers. The former (CNN) relies on
+ small-windowed kernels to capture regional clues, demonstrating solid local
+ expressiveness. By contrast, the latter (transformer) establishes long-range
+ global connections between localities for holistic learning. Inspired by this
+ complementary nature, there is a growing interest in designing hybrid models
+ that best utilize each technique. Current hybrids merely treat convolutions as
+ simple approximations of linear projection or juxtapose a convolution branch
+ with attention, without considering the importance of local/global modeling.
+ To tackle this, we propose a new hybrid named Adaptive Split-Fusion
+ Transformer (ASF-former) that treats convolutional and attention branches
+ differently with adaptive weights. Specifically, an ASF-former encoder equally
+ splits feature channels in half to fit dual-path inputs. Then, the outputs of
+ the dual paths are fused with weighting scalars calculated from visual cues.
+ We also design the convolutional path compactly for efficiency. Extensive
+ experiments on standard benchmarks, such as ImageNet-1K, CIFAR-10, and
+ CIFAR-100, show that our ASF-former outperforms its CNN and transformer
+ counterparts as well as hybrid pilots in terms of accuracy (83.9% on
+ ImageNet-1K) under similar conditions (12.9G MACs / 56.7M Params, without
+ large-scale pre-training). The code is available at:
+ https://github.com/szx503045266/ASF-former.
+
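+
+ A toy sketch of the split-then-fuse pattern described above (channel split, a
+ convolutional path and an attention path, and fusion scalars predicted from
+ the input; all sizes and the exact fusion rule are our own assumptions):
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class SplitFusion(nn.Module):
+     """Toy split-fusion step: channels are split in half, sent through a
+     convolutional path and an attention path, then re-combined with adaptive
+     scalars predicted from the input itself."""
+     def __init__(self, dim=64, heads=4):
+         super().__init__()
+         half = dim // 2
+         self.conv = nn.Conv2d(half, half, 3, padding=1)
+         self.attn = nn.MultiheadAttention(half, heads, batch_first=True)
+         self.gate = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+                                   nn.Linear(dim, 2))
+
+     def forward(self, x):                        # (B, C, H, W)
+         b, c, h, w = x.shape
+         xc, xa = x.chunk(2, dim=1)
+         yc = self.conv(xc)
+         seq = xa.flatten(2).transpose(1, 2)      # (B, HW, C/2)
+         ya, _ = self.attn(seq, seq, seq)
+         ya = ya.transpose(1, 2).reshape(b, c // 2, h, w)
+         alpha = torch.softmax(self.gate(x), dim=-1)   # two fusion scalars
+         return torch.cat([alpha[:, :1, None, None] * yc,
+                           alpha[:, 1:, None, None] * ya], dim=1)
+
+ out = SplitFusion()(torch.randn(2, 64, 16, 16))
+ ```
+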
+
+
+
+
+ + ♻ ☆ Black Box Few-Shot Adaptation for Vision-Language models ICCV 2023 + + +
+ Vision-Language (V-L) models trained with contrastive learning to align the +visual and language modalities have been shown to be strong few-shot learners. +Soft prompt learning is the method of choice for few-shot downstream adaption +aiming to bridge the modality gap caused by the distribution shift induced by +the new domain. While parameter-efficient, prompt learning still requires +access to the model weights and can be computationally infeasible for large +models with billions of parameters. To address these shortcomings, in this +work, we describe a black-box method for V-L few-shot adaptation that (a) +operates on pre-computed image and text features and hence works without access +to the model's weights, (b) it is orders of magnitude faster at training time, +(c) it is amenable to both supervised and unsupervised training, and (d) it can +be even used to align image and text features computed from uni-modal models. +To achieve this, we propose Linear Feature Alignment (LFA), a simple linear +approach for V-L re-alignment in the target domain. LFA is initialized from a +closed-form solution to a least-squares problem and then it is iteratively +updated by minimizing a re-ranking loss. Despite its simplicity, our approach +can even surpass soft-prompt learning methods as shown by extensive experiments +on 11 image and 2 video datasets. + +
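+
+ A small sketch of the closed-form least-squares initialization mentioned above
+ (feature dimensions and the pairing of image and text features are
+ illustrative assumptions; the iterative re-ranking refinement is omitted):
+
+ ```python
+ import torch
+
+ def lfa_init(image_feats, text_feats):
+     """Closed-form least-squares initialization of a linear map W that
+     re-aligns pre-computed image features to the text features of their
+     classes (black-box: no access to model weights is needed)."""
+     # Solves min_W ||image_feats @ W - text_feats||_F^2
+     return torch.linalg.lstsq(image_feats, text_feats).solution
+
+ img = torch.randn(1600, 512)     # pre-computed few-shot image features
+ txt = torch.randn(1600, 512)     # matching class text embeddings
+ W = lfa_init(img, txt)
+ aligned = img @ W                # then refined with a re-ranking loss
+ ```
+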
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ HGCN-GJS: Hierarchical Graph Convolutional Network with Groupwise Joint + Sampling for Trajectory Prediction IROS 2022 + + +
+ Accurate pedestrian trajectory prediction is of great importance for +downstream tasks such as autonomous driving and mobile robot navigation. Fully +investigating the social interactions within the crowd is crucial for accurate +pedestrian trajectory prediction. However, most existing methods do not capture +group level interactions well, focusing only on pairwise interactions and +neglecting group-wise interactions. In this work, we propose a hierarchical +graph convolutional network, HGCN-GJS, for trajectory prediction which well +leverages group level interactions within the crowd. Furthermore, we introduce +a novel joint sampling scheme for modeling the joint distribution of multiple +pedestrians in the future trajectories. Based on the group information, this +scheme associates the trajectory of one person with the trajectory of other +people in the group, but maintains the independence of the trajectories of +outsiders. We demonstrate the performance of our network on several trajectory +prediction datasets, achieving state-of-the-art results on all datasets +considered. + +
+
+ comment: 8 pages, 8 figures, accepted by IROS 2022 +
+
+
+
+
+ + ♻ ☆ Cross Contrasting Feature Perturbation for Domain Generalization + + +
+ Domain generalization (DG) aims to learn a robust model from source domains +that generalize well on unseen target domains. Recent studies focus on +generating novel domain samples or features to diversify distributions +complementary to source domains. Yet, these approaches can hardly deal with the +restriction that the samples synthesized from various domains can cause +semantic distortion. In this paper, we propose an online one-stage Cross +Contrasting Feature Perturbation (CCFP) framework to simulate domain shift by +generating perturbed features in the latent space while regularizing the model +prediction against domain shift. Different from the previous fixed synthesizing +strategy, we design modules with learnable feature perturbations and semantic +consistency constraints. In contrast to prior work, our method does not use any +generative-based models or domain labels. We conduct extensive experiments on a +standard DomainBed benchmark with a strict evaluation protocol for a fair +comparison. Comprehensive experiments show that our method outperforms the +previous state-of-the-art, and quantitative analyses illustrate that our +approach can alleviate the domain shift problem in out-of-distribution (OOD) +scenarios. + +
+
+
+
+
+ + ♻ ☆ EfficientTrain: Exploring Generalized Curriculum Learning for Training + Visual Backbones ICCV 2023 + + +
+ The superior performance of modern deep networks usually comes with a costly +training procedure. This paper presents a new curriculum learning approach for +the efficient training of visual backbones (e.g., vision Transformers). Our +work is inspired by the inherent learning dynamics of deep networks: we +experimentally show that at an earlier training stage, the model mainly learns +to recognize some 'easier-to-learn' discriminative patterns within each +example, e.g., the lower-frequency components of images and the original +information before data augmentation. Driven by this phenomenon, we propose a +curriculum where the model always leverages all the training data at each +epoch, while the curriculum starts with only exposing the 'easier-to-learn' +patterns of each example, and introduces gradually more difficult patterns. To +implement this idea, we 1) introduce a cropping operation in the Fourier +spectrum of the inputs, which enables the model to learn from only the +lower-frequency components efficiently, 2) demonstrate that exposing the +features of original images amounts to adopting weaker data augmentation, and +3) integrate 1) and 2) and design a curriculum learning schedule with a +greedy-search algorithm. The resulting approach, EfficientTrain, is simple, +general, yet surprisingly effective. As an off-the-shelf method, it reduces the +wall-time training cost of a wide variety of popular models (e.g., ResNet, +ConvNeXt, DeiT, PVT, Swin, and CSWin) by >1.5x on ImageNet-1K/22K without +sacrificing accuracy. It is also effective for self-supervised learning (e.g., +MAE). Code is available at https://github.com/LeapLabTHU/EfficientTrain. + +
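+
+ One way to picture the "cropping in the Fourier spectrum" operation is the
+ low-pass sketch below (our own reading; the paper's actual implementation and
+ curriculum schedule may differ):
+
+ ```python
+ import torch
+
+ def low_freq_crop(x, keep=0.5):
+     """Keep only the lowest-frequency band of an image batch by masking the
+     centered Fourier spectrum and inverting it (illustrative low-pass view)."""
+     b, c, h, w = x.shape
+     spec = torch.fft.fftshift(torch.fft.fft2(x), dim=(-2, -1))
+     mask = torch.zeros(h, w, device=x.device)
+     kh, kw = int(h * keep / 2), int(w * keep / 2)
+     mask[h // 2 - kh:h // 2 + kh, w // 2 - kw:w // 2 + kw] = 1.0
+     spec = spec * mask
+     return torch.fft.ifft2(torch.fft.ifftshift(spec, dim=(-2, -1))).real
+
+ imgs = torch.randn(2, 3, 224, 224)
+ easy = low_freq_crop(imgs, keep=0.5)   # used early; keep approaches 1 later
+ ```
+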
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ DiffIR: Efficient Diffusion Model for Image Restoration ICCV2023 + + +
+ Diffusion models (DMs) have achieved SOTA performance by modeling the image
+ synthesis process as a sequential application of a denoising network. However,
+ different from image synthesis, image restoration (IR) has a strong constraint
+ to generate results in accordance with the ground truth. Thus, for IR, it is
+ inefficient for traditional DMs to run massive iterations on a large model to
+ estimate whole images or feature maps. To address this issue, we propose an
+ efficient DM for IR (DiffIR), which consists of a compact IR prior extraction
+ network (CPEN), a dynamic IR transformer (DIRformer), and a denoising network.
+ Specifically, DiffIR has two training stages: pretraining and training the DM.
+ In pretraining, we input ground-truth images into CPEN$_{S1}$ to capture a
+ compact IR prior representation (IPR) to guide the DIRformer. In the second
+ stage, we train the DM to directly estimate the same IPR as the pretrained
+ CPEN$_{S1}$, using only LQ images. We observe that, since the IPR is only a
+ compact vector, DiffIR can use fewer iterations than traditional DMs to obtain
+ accurate estimations and generate more stable and realistic results. Since the
+ iterations are few, our DiffIR can adopt a joint optimization of CPEN$_{S2}$,
+ the DIRformer, and the denoising network, which can further reduce the
+ influence of estimation errors. We conduct extensive experiments on several IR
+ tasks and achieve SOTA performance while consuming less computational cost.
+ Code is available at \url{https://github.com/Zj-BinXia/DiffIR}.
+
+
+ comment: This paper is accepted by ICCV2023. Codes and models are available at + https://github.com/Zj-BinXia/DiffIR +
+
+
+
+
+ + ♻ ☆ MixCycle: Mixup Assisted Semi-Supervised 3D Single Object Tracking with + Cycle Consistency ICCV23 + + +
+ 3D single object tracking (SOT) is an indispensable part of automated +driving. Existing approaches rely heavily on large, densely labeled datasets. +However, annotating point clouds is both costly and time-consuming. Inspired by +the great success of cycle tracking in unsupervised 2D SOT, we introduce the +first semi-supervised approach to 3D SOT. Specifically, we introduce two +cycle-consistency strategies for supervision: 1) Self tracking cycles, which +leverage labels to help the model converge better in the early stages of +training; 2) forward-backward cycles, which strengthen the tracker's robustness +to motion variations and the template noise caused by the template update +strategy. Furthermore, we propose a data augmentation strategy named SOTMixup +to improve the tracker's robustness to point cloud diversity. SOTMixup +generates training samples by sampling points in two point clouds with a mixing +rate and assigns a reasonable loss weight for training according to the mixing +rate. The resulting MixCycle approach generalizes to appearance matching-based +trackers. On the KITTI benchmark, based on the P2B tracker, MixCycle trained +with $\textbf{10\%}$ labels outperforms P2B trained with $\textbf{100\%}$ +labels, and achieves a $\textbf{28.4\%}$ precision improvement when using +$\textbf{1\%}$ labels. Our code will be released at +\url{https://github.com/Mumuqiao/MixCycle}. + +
+
+ comment: Accepted by ICCV23 +
+
+
+
+
+ + ♻ ☆ Adaptive Segmentation Network for Scene Text Detection + + +
+ Inspired by deep convolutional segmentation algorithms, scene text detectors
+ have steadily pushed up the performance ceiling on benchmark datasets.
+ However, these methods often encounter threshold-selection bottlenecks and
+ perform poorly on text instances with extreme aspect ratios. In this paper, we
+ propose to automatically learn the discriminative segmentation threshold that
+ distinguishes text pixels from background pixels for segmentation-based scene
+ text detectors, which further reduces the time-consuming manual parameter
+ adjustment. Besides, we design a Global-information Enhanced Feature Pyramid
+ Network (GE-FPN) for capturing text instances with large sizes and extreme
+ aspect ratios. Following the GE-FPN, we introduce a cascade optimization
+ structure to further refine the text instances. Finally, together with the
+ proposed threshold learning strategy and text detection structure, we design
+ an Adaptive Segmentation Network (ASNet) for scene text detection. Extensive
+ experiments are carried out to demonstrate that the proposed ASNet achieves
+ state-of-the-art performance on four text detection benchmarks, i.e., ICDAR
+ 2015, MSRA-TD500, ICDAR 2017 MLT, and CTW1500. The ablation experiments also
+ verify the effectiveness of our contributions.
+
+
+
+
+
+ + ♻ ☆ End-to-end Remote Sensing Change Detection of Unregistered Bi-temporal + Images for Natural Disasters + + +
+ Change detection based on remote sensing images has been a prominent area of +interest in the field of remote sensing. Deep networks have demonstrated +significant success in detecting changes in bi-temporal remote sensing images +and have found applications in various fields. Given the degradation of natural +environments and the frequent occurrence of natural disasters, accurately and +swiftly identifying damaged buildings in disaster-stricken areas through remote +sensing images holds immense significance. This paper aims to investigate +change detection specifically for natural disasters. Considering that existing +public datasets used in change detection research are registered, which does +not align with the practical scenario where bi-temporal images are not matched, +this paper introduces an unregistered end-to-end change detection synthetic +dataset called xBD-E2ECD. Furthermore, we propose an end-to-end change +detection network named E2ECDNet, which takes an unregistered bi-temporal image +pair as input and simultaneously generates the flow field prediction result and +the change detection prediction result. It is worth noting that our E2ECDNet +also supports change detection for registered image pairs, as registration can +be seen as a special case of non-registration. Additionally, this paper +redefines the criteria for correctly predicting a positive case and introduces +neighborhood-based change detection evaluation metrics. The experimental +results have demonstrated significant improvements. + +
+
+
+
+
+ + ♻ ☆ LiDAR Meta Depth Completion IROS 2023 + + +
+ Depth estimation is one of the essential tasks to be addressed when creating +mobile autonomous systems. While monocular depth estimation methods have +improved in recent times, depth completion provides more accurate and reliable +depth maps by additionally using sparse depth information from other sensors +such as LiDAR. However, current methods are specifically trained for a single +LiDAR sensor. As the scanning pattern differs between sensors, every new sensor +would require re-training a specialized depth completion model, which is +computationally inefficient and not flexible. Therefore, we propose to +dynamically adapt the depth completion model to the used sensor type enabling +LiDAR adaptive depth completion. Specifically, we propose a meta depth +completion network that uses data patterns derived from the data to learn a +task network to alter weights of the main depth completion network to solve a +given depth completion task effectively. The method demonstrates a strong +capability to work on multiple LiDAR scanning patterns and can also generalize +to scanning patterns that are unseen during training. While using a single +model, our method yields significantly better results than a non-adaptive +baseline trained on different LiDAR patterns. It outperforms LiDAR-specific +expert models for very sparse cases. These advantages allow flexible deployment +of a single depth completion model on different sensors, which could also prove +valuable to process the input of nascent LiDAR technology with adaptive instead +of fixed scanning patterns. + +
+
+ comment: Accepted at IROS 2023, v2 has updated author list and fixed a figure + caption +
+
+
+
+
+ + ♻ ☆ HFGD: High-level Feature Guided Decoder for Semantic Segmentation + + +
+ Existing pyramid-based upsamplers (e.g. SemanticFPN), although efficient,
+usually produce less accurate results compared to dilation-based models when
+using the same backbone. This is partially caused by the contaminated
+high-level features, since they are fused and fine-tuned with noisy low-level
+features on limited data. To address this issue, we propose to use powerful
+pretrained high-level features as guidance (HFG) when learning to upsample the
+fine-grained low-level features. Specifically, the class tokens are trained
+along with only the high-level features from the backbone. These class tokens
+are reused by the upsampler for classification, guiding the upsampler features
+towards more discriminative backbone features. One key design of the HFG is to
+protect the high-level features from being contaminated by applying proper
+stop-gradient operations, so that the backbone does not update according to the
+gradient from the upsampler. To push the upper limit of HFG, we introduce a
+context augmentation encoder (CAE) that efficiently and effectively operates
+on low-resolution high-level features, resulting in improved representation
+and thus better guidance. We evaluate the proposed method on three benchmarks:
+Pascal Context, COCOStuff164k, and Cityscapes. Our method achieves
+state-of-the-art results among methods that do not use extra training data,
+demonstrating its effectiveness and generalization ability. The complete code
+will be released.
+
+
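+ The stop-gradient protection described above can be pictured with a short,
+generic PyTorch sketch: the upsampler consumes high-level features but never
+propagates gradients back into the backbone through them. The module names,
+channel sizes, and fusion layer below are illustrative assumptions, not the
+authors' released implementation.
+
+# Illustration of the stop-gradient idea only; not the HFGD code.
+import torch
+import torch.nn as nn
+
+class GuidedUpsampler(nn.Module):
+    def __init__(self, high_ch: int, low_ch: int, num_classes: int):
+        super().__init__()
+        self.fuse = nn.Conv2d(high_ch + low_ch, low_ch, kernel_size=3, padding=1)
+        self.classify = nn.Conv2d(low_ch, num_classes, kernel_size=1)
+
+    def forward(self, high_feat, low_feat):
+        # Detach so the upsampler loss never updates the backbone via this path.
+        high_feat = high_feat.detach()
+        high_up = nn.functional.interpolate(
+            high_feat, size=low_feat.shape[-2:], mode="bilinear", align_corners=False)
+        fused = self.fuse(torch.cat([high_up, low_feat], dim=1))
+        return self.classify(fused)
+
+up = GuidedUpsampler(high_ch=768, low_ch=256, num_classes=59)
+logits = up(torch.randn(1, 768, 16, 16), torch.randn(1, 256, 64, 64))
+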
+
+ comment: Revised version, refactored presentation and added more experiments +
+
+
+
+
+ + ♻ ☆ SEMI-DiffusionInst: A Diffusion Model Based Approach for Semiconductor + Defect Classification and Segmentation + + +
+ With the continuous progression of Moore's Law, integrated circuit (IC) device
+complexity is also increasing. Scanning Electron Microscope (SEM) image based
+extensive defect inspection and accurate metrology extraction are two main
+challenges in advanced node (2 nm and beyond) technology. Deep learning (DL)
+algorithm based computer vision approaches have gained popularity in
+semiconductor defect inspection over the last few years. In this research work,
+a new semiconductor defect inspection framework "SEMI-DiffusionInst" is
+investigated and compared to previous frameworks. To the best of the authors'
+knowledge, this work is the first demonstration to accurately detect and
+precisely segment semiconductor defect patterns by using a diffusion model.
+Different feature extractor networks as backbones and data sampling strategies
+are investigated towards achieving a balanced trade-off between precision and
+computing efficiency. Our proposed approach outperforms previous work on
+overall mAP and performs comparably or better for almost all defect classes
+(per-class APs). The bounding box and segmentation mAPs achieved by the
+proposed SEMI-DiffusionInst model are improved by 3.83% and 2.10%,
+respectively. Among individual defect types, precision on line collapse and
+thin bridge defects is improved by approximately 15% on the detection task. It
+has also been shown that by tuning inference hyperparameters, inference time
+can be improved significantly without compromising model precision. Finally,
+certain limitations and a future work strategy to overcome them are discussed.
+
+
+
+ comment: 6 pages, 5 figures, To be published by IEEE in the proceedings of the + 2023 ELMAR conference +
+
+
+
+
+ + ♻ ☆ Unsafe Diffusion: On the Generation of Unsafe Images and Hateful Memes + From Text-To-Image Models + + +
+ State-of-the-art Text-to-Image models like Stable Diffusion and DALLE$\cdot$2 +are revolutionizing how people generate visual content. At the same time, +society has serious concerns about how adversaries can exploit such models to +generate unsafe images. In this work, we focus on demystifying the generation +of unsafe images and hateful memes from Text-to-Image models. We first +construct a typology of unsafe images consisting of five categories (sexually +explicit, violent, disturbing, hateful, and political). Then, we assess the +proportion of unsafe images generated by four advanced Text-to-Image models +using four prompt datasets. We find that these models can generate a +substantial percentage of unsafe images; across four models and four prompt +datasets, 14.56% of all generated images are unsafe. When comparing the four +models, we find different risk levels, with Stable Diffusion being the most +prone to generating unsafe content (18.92% of all generated images are unsafe). +Given Stable Diffusion's tendency to generate more unsafe content, we evaluate +its potential to generate hateful meme variants if exploited by an adversary to +attack a specific individual or community. We employ three image editing +methods, DreamBooth, Textual Inversion, and SDEdit, which are supported by +Stable Diffusion. Our evaluation result shows that 24% of the generated images +using DreamBooth are hateful meme variants that present the features of the +original hateful meme and the target individual/community; these generated +images are comparable to hateful meme variants collected from the real world. +Overall, our results demonstrate that the danger of large-scale generation of +unsafe images is imminent. We discuss several mitigating measures, such as +curating training data, regulating prompts, and implementing safety filters, +and encourage better safeguard tools to be developed to prevent unsafe +generation. + +
+
+ comment: To Appear in the ACM Conference on Computer and Communications + Security, November 26, 2023 +
+
+
+
+
+ + ♻ ☆ 3D-aware Blending with Generative NeRFs ICCV 2023 + + +
+ Image blending aims to combine multiple images seamlessly. It remains +challenging for existing 2D-based methods, especially when input images are +misaligned due to differences in 3D camera poses and object shapes. To tackle +these issues, we propose a 3D-aware blending method using generative Neural +Radiance Fields (NeRF), including two key components: 3D-aware alignment and +3D-aware blending. For 3D-aware alignment, we first estimate the camera pose of +the reference image with respect to generative NeRFs and then perform 3D local +alignment for each part. To further leverage 3D information of the generative +NeRF, we propose 3D-aware blending that directly blends images on the NeRF's +latent representation space, rather than raw pixel space. Collectively, our +method outperforms existing 2D baselines, as validated by extensive +quantitative and qualitative evaluations with FFHQ and AFHQ-Cat. + +
+
+ comment: ICCV 2023, Project page: https://blandocs.github.io/blendnerf +
+
+
+
+
+ + ♻ ☆ Source-free Depth for Object Pop-out ICCV 2023 + + +
+ Depth cues are known to be useful for visual perception. However, direct +measurement of depth is often impracticable. Fortunately, though, modern +learning-based methods offer promising depth maps by inference in the wild. In +this work, we adapt such depth inference models for object segmentation using +the objects' "pop-out" prior in 3D. The "pop-out" is a simple composition prior +that assumes objects reside on the background surface. Such compositional prior +allows us to reason about objects in the 3D space. More specifically, we adapt +the inferred depth maps such that objects can be localized using only 3D +information. Such separation, however, requires knowledge about contact surface +which we learn using the weak supervision of the segmentation mask. Our +intermediate representation of contact surface, and thereby reasoning about +objects purely in 3D, allows us to better transfer the depth knowledge into +semantics. The proposed adaptation method uses only the depth model without +needing the source data used for training, making the learning process +efficient and practical. Our experiments on eight datasets of two challenging +tasks, namely camouflaged object detection and salient object detection, +consistently demonstrate the benefit of our method in terms of both performance +and generalizability. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ BlindHarmony: "Blind" Harmonization for MR Images via Flow model ICCV 2023 + + +
+ In MRI, images of the same contrast (e.g., T$_1$) from the same subject can +exhibit noticeable differences when acquired using different hardware, +sequences, or scan parameters. These differences in images create a domain gap +that needs to be bridged by a step called image harmonization, to process the +images successfully using conventional or deep learning-based image analysis +(e.g., segmentation). Several methods, including deep learning-based +approaches, have been proposed to achieve image harmonization. However, they +often require datasets from multiple domains for deep learning training and may +still be unsuccessful when applied to images from unseen domains. To address +this limitation, we propose a novel concept called `Blind Harmonization', which +utilizes only target domain data for training but still has the capability to +harmonize images from unseen domains. For the implementation of blind +harmonization, we developed BlindHarmony using an unconditional flow model +trained on target domain data. The harmonized image is optimized to have a +correlation with the input source domain image while ensuring that the latent +vector of the flow model is close to the center of the Gaussian distribution. +BlindHarmony was evaluated on both simulated and real datasets and compared to +conventional methods. BlindHarmony demonstrated noticeable performance on both +datasets, highlighting its potential for future use in clinical settings. The +source code is available at: https://github.com/SNU-LIST/BlindHarmony + +
+
+
 comment: ICCV 2023 accepted. 9 pages and 5 figures for manuscript,
+ supplementary included
+
+
+
+
+
+ + ♻ ☆ STS-GAN: Can We Synthesize Solid Texture with High Fidelity from + Arbitrary 2D Exemplar? + + +
+ Solid texture synthesis (STS), an effective way to extend a 2D exemplar to a
+3D solid volume, exhibits advantages in computational photography. However,
+existing methods generally fail to accurately learn arbitrary textures, which
+may result in the failure to synthesize solid textures with high fidelity. In
+this paper, we propose a novel generative adversarial nets-based framework
+(STS-GAN) to extend the given 2D exemplar to arbitrary 3D solid textures. In
+STS-GAN, multi-scale 2D texture discriminators evaluate the similarity between
+the given 2D exemplar and slices from the generated 3D texture, encouraging the
+3D texture generator to synthesize realistic solid textures. Finally,
+experiments demonstrate that the proposed method can generate high-fidelity
+solid textures with similar visual characteristics to the 2D exemplar.
+
+
+
+
+
+
+ + ♻ ☆ Seeing through the Brain: Image Reconstruction of Visual Perception from + Human Brain Signals + + +
+ Seeing is believing; however, the underlying mechanism of how human visual
+perception is intertwined with our cognition is still a mystery. Thanks to
+recent advances in both neuroscience and artificial intelligence, we have
+been able to record visually evoked brain activities and mimic the visual
+perception ability through computational approaches. In this paper, we focus
+on visual stimulus reconstruction, i.e., reconstructing the observed images
+from portably accessible brain signals, namely electroencephalography (EEG)
+data. Since EEG signals are dynamic time series and notoriously noisy,
+processing and extracting useful information from them requires dedicated
+effort. We therefore propose a comprehensive pipeline, named NeuroImagen, for
+reconstructing visual stimulus images from EEG signals. Specifically, we
+incorporate a novel multi-level perceptual information decoding to draw
+multi-grained outputs from the given EEG data. A latent diffusion model then
+leverages the extracted information to reconstruct the high-resolution visual
+stimulus images. The experimental results illustrate the effectiveness of the
+image reconstruction and the superior quantitative performance of our proposed
+method.
+
+
+
+ comment: A preprint version of an ongoing work +
+
+
+
+
+ + ♻ ☆ ACTIVE: Towards Highly Transferable 3D Physical Camouflage for Universal + and Robust Vehicle Evasion ICCV 2023 + + +
+ Adversarial camouflage has garnered attention for its ability to attack +object detectors from any viewpoint by covering the entire object's surface. +However, universality and robustness in existing methods often fall short as +the transferability aspect is often overlooked, thus restricting their +application only to a specific target with limited performance. To address +these challenges, we present Adversarial Camouflage for Transferable and +Intensive Vehicle Evasion (ACTIVE), a state-of-the-art physical camouflage +attack framework designed to generate universal and robust adversarial +camouflage capable of concealing any 3D vehicle from detectors. Our framework +incorporates innovative techniques to enhance universality and robustness, +including a refined texture rendering that enables common texture application +to different vehicles without being constrained to a specific texture map, a +novel stealth loss that renders the vehicle undetectable, and a smooth and +camouflage loss to enhance the naturalness of the adversarial camouflage. Our +extensive experiments on 15 different models show that ACTIVE consistently +outperforms existing works on various public detectors, including the latest +YOLOv7. Notably, our universality evaluations reveal promising transferability +to other vehicle classes, tasks (segmentation models), and the real world, not +just other vehicles. + +
+
+ comment: Accepted for ICCV 2023. Main Paper with Supplementary Material. + Project Page: https://islab-ai.github.io/active-iccv2023/ +
+
+
+
+
+ + ♻ ☆ VM-NeRF: Tackling Sparsity in NeRF with View Morphing + + +
+ NeRF aims to learn a continuous neural scene representation by using a finite +set of input images taken from various viewpoints. A well-known limitation of +NeRF methods is their reliance on data: the fewer the viewpoints, the higher +the likelihood of overfitting. This paper addresses this issue by introducing a +novel method to generate geometrically consistent image transitions between +viewpoints using View Morphing. Our VM-NeRF approach requires no prior +knowledge about the scene structure, as View Morphing is based on the +fundamental principles of projective geometry. VM-NeRF tightly integrates this +geometric view generation process during the training procedure of standard +NeRF approaches. Notably, our method significantly improves novel view +synthesis, particularly when only a few views are available. Experimental +evaluation reveals consistent improvement over current methods that handle +sparse viewpoints in NeRF models. We report an increase in PSNR of up to 1.8dB +and 1.0dB when training uses eight and four views, respectively. Source code: +\url{https://github.com/mbortolon97/VM-NeRF} + +
+
+ comment: ICIAP 2023 +
+
+
+
+
+ + ♻ ☆ Towards the extraction of robust sign embeddings for low resource sign + language recognition + + +
+ Isolated Sign Language Recognition (SLR) has mostly been applied on datasets +containing signs executed slowly and clearly by a limited group of signers. In +real-world scenarios, however, we are met with challenging visual conditions, +coarticulated signing, small datasets, and the need for signer independent +models. To tackle this difficult problem, we require a robust feature extractor +to process the sign language videos. One could expect human pose estimators to +be ideal candidates. However, due to a domain mismatch with their training sets +and challenging poses in sign language, they lack robustness on sign language +data and image-based models often still outperform keypoint-based models. +Furthermore, whereas the common practice of transfer learning with image-based +models yields even higher accuracy, keypoint-based models are typically trained +from scratch on every SLR dataset. These factors limit their usefulness for +SLR. From the existing literature, it is also not clear which, if any, pose +estimator performs best for SLR. We compare the three most popular pose +estimators for SLR: OpenPose, MMPose and MediaPipe. We show that through +keypoint normalization, missing keypoint imputation, and learning a pose +embedding, we can obtain significantly better results and enable transfer +learning. We show that keypoint-based embeddings contain cross-lingual +features: they can transfer between sign languages and achieve competitive +performance even when fine-tuning only the classifier layer of an SLR model on +a target sign language. We furthermore achieve better performance using +fine-tuned transferred embeddings than models trained only on the target sign +language. The embeddings can also be learned in a multilingual fashion. The +application of these embeddings could prove particularly useful for low +resource sign languages in the future. + +
+
+
+
+
+ + ♻ ☆ Instruct-NeuralTalker: Editing Audio-Driven Talking Radiance Fields with + Instructions + + +
+ Recent neural talking radiance field methods have shown great success in +photorealistic audio-driven talking face synthesis. In this paper, we propose a +novel interactive framework that utilizes human instructions to edit such +implicit neural representations to achieve real-time personalized talking face +generation. Given a short speech video, we first build an efficient talking +radiance field, and then apply the latest conditional diffusion model for image +editing based on the given instructions and guiding implicit representation +optimization towards the editing target. To ensure audio-lip synchronization +during the editing process, we propose an iterative dataset updating strategy +and utilize a lip-edge loss to constrain changes in the lip region. We also +introduce a lightweight refinement network for complementing image details and +achieving controllable detail generation in the final rendered image. Our +method also enables real-time rendering at up to 30FPS on consumer hardware. +Multiple metrics and user verification show that our approach provides a +significant improvement in rendering quality compared to state-of-the-art +methods. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ 3D-TOGO: Towards Text-Guided Cross-Category 3D Object Generation + + +
+ Text-guided 3D object generation aims to generate 3D objects described by
+user-defined captions, which provides a flexible way to visualize what we
+imagine. Although some works have been devoted to solving this challenging
+task, these works either utilize some explicit 3D representations (e.g., mesh),
+which lack texture and require post-processing for rendering photo-realistic
+views; or require individual time-consuming optimization for every single case.
+Here, we make the first attempt to achieve generic text-guided cross-category
+3D object generation via a new 3D-TOGO model, which integrates a text-to-views
+generation module and a views-to-3D generation module. The text-to-views
+generation module is designed to generate different views of the target 3D
+object given an input caption. Prior-guidance, caption-guidance, and view
+contrastive learning are proposed to achieve better view consistency and
+caption similarity. Meanwhile, a pixelNeRF model is adopted for the views-to-3D
+generation module to obtain the implicit 3D neural representation from the
+previously-generated views. Our 3D-TOGO model generates 3D objects in the form
+of the neural radiance field with good texture and requires no time-consuming
+per-caption optimization. Besides, 3D-TOGO can control the category, color and
+shape of generated 3D objects with the input caption. Extensive experiments on
+the largest 3D object dataset (i.e., ABO) are conducted to verify that 3D-TOGO
+can better generate high-quality 3D objects according to the input captions
+across 98 different categories, in terms of PSNR, SSIM, LPIPS and CLIP-score,
+compared with text-NeRF and Dreamfields.
+
+
+
+
+
+
+ + ♻ ☆ HyperSparse Neural Networks: Shifting Exploration to Exploitation + through Adaptive Regularization ICCV'23 + + +
+ Sparse neural networks are a key factor in developing resource-efficient +machine learning applications. We propose the novel and powerful sparse +learning method Adaptive Regularized Training (ART) to compress dense into +sparse networks. Instead of the commonly used binary mask during training to +reduce the number of model weights, we inherently shrink weights close to zero +in an iterative manner with increasing weight regularization. Our method +compresses the pre-trained model knowledge into the weights of highest +magnitude. Therefore, we introduce a novel regularization loss named +HyperSparse that exploits the highest weights while conserving the ability of +weight exploration. Extensive experiments on CIFAR and TinyImageNet show that +our method leads to notable performance gains compared to other sparsification +methods, especially in extremely high sparsity regimes up to 99.8 percent model +sparsity. Additional investigations provide new insights into the patterns that +are encoded in weights with high magnitudes. + +
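+ The "increasing weight regularization that spares the highest-magnitude
+weights" idea described above can be illustrated with a generic magnitude-aware
+penalty. This is our own simplified stand-in, not the paper's HyperSparse
+formula, and the keep ratio and ramp schedule are arbitrary.
+
+# Generic illustration of a ramped, magnitude-aware sparsification penalty.
+import torch
+
+def magnitude_aware_l1(weights: torch.Tensor, keep_ratio: float = 0.002) -> torch.Tensor:
+    flat = weights.abs().flatten()
+    k = max(1, int(keep_ratio * flat.numel()))
+    threshold = torch.topk(flat, k).values.min()   # magnitude of the k-th largest weight
+    return flat[flat < threshold].sum()            # penalize only the smaller weights
+
+w = torch.randn(10000, requires_grad=True)
+for epoch in range(3):
+    lam = 1e-4 * (2 ** epoch)                      # regularization strength ramps up
+    penalty = lam * magnitude_aware_l1(w)          # added to the task loss in practice
+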
+
+ comment: ICCV'23 Workshops +
+
+
+
+
+ + ♻ ☆ Social Occlusion Inference with Vectorized Representation for Autonomous + Driving + + +
+ Autonomous vehicles must be capable of handling the occlusion of the
+environment to ensure safe and efficient driving. In urban environments,
+occlusion often arises due to other vehicles obscuring the perception of the
+ego vehicle. Since the occlusion condition can impact the trajectories of
+vehicles, the behavior of other vehicles is helpful in making inferences about
+the occlusion as a remedy for perceptual deficiencies. This paper introduces a
+novel social occlusion inference approach that learns a mapping from agent
+trajectories and scene context to an occupancy grid map (OGM) representing the
+view of the ego vehicle. Specifically, vectorized features are encoded through
+the polyline encoder to aggregate features of vectors into features of
+polylines. A transformer module is then utilized to model the high-order
+interactions of polylines. Importantly, occlusion queries are proposed to fuse
+polyline features and generate the OGM without input from the visual modality.
+To verify the performance of the vectorized representation, we design a
+baseline based on a fully transformer encoder-decoder architecture mapping the
+OGM with occlusion and historical trajectory information to the ground truth
+OGM. We evaluate our approach on an unsignalized intersection in the
+INTERACTION dataset, where it outperforms state-of-the-art results.
+
+
+
+
+
+
+ + ♻ ☆ On the Effectiveness of Spectral Discriminators for Perceptual Quality + Improvement ICCV 2023 + + +
+ Several recent studies advocate the use of spectral discriminators, which +evaluate the Fourier spectra of images for generative modeling. However, the +effectiveness of the spectral discriminators is not well interpreted yet. We +tackle this issue by examining the spectral discriminators in the context of +perceptual image super-resolution (i.e., GAN-based SR), as SR image quality is +susceptible to spectral changes. Our analyses reveal that the spectral +discriminator indeed performs better than the ordinary (a.k.a. spatial) +discriminator in identifying the differences in the high-frequency range; +however, the spatial discriminator holds an advantage in the low-frequency +range. Thus, we suggest that the spectral and spatial discriminators shall be +used simultaneously. Moreover, we improve the spectral discriminators by first +calculating the patch-wise Fourier spectrum and then aggregating the spectra by +Transformer. We verify the effectiveness of the proposed method twofold. On the +one hand, thanks to the additional spectral discriminator, our obtained SR +images have their spectra better aligned to those of the real images, which +leads to a better PD tradeoff. On the other hand, our ensembled discriminator +predicts the perceptual quality more accurately, as evidenced in the +no-reference image quality assessment task. + +
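+ The patch-wise spectrum computation mentioned above can be sketched in a few
+lines of PyTorch; the patch size and the log-magnitude choice are our
+assumptions, and the Transformer aggregation step is omitted.
+
+# Sketch of per-patch 2D Fourier spectra for a spectral discriminator input.
+import torch
+
+def patchwise_log_spectrum(img: torch.Tensor, patch: int = 16) -> torch.Tensor:
+    """img: (B, C, H, W) with H and W divisible by `patch`."""
+    b, c, h, w = img.shape
+    patches = img.unfold(2, patch, patch).unfold(3, patch, patch)  # (B,C,H/p,W/p,p,p)
+    patches = patches.permute(0, 2, 3, 1, 4, 5).reshape(b, -1, c, patch, patch)
+    spectra = torch.fft.fft2(patches)                              # per-patch 2D FFT
+    return torch.log1p(spectra.abs())                              # log-magnitude
+
+spec = patchwise_log_spectrum(torch.randn(2, 3, 64, 64))           # (2, 16, 3, 16, 16)
+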
+
+ comment: Accepted to ICCV 2023. Code and Models are publicly available at + https://github.com/Luciennnnnnn/DualFormer +
+
+
+
+
+ + ♻ ☆ DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using + Pre-trained Diffusion Models ICCV2023 + + +
+ Given a classifier, the inherent property of semantic Out-of-Distribution +(OOD) samples is that their contents differ from all legal classes in terms of +semantics, namely semantic mismatch. There is a recent work that directly +applies it to OOD detection, which employs a conditional Generative Adversarial +Network (cGAN) to enlarge semantic mismatch in the image space. While achieving +remarkable OOD detection performance on small datasets, it is not applicable to +ImageNet-scale datasets due to the difficulty in training cGANs with both input +images and labels as conditions. As diffusion models are much easier to train +and amenable to various conditions compared to cGANs, in this work, we propose +to directly use pre-trained diffusion models for semantic mismatch-guided OOD +detection, named DiffGuard. Specifically, given an OOD input image and the +predicted label from the classifier, we try to enlarge the semantic difference +between the reconstructed OOD image under these conditions and the original +input image. We also present several test-time techniques to further strengthen +such differences. Experimental results show that DiffGuard is effective on both +Cifar-10 and hard cases of the large-scale ImageNet, and it can be easily +combined with existing OOD detection techniques to achieve state-of-the-art OOD +detection results. + +
+
+ comment: Accepted by ICCV2023, with supplementary materials +
+
+
+
+
+ + ♻ ☆ SAMFlow: Eliminating Any Fragmentation in Optical Flow with Segment + Anything Model + + +
+ Optical Flow Estimation aims to find the 2D dense motion field between two +frames. Due to the limitation of model structures and training datasets, +existing methods often rely too much on local clues and ignore the integrity of +objects, resulting in fragmented motion estimation. Through theoretical +analysis, we find the pre-trained large vision models are helpful in optical +flow estimation, and we notice that the recently famous Segment Anything Model +(SAM) demonstrates a strong ability to segment complete objects, which is +suitable for solving the fragmentation problem. We thus propose a solution to +embed the frozen SAM image encoder into FlowFormer to enhance object +perception. To address the challenge of in-depth utilizing SAM in +non-segmentation tasks like optical flow estimation, we propose an Optical Flow +Task-Specific Adaption scheme, including a Context Fusion Module to fuse the +SAM encoder with the optical flow context encoder, and a Context Adaption +Module to adapt the SAM features for optical flow task with Learned +Task-Specific Embedding. Our proposed SAMFlow model reaches 0.86/2.10 +clean/final EPE and 3.55/12.32 EPE/F1-all on Sintel and KITTI-15 training set, +surpassing Flowformer by 8.5%/9.9% and 13.2%/16.3%. Furthermore, our model +achieves state-of-the-art performance on the Sintel and KITTI-15 benchmarks, +ranking #1 among all two-frame methods on Sintel clean pass. + +
+
+
+
+
+ + ♻ ☆ Scalable Surface Water Mapping up to Fine-scale using Geometric Features + of Water from Topographic Airborne LiDAR Data + + +
+ Despite substantial technological advancements, the comprehensive mapping of +surface water, particularly smaller bodies (<1ha), continues to be a challenge +due to a lack of robust, scalable methods. Standard methods require either +training labels or site-specific parameter tuning, which complicates automated +mapping and introduces biases related to training data and parameters. The +reliance on water's reflectance properties, including LiDAR intensity, further +complicates the matter, as higher-resolution images inherently produce more +noise. To mitigate these difficulties, we propose a unique method that focuses +on the geometric characteristics of water instead of its variable reflectance +properties. Unlike preceding approaches, our approach relies entirely on 3D +coordinate observations from airborne LiDAR data, taking advantage of the +principle that connected surface water remains flat due to gravity. By +harnessing this natural law in conjunction with connectivity, our method can +accurately and scalably identify small water bodies, eliminating the need for +training labels or repetitive parameter tuning. Consequently, our approach +enables the creation of comprehensive 3D topographic maps that include both +water and terrain, all performed in an unsupervised manner using only airborne +laser scanning data, potentially enhancing the process of generating reliable +3D topographic maps. We validated our method across extensive and diverse +landscapes, while comparing it to highly competitive Normalized Difference +Water Index (NDWI)-based methods and assessing it using a reference surface +water map. In conclusion, our method offers a new approach to address +persistent difficulties in robust, scalable surface water mapping and 3D +topographic mapping, using solely airborne LiDAR data. + +
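+ The "flat and connected" principle described above can be pictured with a toy
+rasterized version: flag grid cells whose points have almost no elevation
+spread, then keep sufficiently large connected groups of such cells. Cell size,
+spread threshold, and minimum region size are illustrative assumptions, not the
+paper's parameters.
+
+# Toy sketch of flatness-plus-connectivity water candidate extraction.
+import numpy as np
+from scipy import ndimage
+
+def flat_connected_cells(xyz: np.ndarray, cell: float = 1.0,
+                         max_spread: float = 0.05, min_cells: int = 50):
+    """xyz: (N, 3) LiDAR points in metres; returns a labeled 2D candidate mask."""
+    ij = np.floor((xyz[:, :2] - xyz[:, :2].min(axis=0)) / cell).astype(int)
+    shape = ij.max(axis=0) + 1
+    zmin = np.full(shape, np.inf)
+    zmax = np.full(shape, -np.inf)
+    count = np.zeros(shape)
+    np.minimum.at(zmin, (ij[:, 0], ij[:, 1]), xyz[:, 2])
+    np.maximum.at(zmax, (ij[:, 0], ij[:, 1]), xyz[:, 2])
+    np.add.at(count, (ij[:, 0], ij[:, 1]), 1)
+    flat = (count > 0) & ((zmax - zmin) < max_spread)   # near-constant elevation
+    labels, n = ndimage.label(flat)                     # connected flat regions
+    sizes = ndimage.sum(flat, labels, index=range(1, n + 1))
+    keep = np.isin(labels, 1 + np.flatnonzero(sizes >= min_cells))
+    return labels * keep
+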
+
+
+
+
+ + ♻ ☆ An unsupervised, open-source workflow for 2D and 3D building mapping + from airborne LiDAR data + + +
+ Despite the substantial demand for high-quality, large-area building maps, no +established open-source workflow for generating 2D and 3D maps currently +exists. This study introduces an automated, open-source workflow for +large-scale 2D and 3D building mapping utilizing airborne LiDAR data. Uniquely, +our workflow operates entirely unsupervised, eliminating the need for any +training procedures. We have integrated a specifically tailored DTM generation +algorithm into our workflow to prevent errors in complex urban landscapes, +especially around highways and overpasses. Through fine rasterization of LiDAR +point clouds, we've enhanced building-tree differentiation, reduced errors near +water bodies, and augmented computational efficiency by introducing a new +planarity calculation. Our workflow offers a practical and scalable solution +for the mass production of rasterized 2D and 3D building maps from raw airborne +LiDAR data. Also, we elaborate on the influence of parameters and potential +error sources to provide users with practical guidance. Our method's robustness +has been rigorously optimized and tested using an extensive dataset (> 550 +km$^2$), and further validated through comparison with deep learning-based and +hand-digitized products. Notably, through these unparalleled, large-scale +comparisons, we offer a valuable analysis of large-scale building maps +generated via different methodologies, providing insightful evaluations of the +effectiveness of each approach. We anticipate that our highly scalable building +mapping workflow will facilitate the production of reliable 2D and 3D building +maps, fostering advances in large-scale urban analysis. The code will be +released upon publication. + +
+
+
+
+
+ + ♻ ☆ 3D Semantic Subspace Traverser: Empowering 3D Generative Model with + Shape Editing Capability ICCV 2023 + + +
+ Shape generation is the practice of producing 3D shapes as various
+representations for 3D content creation. Previous studies on 3D shape
+generation have focused on shape quality and structure, with little or no
+consideration of semantic information. Consequently, such generative models
+often fail to preserve the semantic consistency of shape structure or enable
+manipulation of the semantic attributes of shapes during generation. In this
+paper, we propose a novel semantic generative model named 3D Semantic Subspace
+Traverser that utilizes semantic attributes for category-specific 3D shape
+generation and editing. Our method utilizes implicit functions as the 3D shape
+representation and combines a novel latent-space GAN with a linear subspace
+model to discover semantic dimensions in the local latent space of 3D shapes.
+Each dimension of the subspace corresponds to a particular semantic attribute,
+and we can edit the attributes of generated shapes by traversing the
+coefficients of those dimensions. Experimental results demonstrate that our
+method can produce plausible shapes with complex structures and enable the
+editing of semantic attributes. The code and trained models are available at
+https://github.com/TrepangCat/3D_Semantic_Subspace_Traverser
+
+
+
+ comment: Published in ICCV 2023. Code: + https://github.com/TrepangCat/3D_Semantic_Subspace_Traverser +
+
+
+
+
+ + ♻ ☆ Swin3D: A Pretrained Transformer Backbone for 3D Indoor Scene + Understanding + + +
+ The use of pretrained backbones with fine-tuning has been successful for 2D +vision and natural language processing tasks, showing advantages over +task-specific networks. In this work, we introduce a pretrained 3D backbone, +called {\SST}, for 3D indoor scene understanding. We design a 3D Swin +transformer as our backbone network, which enables efficient self-attention on +sparse voxels with linear memory complexity, making the backbone scalable to +large models and datasets. We also introduce a generalized contextual relative +positional embedding scheme to capture various irregularities of point signals +for improved network performance. We pretrained a large {\SST} model on a +synthetic Structured3D dataset, which is an order of magnitude larger than the +ScanNet dataset. Our model pretrained on the synthetic dataset not only +generalizes well to downstream segmentation and detection on real 3D point +datasets, but also outperforms state-of-the-art methods on downstream tasks +with +2.3 mIoU and +2.2 mIoU on S3DIS Area5 and 6-fold semantic segmentation, ++1.8 mIoU on ScanNet segmentation (val), +1.9 mAP@0.5 on ScanNet detection, and ++8.1 mAP@0.5 on S3DIS detection. A series of extensive ablation studies further +validate the scalability, generality, and superior performance enabled by our +approach. The code and models are available at +https://github.com/microsoft/Swin3D . + +
+
+ comment: Project page: https://yukichiii.github.io/project/swin3D/swin3D.html +
+
+
+
+
+ + ♻ ☆ Interaction-Aware Personalized Vehicle Trajectory Prediction Using + Temporal Graph Neural Networks + + +
+ Accurate prediction of vehicle trajectories is vital for advanced driver +assistance systems and autonomous vehicles. Existing methods mainly rely on +generic trajectory predictions derived from large datasets, overlooking the +personalized driving patterns of individual drivers. To address this gap, we +propose an approach for interaction-aware personalized vehicle trajectory +prediction that incorporates temporal graph neural networks. Our method +utilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to +model the spatio-temporal interactions between target vehicles and their +surrounding traffic. To personalize the predictions, we establish a pipeline +that leverages transfer learning: the model is initially pre-trained on a +large-scale trajectory dataset and then fine-tuned for each driver using their +specific driving data. We employ human-in-the-loop simulation to collect +personalized naturalistic driving trajectories and corresponding surrounding +vehicle trajectories. Experimental results demonstrate the superior performance +of our personalized GCN-LSTM model, particularly for longer prediction +horizons, compared to its generic counterpart. Moreover, the personalized model +outperforms individual models created without pre-training, emphasizing the +significance of pre-training on a large dataset to avoid overfitting. By +incorporating personalization, our approach enhances trajectory prediction +accuracy. + +
+
+
+
+
+ + ♻ ☆ High-Performance Fine Defect Detection in Artificial Leather Using Dual + Feature Pool Object Detection + + +
+ In this study, the structural limitations of the YOLOv5 model were analyzed in
+depth. Based on the characteristics of fine defects in artificial leather, four
+innovative structures, namely DFP, IFF, AMP, and EOS, were designed. These
+advancements led to the proposal of a high-performance artificial leather fine
+defect detection model named YOLOD. YOLOD demonstrated outstanding performance
+on the artificial leather defect dataset, achieving an impressive increase of
+11.7% - 13.5% in AP_50 compared to YOLOv5, along with a significant reduction
+of 5.2% - 7.2% in the error detection rate. Moreover, YOLOD also exhibited
+remarkable performance on the general MS-COCO dataset, with an increase of 0.4%
+- 2.6% in AP compared to YOLOv5, and a rise of 2.5% - 4.1% in AP_S compared to
+YOLOv5. These results demonstrate the superiority of YOLOD in both artificial
+leather defect detection and general object detection tasks, making it a highly
+efficient and effective model for real-world applications.
+
+
+
+
+
+
+ + ♻ ☆ YOLOCS: Object Detection based on Dense Channel Compression for Feature + Spatial Solidification + + +
+ In this study, we examine the associations between channel features and +convolutional kernels during the processes of feature purification and gradient +backpropagation, with a focus on the forward and backward propagation within +the network. Consequently, we propose a method called Dense Channel Compression +for Feature Spatial Solidification. Drawing upon the central concept of this +method, we introduce two innovative modules for backbone and head networks: the +Dense Channel Compression for Feature Spatial Solidification Structure (DCFS) +and the Asymmetric Multi-Level Compression Decoupled Head (ADH). When +integrated into the YOLOv5 model, these two modules demonstrate exceptional +performance, resulting in a modified model referred to as YOLOCS. Evaluated on +the MSCOCO dataset, the large, medium, and small YOLOCS models yield AP of +50.1%, 47.6%, and 42.5%, respectively. Maintaining inference speeds remarkably +similar to those of the YOLOv5 model, the large, medium, and small YOLOCS +models surpass the YOLOv5 model's AP by 1.1%, 2.3%, and 5.2%, respectively. + +
+
+
+
+
+ + ♻ ☆ A Two-Step Deep Learning Method for 3DCT-2DUS Kidney Registration During + Breathing + + +
+ This work proposes a novel deep registration pipeline for 3D CT and 2D U/S
+kidney scans under free breathing, which consists of a feature network and a
+3D-2D CNN-based registration network. The feature network has handcrafted
+texture feature layers to reduce the semantic gap. The registration network has
+an encoder-decoder structure with a feature-image-motion (FIM) loss, which
+enables hierarchical regression at the decoder layers and avoids concatenating
+multiple networks. It was first pretrained on retrospective datasets with a
+training data generation strategy, and then adapted to specific patient data
+via unsupervised one-cycle transfer learning during onsite application. The
+experiments used 132 U/S sequences, 39 multi-phase CT and 210 public
+single-phase CT images, and 25 pairs of CT and U/S sequences. The method
+achieved a mean contour distance (MCD) of 0.94 mm between kidneys on CT and U/S
+images and an MCD of 1.15 mm between CT and reference CT images. For datasets
+with small transformations, the MCDs were 0.82 mm and 1.02 mm, respectively;
+for large transformations, they were 1.10 mm and 1.28 mm, respectively. This
+work addresses difficulties in 3DCT-2DUS kidney registration during free
+breathing via novel network structures and a training strategy.
+
+
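+ The mean contour distance (MCD) figures quoted above can be read as an average
+nearest-point distance between two contours; the symmetric averaging convention
+in this sketch is our assumption, since the abstract does not spell it out.
+
+# Hypothetical sketch of a symmetric mean contour distance between point sets.
+import numpy as np
+from scipy.spatial import cKDTree
+
+def mean_contour_distance(contour_a: np.ndarray, contour_b: np.ndarray) -> float:
+    """contour_a, contour_b: (N, 2) or (N, 3) arrays of contour points in mm."""
+    d_ab = cKDTree(contour_b).query(contour_a)[0]   # nearest B point for each A point
+    d_ba = cKDTree(contour_a).query(contour_b)[0]   # nearest A point for each B point
+    return 0.5 * (d_ab.mean() + d_ba.mean())
+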
+
+ comment: 16 pages, 8 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Convex Decomposition of Indoor Scenes + + +
+ We describe a method to parse a complex, cluttered indoor scene into +primitives which offer a parsimonious abstraction of scene structure. Our +primitives are simple convexes. Our method uses a learned regression procedure +to parse a scene into a fixed number of convexes from RGBD input, and can +optionally accept segmentations to improve the decomposition. The result is +then polished with a descent method which adjusts the convexes to produce a +very good fit, and greedily removes superfluous primitives. Because the entire +scene is parsed, we can evaluate using traditional depth, normal, and +segmentation error metrics. Our evaluation procedure demonstrates that the +error from our primitive representation is comparable to that of predicting +depth from a single image. + +
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Ablating Concepts in Text-to-Image Diffusion Models ICCV 2023 + + +
+ Large-scale text-to-image diffusion models can generate high-fidelity images +with powerful compositional ability. However, these models are typically +trained on an enormous amount of Internet data, often containing copyrighted +material, licensed images, and personal photos. Furthermore, they have been +found to replicate the style of various living artists or memorize exact +training samples. How can we remove such copyrighted concepts or images without +retraining the model from scratch? To achieve this goal, we propose an +efficient method of ablating concepts in the pretrained model, i.e., preventing +the generation of a target concept. Our algorithm learns to match the image +distribution for a target style, instance, or text prompt we wish to ablate to +the distribution corresponding to an anchor concept. This prevents the model +from generating target concepts given its text condition. Extensive experiments +show that our method can successfully prevent the generation of the ablated +concept while preserving closely related concepts in the model. + +
+
+ comment: ICCV 2023. Project website: https://www.cs.cmu.edu/~concept-ablation/ +
+
+
+
+
+
+
+
+ + Information Retrieval 14 + +
+
+
+ + ☆ A Bi-Step Grounding Paradigm for Large Language Models in Recommendation + Systems + + +
+ As the focus on Large Language Models (LLMs) in the field of recommendation +intensifies, the optimization of LLMs for recommendation purposes (referred to +as LLM4Rec) assumes a crucial role in augmenting their effectiveness in +providing recommendations. However, existing approaches for LLM4Rec often +assess performance using restricted sets of candidates, which may not +accurately reflect the models' overall ranking capabilities. In this paper, our +objective is to investigate the comprehensive ranking capacity of LLMs and +propose a two-step grounding framework known as BIGRec (Bi-step Grounding +Paradigm for Recommendation). It initially grounds LLMs to the recommendation +space by fine-tuning them to generate meaningful tokens for items and +subsequently identifies appropriate actual items that correspond to the +generated tokens. By conducting extensive experiments on two datasets, we +substantiate the superior performance, capacity for handling few-shot +scenarios, and versatility across multiple domains exhibited by BIGRec. +Furthermore, we observe that the marginal benefits derived from increasing the +quantity of training samples are modest for BIGRec, implying that LLMs possess +the limited capability to assimilate statistical information, such as +popularity and collaborative filtering, due to their robust semantic priors. +These findings also underline the efficacy of integrating diverse statistical +information into the LLM4Rec framework, thereby pointing towards a potential +avenue for future research. Our code and data are available at +https://github.com/SAI990323/Grounding4Rec. + +
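+ The second ("grounding") step can be pictured as matching the item text the
+LLM generates against the real catalog; the paper's actual grounding procedure
+may differ, so the string-matching stand-in and the tiny catalog below are
+purely illustrative.
+
+# Toy illustration of grounding a generated title to the closest catalog item.
+import difflib
+
+catalog = ["The Matrix (1999)", "Spirited Away (2001)", "Blade Runner 2049 (2017)"]
+generated_title = "Blade Runner 2049"            # token sequence produced by the LLM
+
+match = difflib.get_close_matches(generated_title, catalog, n=1, cutoff=0.0)
+print(match[0])                                  # -> "Blade Runner 2049 (2017)"
+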
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Knowledge-Enhanced Multi-Label Few-Shot Product Attribute-Value + Extraction CIKM 2023 + + +
+ Existing attribute-value extraction (AVE) models require large quantities of +labeled data for training. However, new products with new attribute-value pairs +enter the market every day in real-world e-Commerce. Thus, we formulate AVE in +multi-label few-shot learning (FSL), aiming to extract unseen attribute value +pairs based on a small number of training examples. We propose a +Knowledge-Enhanced Attentive Framework (KEAF) based on prototypical networks, +leveraging the generated label description and category information to learn +more discriminative prototypes. Besides, KEAF integrates with hybrid attention +to reduce noise and capture more informative semantics for each class by +calculating the label-relevant and query-related weights. To achieve +multi-label inference, KEAF further learns a dynamic threshold by integrating +the semantic information from both the support set and the query set. Extensive +experiments with ablation studies conducted on two datasets demonstrate that +KEAF outperforms other SOTA models for information extraction in FSL. The code +can be found at: https://github.com/gjiaying/KEAF + +
+
+ comment: 6 pages, 2 figures, published in CIKM 2023 +
+
+
+
+
+ + ☆ Content-based Recommendation Engine for Video Streaming Platform + + +
+ A recommendation engine suggests content, products, or services to the user
+using machine learning algorithms. This paper proposes a content-based
+recommendation engine that provides video suggestions to users based on their
+previous interests and choices. We use the TF-IDF text vectorization method to
+determine the relevance of words in a document, and then compute the similarity
+between items using cosine similarity. Finally, the engine recommends videos to
+users based on the obtained similarity scores. In addition, we measure the
+engine's performance by computing the precision, recall, and F1 score of the
+proposed system.
+
+
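+ A minimal sketch of the kind of TF-IDF plus cosine-similarity pipeline the
+abstract describes is shown below; the toy item descriptions are assumptions,
+not data from the paper.
+
+# Minimal content-based recommender: TF-IDF vectors ranked by cosine similarity.
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+descriptions = [
+    "space adventure with robots",
+    "romantic comedy set in paris",
+    "documentary about deep sea robots",
+]
+
+vectorizer = TfidfVectorizer(stop_words="english")
+tfidf = vectorizer.fit_transform(descriptions)      # (n_items, n_terms)
+similarity = cosine_similarity(tfidf)               # (n_items, n_items)
+
+def recommend(item_idx: int, top_k: int = 2):
+    ranked = similarity[item_idx].argsort()[::-1]   # most similar first
+    return [i for i in ranked if i != item_idx][:top_k]
+
+print(recommend(0))                                 # items most similar to item 0
+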
+
+
+
+
+ + ☆ Advancing continual lifelong learning in neural information retrieval: + definition, dataset, framework, and empirical evaluation + + +
+ Continual learning refers to the capability of a machine learning model to +learn and adapt to new information, without compromising its performance on +previously learned tasks. Although several studies have investigated continual +learning methods for information retrieval tasks, a well-defined task +formulation is still lacking, and it is unclear how typical learning strategies +perform in this context. To address this challenge, a systematic task +formulation of continual neural information retrieval is presented, along with +a multiple-topic dataset that simulates continuous information retrieval. A +comprehensive continual neural information retrieval framework consisting of +typical retrieval models and continual learning strategies is then proposed. +Empirical evaluations illustrate that the proposed framework can successfully +prevent catastrophic forgetting in neural information retrieval and enhance +performance on previously learned tasks. The results indicate that +embedding-based retrieval models experience a decline in their continual +learning performance as the topic shift distance and dataset volume of new +tasks increase. In contrast, pretraining-based models do not show any such +correlation. Adopting suitable learning strategies can mitigate the effects of +topic shift and data augmentation. + +
+
+ comment: Submitted to Information Sciences +
+
+
+
+
+ + ☆ Is Meta-Learning the Right Approach for the Cold-Start Problem in + Recommender Systems? + + +
+ Recommender systems have become fundamental building blocks of modern online
+products and services, and have a substantial impact on user experience. In the
+past few years, deep learning methods have attracted a lot of research, and are
+now heavily used in modern real-world recommender systems. Nevertheless,
+dealing with recommendations in the cold-start setting, e.g., when a user has
+had only limited interactions with the system, is a problem that remains far
+from solved. Meta-learning techniques, and in particular optimization-based
+meta-learning, have recently become the most popular approaches in the academic
+research literature for tackling the cold-start problem in deep learning models
+for recommender systems. However, current meta-learning approaches are not
+practical for real-world recommender systems, which have billions of users and
+items, and strict latency requirements. In this paper, we show that it is
+possible to obtain similar, or higher, performance on commonly used benchmarks
+for the cold-start problem without using meta-learning techniques. In more
+detail, we show that, when tuned correctly, standard and widely adopted deep
+learning models perform just as well as newer meta-learning models. We further
+show that an extremely simple modular approach using common representation
+learning techniques can perform comparably to meta-learning techniques
+specifically designed for the cold-start setting while being much more easily
+deployable in real-world applications.
+
+
+
+
+
+
+ + ☆ Phase Retrieval with Background Information: Decreased References and + Efficient Methods + + +
+ Fourier phase retrieval (PR) is a severely ill-posed inverse problem that
+arises in various applications. To guarantee a unique solution and relieve the
+dependence on the initialization, background information can be exploited as a
+structural prior. However, the requirement for background information may
+become challenging when moving to high-resolution imaging. At the same time,
+the previously proposed projected gradient descent (PGD) method also demands a
+large amount of background information.
+ In this paper, we present an improved theoretical result on the amount of
+background information required, along with two Douglas-Rachford (DR) based
+methods. Analytically, we demonstrate that the background required to ensure a
+unique solution can be decreased by nearly $1/2$ for 2-D signals compared to
+1-D signals. By generalizing the results to $d$ dimensions, we show that
+background information longer than $(2^{\frac{d+1}{d}}-1)$ times the signal
+length is sufficient to ensure uniqueness. We also analyze the stability and
+robustness of the model when measurements and background information are
+corrupted by noise. Furthermore, two methods called Background Douglas-Rachford
+(BDR) and Convex Background Douglas-Rachford (CBDR) are proposed. BDR, a
+non-convex method, is proven to have a local R-linear convergence rate under
+mild assumptions. In contrast, the CBDR method uses convexification techniques
+and is proven to have a global convergence guarantee as long as the background
+information is sufficient. To support this, a new property called F-RIP is
+established. We test the performance of the proposed methods through
+simulations as well as real experimental measurements, and demonstrate that
+they achieve a higher recovery rate with less background information compared
+to the PGD method.
+
+
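+ As a quick numerical reading of the $(2^{\frac{d+1}{d}}-1)$ factor quoted
+above (our own arithmetic, not a computation from the paper):
+
+# Evaluate the background-to-signal length factor for small dimensions d.
+for d in (1, 2, 3):
+    factor = 2 ** ((d + 1) / d) - 1
+    print(f"d={d}: background length > {factor:.2f} x signal length")
+# d=1: 3.00, d=2: ~1.83, d=3: ~1.52, so the 2-D requirement is well below the 1-D one.
+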
+
+
+
+
+ + ☆ Pre-training with Large Language Model-based Document Expansion for + Dense Passage Retrieval + + +
+ In this paper, we systematically study the potential of pre-training with +Large Language Model(LLM)-based document expansion for dense passage retrieval. +Concretely, we leverage the capabilities of LLMs for document expansion, i.e. +query generation, and effectively transfer expanded knowledge to retrievers +using pre-training strategies tailored for passage retrieval. These strategies +include contrastive learning and bottlenecked query generation. Furthermore, we +incorporate a curriculum learning strategy to reduce the reliance on LLM +inferences. Experimental results demonstrate that pre-training with LLM-based +document expansion significantly boosts the retrieval performance on +large-scale web-search tasks. Our work shows strong zero-shot and out-of-domain +retrieval abilities, making it more widely applicable for retrieval when +initializing with no human-labeled data. + +
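+ One common reading of the contrastive pre-training strategy mentioned above is
+an in-batch-negative loss between generated-query and passage embeddings; the
+encoder, batch construction, and temperature below are illustrative assumptions
+rather than the paper's exact setup.
+
+# Generic in-batch-negative contrastive loss over (generated query, passage) pairs.
+import torch
+import torch.nn.functional as F
+
+def in_batch_contrastive_loss(q_emb: torch.Tensor, p_emb: torch.Tensor,
+                              temperature: float = 0.05) -> torch.Tensor:
+    """q_emb, p_emb: (batch, dim) embeddings of queries and their own passages."""
+    q_emb = F.normalize(q_emb, dim=-1)
+    p_emb = F.normalize(p_emb, dim=-1)
+    logits = q_emb @ p_emb.t() / temperature        # (batch, batch) similarities
+    labels = torch.arange(q_emb.size(0), device=q_emb.device)
+    return F.cross_entropy(logits, labels)          # diagonal pairs are the positives
+
+loss = in_batch_contrastive_loss(torch.randn(8, 768), torch.randn(8, 768))
+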
+
+ comment: 10 pages, 3 tables, 4 figures, under review +
+
+
+
+
+ + ☆ Uncovering User Interest from Biased and Noised Watch Time in Video + Recommendation + + +
+ In video recommendation, watch time is commonly adopted as an indicator of
+user interest. However, watch time is not only influenced by the matching of
+users' interests but also by other factors, such as duration bias and noisy
+watching. Duration bias refers to the tendency for users to spend more time on
+videos with longer durations, regardless of their actual interest level. Noisy
+watching, on the other hand, describes users taking time to determine whether
+they like a video or not, which can result in users spending time watching
+videos they do not like. Consequently, the existence of duration bias and noisy
+watching makes watch time an inadequate label for indicating user interest.
+Furthermore, current methods primarily address duration bias and ignore the
+impact of noisy watching, which may limit their effectiveness in uncovering
+user interest from watch time. In this study, we first analyze the generation
+mechanism of users' watch time from a unified causal viewpoint. Specifically,
+we consider watch time a mixture of the user's actual interest level, the
+duration-biased watch time, and the noisy watch time. To mitigate both the
+duration bias and noisy watching, we propose Debiased and Denoised watch time
+Correction (D$^2$Co), which can be divided into two steps: first, we employ a
+duration-wise Gaussian Mixture Model plus a frequency-weighted moving average
+to estimate the bias and noise terms; then we utilize a sensitivity-controlled
+correction function to separate the user interest from the watch time, which is
+robust to the estimation error of the bias and noise terms. The experiments on
+two public video recommendation datasets and online A/B testing indicate the
+effectiveness of the proposed method.
+
+
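+ The first step of D$^2$Co, as we read it, fits a Gaussian mixture to watch
+times within each duration bucket; the bucketing, component count, and the
+synthetic arrays below are our assumptions for illustration, not the authors'
+released code.
+
+# Rough sketch of a duration-bucketed Gaussian Mixture fit over watch times.
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+rng = np.random.default_rng(0)
+durations = rng.uniform(10, 600, size=5000)        # video duration (s), fake data
+watch_times = rng.exponential(60, size=5000)       # watch time (s), fake data
+
+bucket_edges = np.quantile(durations, np.linspace(0, 1, 11))   # 10 duration buckets
+bucket_ids = np.clip(np.digitize(durations, bucket_edges) - 1, 0, 9)
+
+bucket_models = {}
+for b in range(10):
+    wt = watch_times[bucket_ids == b].reshape(-1, 1)
+    if len(wt) > 10:
+        # Two components as a crude stand-in for separating typical vs. noisy watches.
+        bucket_models[b] = GaussianMixture(n_components=2, random_state=0).fit(wt)
+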
+
+ comment: Accepted by Recsys'23 +
+
+
+
+
+ + ☆ Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme + Detection ACM MM + + +
+ Hateful meme detection is a challenging multimodal task that requires +comprehension of both vision and language, as well as cross-modal interactions. +Recent studies have tried to fine-tune pre-trained vision-language models +(PVLMs) for this task. However, with increasing model sizes, it becomes +important to leverage powerful PVLMs more efficiently, rather than simply +fine-tuning them. Recently, researchers have attempted to convert meme images +into textual captions and prompt language models for predictions. This approach +has shown good performance but suffers from non-informative image captions. +Considering the two factors mentioned above, we propose a probing-based +captioning approach to leverage PVLMs in a zero-shot visual question answering +(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful +content-related questions and use the answers as image captions (which we call +Pro-Cap), so that the captions contain information critical for hateful content +detection. The good performance of models with Pro-Cap on three benchmarks +validates the effectiveness and generalization of the proposed method. + +
+
+ comment: Camera-ready for 23, ACM MM +
+
+
+
+
+ + ♻ ☆ LLM-Rec: Personalized Recommendation via Prompting Large Language Models + + +
+ We investigate various prompting strategies for enhancing personalized +recommendation performance with large language models (LLMs) through input +augmentation. Our proposed approach, termed LLM-Rec, encompasses four distinct +prompting strategies: (1) basic prompting, (2) recommendation-driven prompting, +(3) engagement-guided prompting, and (4) recommendation-driven + +engagement-guided prompting. Our empirical experiments show that incorporating +the augmented input text generated by LLM leads to improved recommendation +performance. Recommendation-driven and engagement-guided prompting strategies +are found to elicit LLM's understanding of global and local item +characteristics. This finding highlights the importance of leveraging diverse +prompts and input augmentation techniques to enhance the recommendation +capabilities with LLMs. + +
+
+
+
+
+ + ♻ ☆ Editing Language Model-based Knowledge Graph Embeddings + + +
+ Recent decades have witnessed the empirical success of framing Knowledge
+Graph (KG) embeddings via language models. However, language model-based KG
+embeddings are usually deployed as static artifacts, making them difficult to
+modify after deployment without re-training. To address this issue, we propose
+in this paper a new task of editing language model-based KG embeddings. This
+task is designed to facilitate rapid, data-efficient updates to KG embeddings
+without compromising performance in other aspects. We build four new datasets:
+E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and evaluate several knowledge
+editing baselines, demonstrating the limited ability of previous models to
+handle the proposed challenging task. We further propose a simple yet strong
+baseline dubbed KGEditor, which utilizes additional parametric layers of a
+hypernetwork to edit/add facts. Our comprehensive experimental results reveal
+that KGEditor excels at updating specific facts without impacting the overall
+performance, even when faced with limited training resources. Code and datasets
+are available at
+https://github.com/zjunlp/PromptKG/tree/main/deltaKG.
+
+
+
+ comment: Work in progress and the project website is + https://zjunlp.github.io/project/KGE_Editing/ +
+
+
+
+
+ + ♻ ☆ A Survey on Point-of-Interest Recommendations Leveraging Heterogeneous + Data + + +
+ Tourism is an important application domain for recommender systems. In this +domain, recommender systems are for example tasked with providing personalized +recommendations for transportation, accommodation, points-of-interest (POIs), +or tourism services. Among these tasks, in particular the problem of +recommending POIs that are of likely interest to individual tourists has gained +growing attention in recent years. Providing POI recommendations to tourists +\emph{during their trip} can however be especially challenging due to the +variability of the users' context. With the rapid development of the Web and +today's multitude of online services, vast amounts of data from various sources +have become available, and these heterogeneous data sources represent a huge +potential to better address the challenges of in-trip POI recommendation +problems. In this work, we provide a comprehensive survey of published research +on POI recommendation between 2017 and 2022 from the perspective of +heterogeneous data sources. Specifically, we investigate which types of data +are used in the literature and which technical approaches and evaluation +methods are predominant. Among other aspects, we find that today's research +works often focus on a narrow range of data sources, leaving great potential +for future works that better utilize heterogeneous data sources and diverse +data types for improved in-trip recommendations. + +
+
+ comment: 35 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ SPM: Structured Pretraining and Matching Architectures for Relevance + Modeling in Meituan Search CIKM '23 + + +
+ In e-commerce search, relevance between queries and documents is an essential
+requirement for a satisfying user experience. Different from traditional
+e-commerce platforms that offer products, users search on life service
+platforms such as Meituan mainly for product providers, which usually have
+abundant structured information, e.g., name, address, category, and thousands
+of products. Modeling search relevance with these rich structured contents is
+challenging due to the following issues: (1) there is a language distribution
+discrepancy among the different fields of a structured document, making it
+difficult to directly adopt off-the-shelf pretrained language model-based
+methods like BERT; (2) different fields usually differ in importance and their
+lengths vary greatly, making it difficult to extract document information
+helpful for relevance matching.
+ To tackle these issues, in this paper we propose a novel two-stage
+pretraining and matching architecture for relevance matching with rich
+structured documents. At the pretraining stage, we propose an effective
+pretraining method that employs both the query and multiple document fields as
+inputs, including an effective information compression method for lengthy
+fields. At the relevance matching stage, a novel matching method is proposed by
+leveraging domain knowledge in the search query to generate more effective
+document representations for relevance scoring. Extensive offline experiments
+and online A/B tests on millions of users verify that the proposed
+architectures effectively improve the performance of relevance modeling. The
+model has already been deployed online, serving the search traffic of Meituan
+for over a year.
+
+
+
+ comment: Accepted by CIKM '23 +
+
+
+
+
+ + ♻ ☆ Beyond Semantics: Learning a Behavior Augmented Relevance Model with + Self-supervised Learning + + +
+ Relevance modeling aims to locate desirable items for corresponding queries, +which is crucial for search engines to ensure user experience. Although most +conventional approaches address this problem by assessing the semantic +similarity between the query and item, pure semantic matching is not +everything. + +
+
+ comment: Partial content +
+
+
+
+
+
+
+
+ + Machine Learning 114 + +
+
+
+ + ☆ Proprioceptive Learning with Soft Polyhedral Networks + + +
+ Proprioception is the "sixth sense" that detects limb postures with motor +neurons. It requires a natural integration between the musculoskeletal systems +and sensory receptors, which is challenging among modern robots that aim for +lightweight, adaptive, and sensitive designs at a low cost. Here, we present +the Soft Polyhedral Network with an embedded vision for physical interactions, +capable of adaptive kinesthesia and viscoelastic proprioception by learning +kinetic features. This design enables passive adaptations to omni-directional +interactions, visually captured by a miniature high-speed motion tracking +system embedded inside for proprioceptive learning. The results show that the +soft network can infer real-time 6D forces and torques with accuracies of +0.25/0.24/0.35 N and 0.025/0.034/0.006 Nm in dynamic interactions. We also +incorporate viscoelasticity in proprioception during static adaptation by +adding a creep and relaxation modifier to refine the predicted results. The +proposed soft network combines simplicity in design, omni-adaptation, and +proprioceptive sensing with high accuracy, making it a versatile solution for +robotics at a low cost with more than 1 million use cycles for tasks such as +sensitive and competitive grasping, and touch-based geometry reconstruction. +This study offers new insights into vision-based proprioception for soft robots +in adaptive grasping, soft manipulation, and human-robot interaction. + +
+
+ comment: 20 pages, 10 figures, 2 tables, submitted to the International + Journal of Robotics Research for review +
+
+
+
+
+ + ☆ Can Transformers Learn Optimal Filtering for Unknown Systems? + + +
+ Transformers have demonstrated remarkable success in natural language
+processing; however, their potential remains mostly unexplored for problems
+arising in dynamical systems. In this work, we investigate the optimal output
+estimation problem using transformers, which generate output predictions using
+all the past ones. We train the transformer using various systems drawn from a
+prior distribution and then evaluate its performance on previously unseen
+systems from the same distribution. As a result, the obtained transformer acts
+like a prediction algorithm that learns in-context and quickly adapts to and
+predicts well for different systems - thus we call it meta-output-predictor
+(MOP). MOP matches the performance of the optimal output estimator, based on
+the Kalman filter, for most linear dynamical systems even though it does not
+have access to a model. We observe via extensive numerical experiments that MOP
+also performs well in challenging scenarios with non-i.i.d. noise, time-varying
+dynamics, and nonlinear dynamics like a quadrotor system with unknown
+parameters. To further support this observation, in the second part of the
+paper, we provide statistical guarantees on the performance of MOP and quantify
+the required amount of training to achieve a desired excess risk during
+test-time. Finally, we point out some limitations of MOP by identifying two
+classes of problems on which MOP fails to perform well, highlighting the need
+for caution when using transformers for control and estimation.
+
+
+
+
+
+
+ + ☆ Painter: Teaching Auto-regressive Language Models to Draw Sketches + + +
+ Large language models (LLMs) have made tremendous progress in natural
+language understanding and they have also been successfully adopted in other
+domains such as computer vision, robotics, reinforcement learning, etc. In this
+work, we apply LLMs to image generation tasks by directly generating the
+virtual brush strokes to paint an image. We present Painter, an LLM that can
+convert user prompts in text description format to sketches by generating the
+corresponding brush strokes in an auto-regressive way. We construct Painter
+based on an off-the-shelf LLM that is pre-trained on a large text corpus, by
+fine-tuning it on the new task while preserving language understanding
+capabilities. We create a dataset of diverse multi-object sketches paired with
+textual prompts that covers several object types and tasks. Painter can
+generate sketches from text descriptions, remove objects from the canvas, and
+detect and classify objects in sketches. Although this is pioneering,
+unprecedented work in using LLMs for auto-regressive image generation, the
+results are very encouraging.
+
+
+
+
+
+
+ + ☆ Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse + Problems + + +
+ Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial +technologies in the field of medical imaging. Score-based models have proven to +be effective in addressing different inverse problems encountered in CT and +MRI, such as sparse-view CT and fast MRI reconstruction. However, these models +face challenges in achieving accurate three dimensional (3D) volumetric +reconstruction. The existing score-based models primarily focus on +reconstructing two dimensional (2D) data distribution, leading to +inconsistencies between adjacent slices in the reconstructed 3D volumetric +images. To overcome this limitation, we propose a novel two-and-a-half order +score-based model (TOSM). During the training phase, our TOSM learns data +distributions in 2D space, which reduces the complexity of training compared to +directly working on 3D volumes. However, in the reconstruction phase, the TOSM +updates the data distribution in 3D space, utilizing complementary scores along +three directions (sagittal, coronal, and transaxial) to achieve a more precise +reconstruction. The development of TOSM is built on robust theoretical +principles, ensuring its reliability and efficacy. Through extensive +experimentation on large-scale sparse-view CT and fast MRI datasets, our method +demonstrates remarkable advancements and attains state-of-the-art results in +solving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively +addresses the inter-slice inconsistency issue, resulting in high-quality 3D +volumetric reconstruction. + +
+
+
+
+
+ + ☆ Autoencoding a Soft Touch to Learn Grasping from On-land to Underwater + + +
+ Robots play a critical role as the physical agent of human operators in
+exploring the ocean. However, it remains challenging to grasp objects reliably
+while fully submerged in a highly pressurized aquatic environment with little
+visible light, mainly due to the fluidic interference on the tactile mechanics
+between the finger and object surfaces. This study investigates the
+transferability of grasping knowledge from on-land to underwater via a
+vision-based soft robotic finger that learns 6D forces and torques (FT) using a
+Supervised Variational Autoencoder (SVAE). A high-framerate camera captures the
+whole-body deformations while a soft robotic finger interacts with physical
+objects on-land and underwater. Results show that the trained SVAE model
+learned a series of latent representations of the soft mechanics transferable
+from land to water, presenting superior adaptation to the changing environments
+compared with commercial FT sensors. Soft, delicate, and reactive grasping
+enabled by tactile intelligence enhances the gripper's underwater interaction
+with improved reliability and robustness at a much-reduced cost, paving the
+path for learning-based intelligent grasping to support fundamental scientific
+discoveries in environmental and ocean research.
+
+
+
+ comment: 17 pages, 5 figures, 1 table, submitted to Advanced Intelligent + Systems for review +
+
+
+
+
+ + ☆ ResBuilder: Automated Learning of Depth with Residual Structures + + +
+ In this work, we develop a neural architecture search algorithm, termed
+ResBuilder, that builds ResNet architectures from scratch which achieve high
+accuracy at moderate computational cost. It can also be used to modify existing
+architectures and has the capability to remove and insert ResNet blocks, in
+this way searching for suitable architectures in the space of ResNet
+architectures. In our experiments on different image classification datasets,
+ResBuilder achieves close to state-of-the-art performance while saving
+computational cost compared to off-the-shelf ResNets. Notably, we tune the
+parameters once on CIFAR10, which yields a suitable default choice for all
+other datasets. We demonstrate that this property generalizes even to
+industrial applications by applying our method with default parameters on a
+proprietary fraud detection dataset.
+
+
+
+
+
+
+ + ☆ Time Travel in LLMs: Tracing Data Contamination in Large Language Models + + +
+ Data contamination, i.e., the presence of test data from downstream tasks in
+the training data of large language models (LLMs), is a potential major issue
+in understanding LLMs' effectiveness on other tasks. We propose a
+straightforward yet effective method for identifying data contamination within
+LLMs. At its core, our approach starts by identifying potential contamination
+in individual instances that are drawn from a small random sample; using this
+information, our approach then assesses if an entire dataset partition is
+contaminated. To estimate contamination of individual instances, we employ
+"guided instruction": a prompt consisting of the dataset name, partition type,
+and the initial segment of a reference instance, asking the LLM to complete it.
+An instance is flagged as contaminated if the LLM's output either exactly or
+closely matches the latter segment of the reference. To understand if an entire
+partition is contaminated, we propose two ideas. The first idea marks a dataset
+partition as contaminated if the average overlap score with the reference
+instances (as measured by ROUGE or BLEURT) is statistically significantly
+better with the guided instruction vs. a general instruction that does not
+include the dataset and partition name. The second idea marks a dataset as
+contaminated if a classifier based on GPT-4 with in-context learning prompting
+marks multiple instances as contaminated. Our best method achieves an accuracy
+between 92% and 100% in detecting if an LLM is contaminated with seven
+datasets, containing train and test/validation partitions, when contrasted with
+manual evaluation by a human expert. Further, our findings indicate that GPT-4
+is contaminated with the AG News, WNLI, and XSum datasets.
+
+
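+ As an illustration of the "guided instruction" idea above, the sketch below
+builds a guided and a general prompt for one instance and measures how closely
+the model's completion matches the held-out tail of the reference. The
+`complete` callable is a hypothetical stand-in for any LLM completion API, and
+the prompt wording is illustrative rather than the paper's exact template.
+
+from rouge_score import rouge_scorer
+
+def guided_prompt(dataset, split, head):
+    return (f"You are given the {split} split of the {dataset} dataset. "
+            f"Complete the following instance exactly as it appears:\n{head}")
+
+def general_prompt(head):
+    return f"Complete the following text:\n{head}"
+
+def overlap(reference_tail, generated_tail):
+    # ROUGE-L F1 between the true continuation and the model's continuation.
+    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
+    return scorer.score(reference_tail, generated_tail)["rougeL"].fmeasure
+
+def instance_scores(complete, dataset, split, head, tail):
+    # A large guided-vs-general gap suggests the instance may have been seen
+    # during training.
+    guided = overlap(tail, complete(guided_prompt(dataset, split, head)))
+    general = overlap(tail, complete(general_prompt(head)))
+    return guided, general
+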
+
+ comment: v1 preprint +
+
+
+
+
+ + ☆ Label Propagation Techniques for Artifact Detection in Imbalanced + Classes using Photoplethysmogram Signals + + +
+ Photoplethysmogram (PPG) signals are widely used in healthcare for monitoring
+vital signs, but they are susceptible to motion artifacts that can lead to
+inaccurate interpretations. In this study, the use of label propagation
+techniques to propagate labels among PPG samples is explored, particularly in
+imbalanced class scenarios where clean PPG samples are significantly
+outnumbered by artifact-contaminated samples. With a precision of 91%, a recall
+of 90% and an F1 score of 90% for the class without artifacts, the results
+demonstrate its effectiveness in labeling a medical dataset, even when clean
+samples are rare. For the classification of artifacts, our study compares
+supervised classifiers such as conventional classifiers and neural networks
+(MLP, Transformers, FCN) with the semi-supervised label propagation algorithm.
+With a precision of 89%, a recall of 95% and an F1 score of 92%, the KNN
+supervised model gives good results, but the semi-supervised algorithm performs
+better in detecting artifacts. The findings suggest that the semi-supervised
+label propagation algorithm holds promise for artifact detection in PPG
+signals, which can enhance the reliability of PPG-based health monitoring
+systems in real-world applications.
+
+
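+ A minimal sketch of semi-supervised label propagation for this kind of
+artifact-vs-clean problem, using scikit-learn. The random features, the number
+of labeled samples, and the -1 "unlabeled" convention are illustrative; the
+paper's PPG feature extraction is not reproduced here.
+
+import numpy as np
+from sklearn.semi_supervised import LabelSpreading
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(1000, 16))   # stand-in features for PPG segments
+y = np.full(1000, -1)             # -1 marks unlabeled samples
+y[:30] = 0                        # a few labeled clean segments
+y[30:60] = 1                      # a few labeled artifact segments
+
+model = LabelSpreading(kernel="knn", n_neighbors=7)
+model.fit(X, y)
+propagated = model.transduction_  # labels propagated to every sample
+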
+
+ comment: Under preparation for submission to IEEE for possible publication
+
+
+
+
+
+ + ☆ LLM4TS: Two-Stage Fine-Tuning for Time-Series Forecasting with + Pre-Trained LLMs + + +
+ In this work, we leverage pre-trained Large Language Models (LLMs) to enhance +time-series forecasting. Mirroring the growing interest in unifying models for +Natural Language Processing and Computer Vision, we envision creating an +analogous model for long-term time-series forecasting. Due to limited +large-scale time-series data for building robust foundation models, our +approach LLM4TS focuses on leveraging the strengths of pre-trained LLMs. By +combining time-series patching with temporal encoding, we have enhanced the +capability of LLMs to handle time-series data effectively. Inspired by the +supervised fine-tuning in chatbot domains, we prioritize a two-stage +fine-tuning process: first conducting supervised fine-tuning to orient the LLM +towards time-series data, followed by task-specific downstream fine-tuning. +Furthermore, to unlock the flexibility of pre-trained LLMs without extensive +parameter adjustments, we adopt several Parameter-Efficient Fine-Tuning (PEFT) +techniques. Drawing on these innovations, LLM4TS has yielded state-of-the-art +results in long-term forecasting. Our model has also shown exceptional +capabilities as both a robust representation learner and an effective few-shot +learner, thanks to the knowledge transferred from the pre-trained LLM. + +
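+ The sketch below illustrates the patching-plus-frozen-backbone idea described
+above: a time series is split into patches, a small trainable adapter embeds
+them, a frozen Transformer stands in for the pre-trained LLM body, and only a
+lightweight head is trained for forecasting. The two-stage fine-tuning and the
+specific PEFT techniques used by LLM4TS are not reproduced; all shapes are
+illustrative.
+
+import torch
+import torch.nn as nn
+
+class PatchForecaster(nn.Module):
+    def __init__(self, patch_len=16, n_patches=32, d_model=128, horizon=96):
+        super().__init__()
+        self.patch_len = patch_len
+        self.embed = nn.Linear(patch_len, d_model)        # trainable adapter
+        layer = nn.TransformerEncoderLayer(d_model, nhead=8, batch_first=True)
+        self.backbone = nn.TransformerEncoder(layer, num_layers=4)
+        for p in self.backbone.parameters():              # freeze the "LLM" body
+            p.requires_grad = False
+        self.head = nn.Linear(d_model * n_patches, horizon)  # trainable head
+
+    def forward(self, x):                   # x: (batch, n_patches * patch_len)
+        b = x.shape[0]
+        patches = x.view(b, -1, self.patch_len)
+        h = self.backbone(self.embed(patches))
+        return self.head(h.reshape(b, -1))
+
+model = PatchForecaster()
+forecast = model(torch.randn(4, 32 * 16))   # -> (4, 96)
+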
+
+
+
+
+ + ☆ An Expert's Guide to Training Physics-informed Neural Networks + + +
+ Physics-informed neural networks (PINNs) have been popularized as a deep +learning framework that can seamlessly synthesize observational data and +partial differential equation (PDE) constraints. Their practical effectiveness +however can be hampered by training pathologies, but also oftentimes by poor +choices made by users who lack deep learning expertise. In this paper we +present a series of best practices that can significantly improve the training +efficiency and overall accuracy of PINNs. We also put forth a series of +challenging benchmark problems that highlight some of the most prominent +difficulties in training PINNs, and present comprehensive and fully +reproducible ablation studies that demonstrate how different architecture +choices and training strategies affect the test accuracy of the resulting +models. We show that the methods and guiding principles put forth in this study +lead to state-of-the-art results and provide strong baselines that future +studies should use for comparison purposes. To this end, we also release a +highly optimized library in JAX that can be used to reproduce all results +reported in this paper, enable future research studies, as well as facilitate +easy adaptation to new use-case scenarios. + +
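+ To make the training setup concrete, here is a toy PINN in PyTorch for the 1D
+Poisson problem u''(x) = -pi^2 sin(pi x) with u(0) = u(1) = 0, combining a
+PDE-residual loss on random collocation points with a boundary loss. This is an
+editor's illustration of the basic recipe, not the optimized JAX library
+released with the paper.
+
+import math
+import torch
+
+net = torch.nn.Sequential(
+    torch.nn.Linear(1, 64), torch.nn.Tanh(),
+    torch.nn.Linear(64, 64), torch.nn.Tanh(),
+    torch.nn.Linear(64, 1),
+)
+opt = torch.optim.Adam(net.parameters(), lr=1e-3)
+
+for step in range(2000):
+    x = torch.rand(256, 1, requires_grad=True)            # collocation points
+    u = net(x)
+    du = torch.autograd.grad(u.sum(), x, create_graph=True)[0]
+    d2u = torch.autograd.grad(du.sum(), x, create_graph=True)[0]
+    residual = d2u + math.pi ** 2 * torch.sin(math.pi * x)
+    xb = torch.tensor([[0.0], [1.0]])                     # boundary points
+    loss = residual.pow(2).mean() + net(xb).pow(2).mean()
+    opt.zero_grad(); loss.backward(); opt.step()
+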
+
+ comment: 36 pages, 25 figures, 13 tables +
+
+
+
+
+ + ☆ On Neural Quantum Support Vector Machines + + +
+ In \cite{simon2023algorithms} we introduced four algorithms for the training +of neural support vector machines (NSVMs) and demonstrated their feasibility. +In this note we introduce neural quantum support vector machines, that is, +NSVMs with a quantum kernel, and extend our results to this setting. + +
+
+ comment: 13 pages, 0 figures. arXiv admin note: substantial text overlap with + arXiv:2308.07204 +
+
+
+
+
+ + ☆ Hierarchical Uncertainty Estimation for Medical Image Segmentation + Networks + + +
+ Learning a medical image segmentation model is an inherently ambiguous task, +as uncertainties exist in both images (noise) and manual annotations (human +errors and bias) used for model training. To build a trustworthy image +segmentation model, it is important to not just evaluate its performance but +also estimate the uncertainty of the model prediction. Most state-of-the-art +image segmentation networks adopt a hierarchical encoder architecture, +extracting image features at multiple resolution levels from fine to coarse. In +this work, we leverage this hierarchical image representation and propose a +simple yet effective method for estimating uncertainties at multiple levels. +The multi-level uncertainties are modelled via the skip-connection module and +then sampled to generate an uncertainty map for the predicted image +segmentation. We demonstrate that a deep learning segmentation network such as +U-net, when implemented with such hierarchical uncertainty estimation module, +can achieve a high segmentation performance, while at the same time provide +meaningful uncertainty maps that can be used for out-of-distribution detection. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Accurate synthesis of Dysarthric Speech for ASR data augmentation + + +
+ Dysarthria is a motor speech disorder often characterized by reduced speech
+intelligibility through slow, uncoordinated control of speech production
+muscles. Automatic Speech Recognition (ASR) systems can help dysarthric talkers
+communicate more effectively. However, robust dysarthria-specific ASR requires
+a significant amount of training speech, which is not readily available for
+dysarthric talkers. This paper presents a new dysarthric speech synthesis
+method for the purpose of ASR training data augmentation. Differences in
+prosodic and acoustic characteristics of dysarthric spontaneous speech at
+varying severity levels are important components for dysarthric speech
+modeling, synthesis, and augmentation. For dysarthric speech synthesis, a
+modified neural multi-talker TTS is implemented by adding a dysarthria severity
+level coefficient and a pause insertion model to synthesize dysarthric speech
+for varying severity levels. To evaluate the effectiveness of the synthesized
+training data for ASR, dysarthria-specific speech recognition was used. Results
+show that a DNN-HMM model trained on additional synthetic dysarthric speech
+achieves a WER improvement of 12.2% compared to the baseline, and that the
+addition of the severity level and pause insertion controls decreases WER by
+6.5%, showing the effectiveness of adding these parameters. Overall results on
+the TORGO database demonstrate that using dysarthric synthetic speech to
+increase the amount of dysarthric-patterned speech for training has a
+significant impact on dysarthric ASR systems. In addition, we have conducted a
+subjective evaluation to assess the dysarthric-ness and similarity of
+synthesized speech. Our subjective evaluation shows that the perceived
+dysarthric-ness of synthesized speech is similar to that of true dysarthric
+speech, especially for higher levels of dysarthria.
+
+
+
+ comment: arXiv admin note: text overlap with arXiv:2201.11571 +
+
+
+
+
+ + ☆ Eliciting Risk Aversion with Inverse Reinforcement Learning via + Interactive Questioning + + +
+ This paper proposes a novel framework for identifying an agent's risk
+aversion using interactive questioning. Our study is conducted in two
+scenarios: a one-period case and an infinite horizon case. In the one-period
+case, we assume that the agent's risk aversion is characterized by a cost
+function of the state and a distortion risk measure. In the infinite horizon
+case, we model risk aversion with an additional component, a discount factor.
+Assuming access to a finite set of candidates containing the agent's true risk
+aversion, we show that asking the agent to demonstrate her optimal policies in
+various environments, which may depend on her previous answers, is an effective
+means of identifying the agent's risk aversion. Specifically, we prove that the
+agent's risk aversion can be identified as the number of questions tends to
+infinity, provided the questions are randomly designed. We also develop an
+algorithm for designing optimal questions and provide empirical evidence that
+our method learns risk aversion significantly faster than randomly designed
+questions in simulations. Our framework has important applications in
+robo-advising and provides a new approach for identifying an agent's risk
+preferences.
+
+
+
+
+
+
+ + ☆ Digital twinning of cardiac electrophysiology models from the surface + ECG: a geodesic backpropagation approach + + +
+ The eikonal equation has become an indispensable tool for modeling cardiac +electrical activation accurately and efficiently. In principle, by matching +clinically recorded and eikonal-based electrocardiograms (ECGs), it is possible +to build patient-specific models of cardiac electrophysiology in a purely +non-invasive manner. Nonetheless, the fitting procedure remains a challenging +task. The present study introduces a novel method, Geodesic-BP, to solve the +inverse eikonal problem. Geodesic-BP is well-suited for GPU-accelerated machine +learning frameworks, allowing us to optimize the parameters of the eikonal +equation to reproduce a given ECG. We show that Geodesic-BP can reconstruct a +simulated cardiac activation with high accuracy in a synthetic test case, even +in the presence of modeling inaccuracies. Furthermore, we apply our algorithm +to a publicly available dataset of a rabbit model, with very positive results. +Given the future shift towards personalized medicine, Geodesic-BP has the +potential to help in future functionalizations of cardiac models meeting +clinical time constraints while maintaining the physiological accuracy of +state-of-the-art cardiac models. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ Explainable AI for clinical risk prediction: a survey of concepts, + methods, and modalities + + +
+ Recent advancements in AI applications to healthcare have shown incredible
+promise in surpassing human performance in diagnosis and disease prognosis.
+With the increasing complexity of AI models, however, concerns have grown
+regarding their opacity, potential biases, and the need for interpretability.
+To ensure trust and reliability in AI systems, especially in clinical risk
+prediction models, explainability becomes crucial. Explainability is usually
+referred to as an AI system's ability to provide a robust interpretation of its
+decision-making logic or the decisions themselves to human stakeholders. In
+clinical risk prediction, other aspects of explainability like fairness, bias,
+trust, and transparency also represent important concepts beyond just
+interpretability. In this review, we address the relationship between these
+concepts as they are often used together or interchangeably. This review also
+discusses recent progress in developing explainable models for clinical risk
+prediction, highlighting the importance of quantitative and clinical evaluation
+and validation across multiple common modalities in clinical practice. It
+emphasizes the need for external validation and the combination of diverse
+interpretability methods to enhance trust and fairness. Adopting rigorous
+testing, such as using synthetic datasets with known generative factors, can
+further improve the reliability of explainability methods. Open access and
+code-sharing resources are essential for transparency and reproducibility,
+enabling the growth and trustworthiness of explainability research. While
+challenges exist, an end-to-end approach to explainability in clinical risk
+prediction, incorporating stakeholders from clinicians to developers, is
+essential for success.
+
+
+
+
+
+
+ + ☆ Content-based Recommendation Engine for Video Streaming Platform + + +
+ Recommendation engines suggest content, products, or services to users using
+machine learning algorithms. This paper proposes a content-based recommendation
+engine that provides video suggestions to users based on their previous
+interests and choices. We use the TF-IDF text vectorization method to determine
+the relevance of words in a document. We then compute the similarity between
+items of content by calculating the cosine similarity between them. Finally,
+the engine recommends videos to users based on the obtained similarity scores.
+In addition, we measure the engine's performance by computing the precision,
+recall, and F1 score of the proposed system.
+
+
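+ A minimal sketch of the TF-IDF plus cosine-similarity pipeline described
+above, using scikit-learn; the video descriptions and the ranking helper are
+illustrative, not the paper's implementation.
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+videos = [
+    "space documentary about black holes and gravity",
+    "cooking show pasta recipes italian cuisine",
+    "astronomy lecture on galaxies and dark matter",
+]
+
+tfidf = TfidfVectorizer(stop_words="english")
+matrix = tfidf.fit_transform(videos)        # (n_videos, n_terms) TF-IDF matrix
+sim = cosine_similarity(matrix)             # pairwise similarity scores
+
+def recommend(video_idx, k=2):
+    # Rank the remaining videos by similarity to the one the user watched.
+    ranked = sim[video_idx].argsort()[::-1]
+    return [i for i in ranked if i != video_idx][:k]
+
+print(recommend(0))   # the astronomy lecture should rank above the cooking show
+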
+
+
+
+
+ + ☆ Fast Uncertainty Quantification of Spent Nuclear Fuel with Neural + Networks + + +
+ The accurate calculation and uncertainty quantification of the
+characteristics of spent nuclear fuel (SNF) play a crucial role in ensuring the
+safety, efficiency, and sustainability of nuclear energy production, waste
+management, and nuclear safeguards. State-of-the-art physics-based models,
+while reliable, are computationally intensive and time-consuming. This paper
+presents a surrogate modeling approach using neural networks (NN) to predict a
+number of SNF characteristics with reduced computational costs compared to
+physics-based models. An NN is trained using data generated from CASMO5 lattice
+calculations. The trained NN accurately predicts decay heat and nuclide
+concentrations of SNF, as a function of key input parameters, such as
+enrichment, burnup, cooling time between cycles, mean boron concentration and
+fuel temperature. The model is validated against physics-based decay heat
+simulations and measurements of different uranium oxide fuel assemblies from
+two different pressurized water reactors. In addition, the NN is used to
+perform sensitivity analysis and uncertainty quantification. The results are in
+very good alignment with CASMO5, while the computational costs (taking into
+account the costs of generating training samples) are reduced by a factor of 10
+or more. Our findings demonstrate the feasibility of using NNs as surrogate
+models for fast characterization of SNF, providing a promising avenue for
+improving computational efficiency in assessing nuclear fuel behavior and
+associated risks.
+
+
+
+
+
+
+ + ☆ Continuous Sweep: an improved, binary quantifier + + +
+ Quantification is a supervised machine learning task, focused on estimating +the class prevalence of a dataset rather than labeling its individual +observations. We introduce Continuous Sweep, a new parametric binary quantifier +inspired by the well-performing Median Sweep. Median Sweep is currently one of +the best binary quantifiers, but we have changed this quantifier on three +points, namely 1) using parametric class distributions instead of empirical +distributions, 2) optimizing decision boundaries instead of applying discrete +decision rules, and 3) calculating the mean instead of the median. We derive +analytic expressions for the bias and variance of Continuous Sweep under +general model assumptions. This is one of the first theoretical contributions +in the field of quantification learning. Moreover, these derivations enable us +to find the optimal decision boundaries. Finally, our simulation study shows +that Continuous Sweep outperforms Median Sweep in a wide range of situations. + +
+
+
+
+
+ + ☆ Precision and Recall Reject Curves for Classification + + +
+ For some classification scenarios, it is desirable to use only those
+classification instances that a trained model associates with a high certainty.
+To obtain such high-certainty instances, previous work has proposed
+accuracy-reject curves. Reject curves allow one to evaluate and compare the
+performance of different certainty measures over a range of thresholds for
+accepting or rejecting classifications. However, the accuracy may not be the
+most suitable evaluation metric for all applications, and instead precision or
+recall may be preferable. This is the case, for example, for data with
+imbalanced class distributions. We therefore propose reject curves that
+evaluate precision and recall, the recall-reject curve and the precision-reject
+curve. Using prototype-based classifiers from learning vector quantization, we
+first validate the proposed curves on artificial benchmark data against the
+accuracy-reject curve as a baseline. We then show on imbalanced benchmarks and
+medical, real-world data that for these scenarios, the proposed precision- and
+recall-reject curves yield more accurate insights into classifier performance
+than accuracy-reject curves.
+
+
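+ The idea of a reject curve can be sketched in a few lines: sweep a certainty
+threshold, keep only the predictions above it, and record precision and recall
+on the accepted subset. Using the maximum predicted probability as the
+certainty measure is an illustrative choice for a binary problem; the paper
+studies prototype-based certainty measures instead.
+
+import numpy as np
+from sklearn.metrics import precision_score, recall_score
+
+def reject_curves(y_true, proba, positive=1, n_thresholds=20):
+    certainty = proba.max(axis=1)
+    y_pred = proba.argmax(axis=1)
+    curve = []
+    for t in np.linspace(0.0, 1.0, n_thresholds, endpoint=False):
+        keep = certainty >= t
+        if keep.sum() == 0:
+            break
+        p = precision_score(y_true[keep], y_pred[keep],
+                            pos_label=positive, zero_division=0)
+        r = recall_score(y_true[keep], y_pred[keep],
+                         pos_label=positive, zero_division=0)
+        # threshold, acceptance rate, precision, recall on accepted samples
+        curve.append((t, keep.mean(), p, r))
+    return curve
+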
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ A distributed neural network architecture for dynamic sensor selection + with application to bandwidth-constrained body-sensor networks + + +
+ We propose a dynamic sensor selection approach for deep neural networks +(DNNs), which is able to derive an optimal sensor subset selection for each +specific input sample instead of a fixed selection for the entire dataset. This +dynamic selection is jointly learned with the task model in an end-to-end way, +using the Gumbel-Softmax trick to allow the discrete decisions to be learned +through standard backpropagation. We then show how we can use this dynamic +selection to increase the lifetime of a wireless sensor network (WSN) by +imposing constraints on how often each node is allowed to transmit. We further +improve performance by including a dynamic spatial filter that makes the +task-DNN more robust against the fact that it now needs to be able to handle a +multitude of possible node subsets. Finally, we explain how the selection of +the optimal channels can be distributed across the different nodes in a WSN. We +validate this method on a use case in the context of body-sensor networks, +where we use real electroencephalography (EEG) sensor data to emulate an EEG +sensor network. We analyze the resulting trade-offs between transmission load +and task accuracy. + +
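+ A minimal sketch of the Gumbel-Softmax selection idea described above: each of
+K "selection slots" learns logits over the N channels and draws a hard,
+straight-through one-hot sample, so the discrete channel choice stays trainable
+with backpropagation. Shapes, the toy task head, and the absence of the
+transmission-budget constraint and spatial filter are all simplifications.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SensorSelector(nn.Module):
+    def __init__(self, n_channels=64, n_selected=8, n_features=128):
+        super().__init__()
+        self.logits = nn.Parameter(torch.zeros(n_selected, n_channels))
+        self.head = nn.Linear(n_selected * n_features, 2)
+
+    def forward(self, x, tau=1.0):         # x: (batch, n_channels, n_features)
+        # Hard one-hot selection per slot; gradients flow via the soft sample.
+        onehot = F.gumbel_softmax(self.logits, tau=tau, hard=True)   # (K, N)
+        selected = torch.einsum("kn,bnf->bkf", onehot, x)            # (B, K, F)
+        return self.head(selected.flatten(1))
+
+model = SensorSelector()
+out = model(torch.randn(4, 64, 128))        # -> (4, 2)
+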
+
+
+
+
+ + ☆ PDPK: A Framework to Synthesise Process Data and Corresponding + Procedural Knowledge for Manufacturing + + +
+ Procedural knowledge describes how to accomplish tasks and mitigate problems. +Such knowledge is commonly held by domain experts, e.g. operators in +manufacturing who adjust parameters to achieve quality targets. To the best of +our knowledge, no real-world datasets containing process data and corresponding +procedural knowledge are publicly available, possibly due to corporate +apprehensions regarding the loss of knowledge advances. Therefore, we provide a +framework to generate synthetic datasets that can be adapted to different +domains. The design choices are inspired by two real-world datasets of +procedural knowledge we have access to. Apart from containing representations +of procedural knowledge in Resource Description Framework (RDF)-compliant +knowledge graphs, the framework simulates parametrisation processes and +provides consistent process data. We compare established embedding methods on +the resulting knowledge graphs, detailing which out-of-the-box methods have the +potential to represent procedural knowledge. This provides a baseline which can +be used to increase the comparability of future work. Furthermore, we validate +the overall characteristics of a synthesised dataset by comparing the results +to those achievable on a real-world dataset. The framework and evaluation code, +as well as the dataset used in the evaluation, are available open source. + +
+
+
+
+
+ + ☆ Dual-Branch Temperature Scaling Calibration for Long-Tailed Recognition + + +
+ The calibration of deep neural networks is currently receiving widespread
+attention and research. Miscalibration usually leads to overconfidence of the
+model. Under a long-tailed distribution of data, however, the problem of
+miscalibration is more prominent due to the different confidence levels of
+samples in minority and majority categories, and it will result in more serious
+overconfidence. To address this problem, some current research has designed
+diverse temperature coefficients for different categories based on the
+temperature scaling (TS) method. However, in the case of rare samples in
+minority classes, the temperature coefficient is not generalizable, and there
+is a large difference between the temperature coefficients of the training set
+and the validation set. To solve this challenge, this paper proposes a
+dual-branch temperature scaling calibration model (Dual-TS), which considers
+the diversities in temperature parameters of different categories and the
+non-generalizability of temperature parameters for rare samples in minority
+classes simultaneously. Moreover, we noticed that the traditional calibration
+evaluation metric, Expected Calibration Error (ECE), gives a higher weight to
+low-confidence samples in the minority classes, which leads to inaccurate
+evaluation of model calibration. Therefore, we also propose Equal Sample Bin
+Expected Calibration Error (Esbin-ECE) as a new calibration evaluation metric.
+Through experiments, we demonstrate that our model yields state-of-the-art
+results in both traditional ECE and Esbin-ECE metrics.
+
+
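+ For reference, the vanilla temperature scaling that Dual-TS builds on fits a
+single temperature T on held-out validation logits by minimizing the negative
+log-likelihood, as sketched below. Dual-TS's per-class temperatures and dual
+branches are not reproduced here; the optimizer settings are illustrative.
+
+import torch
+
+def fit_temperature(val_logits, val_labels, iters=200, lr=0.01):
+    log_t = torch.zeros(1, requires_grad=True)   # optimize log T so T stays > 0
+    opt = torch.optim.Adam([log_t], lr=lr)
+    for _ in range(iters):
+        loss = torch.nn.functional.cross_entropy(val_logits / log_t.exp(),
+                                                 val_labels)
+        opt.zero_grad(); loss.backward(); opt.step()
+    return log_t.exp().item()
+
+def calibrate(logits, temperature):
+    # Calibrated probabilities: soften (or sharpen) logits by the fitted T.
+    return torch.softmax(logits / temperature, dim=-1)
+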
+
+
+
+
+ + ☆ KernelWarehouse: Towards Parameter-Efficient Dynamic Convolution + + +
+ Dynamic convolution learns a linear mixture of $n$ static kernels weighted
+with their sample-dependent attentions, demonstrating superior performance
+compared to normal convolution. However, existing designs are
+parameter-inefficient: they increase the number of convolutional parameters by
+$n$ times. This and the optimization difficulty lead to no research progress in
+dynamic convolution that can allow us to use a significantly large value of $n$
+(e.g., $n>100$ instead of the typical setting $n<10$) to push forward the
+performance boundary. In this paper, we propose $KernelWarehouse$, a more
+general form of dynamic convolution, which can strike a favorable trade-off
+between parameter efficiency and representation power. Its key idea is to
+redefine the basic concepts of "$kernels$" and "$assembling$ $kernels$" in
+dynamic convolution from the perspective of reducing kernel dimension and
+increasing kernel number significantly. In principle, KernelWarehouse enhances
+convolutional parameter dependencies within the same layer and across
+successive layers via tactful kernel partition and warehouse sharing, yielding
+a high degree of freedom to fit a desired parameter budget. We validate our
+method on ImageNet and MS-COCO datasets with different ConvNet architectures,
+and show that it attains state-of-the-art results. For instance, the
+ResNet18|ResNet50|MobileNetV2|ConvNeXt-Tiny model trained with KernelWarehouse
+on ImageNet reaches 76.05%|81.05%|75.52%|82.51% top-1 accuracy. Thanks to its
+flexible design, KernelWarehouse can even reduce the model size of a ConvNet
+while improving the accuracy, e.g., our ResNet18 model with 36.45%|65.10%
+parameter reduction relative to the baseline shows 2.89%|2.29% absolute
+improvement in top-1 accuracy.
+
+
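+ For context, the standard dynamic-convolution formulation that KernelWarehouse
+generalizes can be sketched as below: n static kernels are mixed with
+sample-dependent attention weights before a single convolution. The kernel
+partitioning and warehouse sharing of KernelWarehouse are not shown, and the
+module sizes are illustrative.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class DynamicConv2d(nn.Module):
+    def __init__(self, in_ch, out_ch, k=3, n_kernels=4):
+        super().__init__()
+        self.kernels = nn.Parameter(torch.randn(n_kernels, out_ch, in_ch, k, k) * 0.02)
+        self.attn = nn.Linear(in_ch, n_kernels)   # sample-dependent attention
+        self.pad = k // 2
+
+    def forward(self, x):                          # x: (B, C_in, H, W)
+        a = F.softmax(self.attn(x.mean(dim=(2, 3))), dim=-1)        # (B, n)
+        mixed = torch.einsum("bn,noihw->boihw", a, self.kernels)    # per-sample kernels
+        b, _, h, w = x.shape
+        out = F.conv2d(x.reshape(1, -1, h, w),      # grouped conv applies each
+                       mixed.flatten(0, 1),         # sample's own mixed kernel
+                       padding=self.pad, groups=b)
+        return out.reshape(b, -1, h, w)
+
+y = DynamicConv2d(16, 32)(torch.randn(2, 16, 8, 8))   # -> (2, 32, 8, 8)
+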
+
+ comment: This research work was completed and submitted in early May 2023. + Code and pre-trained models are available at + https://github.com/OSVAI/KernelWarehouse +
+
+
+
+
+ + ☆ Independent Distribution Regularization for Private Graph Embedding CIKM 2023 + + +
+ Learning graph embeddings is a crucial task in graph mining. An effective
+graph embedding model can learn low-dimensional representations from
+graph-structured data for data publishing, benefiting various downstream
+applications such as node classification, link prediction, etc. However, recent
+studies have revealed that graph embeddings are susceptible to attribute
+inference attacks, which allow attackers to infer private node attributes from
+the learned graph embeddings. To address these concerns, privacy-preserving
+graph embedding methods have emerged, aiming to simultaneously consider primary
+learning and privacy protection through adversarial learning. However, most
+existing methods assume that representation models have access to all sensitive
+attributes in advance during the training stage, which is not always the case
+due to diverse privacy preferences. Furthermore, the commonly used adversarial
+learning technique in privacy-preserving representation learning suffers from
+unstable training issues. In this paper, we propose a novel approach called
+Private Variational Graph AutoEncoders (PVGAE) with the aid of an independent
+distribution penalty as a regularization term. Specifically, we split the
+original variational graph autoencoder (VGAE) to learn sensitive and
+non-sensitive latent representations using two sets of encoders. Additionally,
+we introduce a novel regularization to enforce the independence of the
+encoders. We prove the theoretical effectiveness of the regularization from the
+perspective of mutual information. Experimental results on three real-world
+datasets demonstrate that PVGAE outperforms other baselines in private
+embedding learning regarding utility performance and privacy protection.
+
+
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ☆ Convergence of Two-Layer Regression with Nonlinear Units + + +
+ Large language models (LLMs), such as ChatGPT and GPT4, have shown
+outstanding performance in many real-world tasks. Attention computation plays
+an important role in training LLMs. The softmax unit and the ReLU unit are the
+key structures in attention computation. Inspired by them, we put forward a
+softmax ReLU regression problem. Generally speaking, our goal is to find an
+optimal solution to the regression problem involving the ReLU unit. In this
+work, we calculate a closed-form representation for the Hessian of the loss
+function. Under certain assumptions, we prove the Lipschitz continuity and the
+positive semidefiniteness (PSD) of the Hessian. Then, we introduce a greedy
+algorithm based on an approximate Newton method, which converges in the sense
+of the distance to the optimal solution. Finally, we relax the Lipschitz
+condition and prove the convergence in the sense of loss value.
+
+
+
+
+
+
+ + ☆ Is Meta-Learning the Right Approach for the Cold-Start Problem in + Recommender Systems? + + +
+ Recommender systems have become fundamental building blocks of modern online
+products and services, and have a substantial impact on user experience. In the
+past few years, deep learning methods have attracted a lot of research, and are
+now heavily used in modern real-world recommender systems. Nevertheless,
+dealing with recommendations in the cold-start setting, e.g., when a user has
+had limited interactions with the system, is a problem that remains far from
+solved. Meta-learning techniques, and in particular optimization-based
+meta-learning, have recently become the most popular approaches in the academic
+research literature for tackling the cold-start problem in deep learning models
+for recommender systems. However, current meta-learning approaches are not
+practical for real-world recommender systems, which have billions of users and
+items, and strict latency requirements. In this paper we show that it is
+possible to obtain similar, or higher, performance on commonly used benchmarks
+for the cold-start problem without using meta-learning techniques. In more
+detail, we show that, when tuned correctly, standard and widely adopted deep
+learning models perform just as well as newer meta-learning models. We further
+show that an extremely simple modular approach using common representation
+learning techniques can perform comparably to meta-learning techniques
+specifically designed for the cold-start setting while being much more easily
+deployable in real-world applications.
+
+
+
+
+
+
+ + ☆ Graph Out-of-Distribution Generalization with Controllable Data + Augmentation + + +
+ Graph Neural Network (GNN) has demonstrated extraordinary performance in
+classifying graph properties. However, due to the selection bias of training
+and testing data (e.g., training on small graphs and testing on large graphs,
+or training on dense graphs and testing on sparse graphs), distribution
+deviation is widespread. More importantly, we often observe \emph{hybrid
+structure distribution shift} of both scale and density, despite one-sided
+biased data partitions. The spurious correlations over hybrid distribution
+deviation degrade the performance of previous GNN methods and show large
+instability among different datasets. To alleviate this problem, we propose
+\texttt{OOD-GMixup} to jointly manipulate the training distribution with
+\emph{controllable data augmentation} in metric space. Specifically, we first
+extract the graph rationales to eliminate the spurious correlations due to
+irrelevant information. Secondly, we generate virtual samples with perturbation
+on the graph rationale representation domain to obtain potential OOD training
+samples. Finally, we propose OOD calibration to measure the distribution
+deviation of virtual samples by leveraging Extreme Value Theory, and further
+actively control the training distribution by emphasizing the impact of virtual
+OOD samples. Extensive studies on several real-world datasets on graph
+classification demonstrate the superiority of our proposed method over
+state-of-the-art baselines.
+
+
+
+ comment: Under review +
+
+
+
+
+ + ☆ Learning Logic Programs by Discovering Higher-Order Abstractions + + +
+ Discovering novel abstractions is important for human-level AI. We introduce
+an approach to discover higher-order abstractions, such as map, filter, and
+fold. We focus on inductive logic programming, which induces logic programs
+from examples and background knowledge. We introduce the higher-order
+refactoring problem, where the goal is to compress a logic program by
+introducing higher-order abstractions. We implement our approach in STEVIE,
+which formulates the higher-order refactoring problem as a constraint
+optimisation problem. Our experimental results on multiple domains, including
+program synthesis and visual reasoning, show that, compared to no refactoring,
+STEVIE can improve predictive accuracies by 27% and reduce learning times by
+47%. We also show that STEVIE can discover abstractions that transfer to
+different domains.
+
+
+
+
+
+
+ + ☆ Warped geometric information on the optimisation of Euclidean functions + + +
+ We consider the fundamental task of optimizing a real-valued function defined
+in a potentially high-dimensional Euclidean space, such as the loss function in
+many machine-learning tasks or the logarithm of the probability distribution in
+statistical inference. We use warped Riemannian geometry notions to redefine
+the optimisation problem of a function on Euclidean space as one on a
+Riemannian manifold with a warped metric, and then find the function's optimum
+along this manifold. The warped metric chosen for the search domain induces a
+computationally friendly metric tensor for which optimal search directions
+associated with geodesic curves on the manifold become easier to compute.
+Performing optimization along geodesics is known to be generally infeasible,
+yet we show that in this specific manifold we can analytically derive Taylor
+approximations up to third order. In general these approximations to the
+geodesic curve will not lie on the manifold, however we construct suitable
+retraction maps to pull them back onto the manifold. Therefore, we can
+efficiently optimize along the approximate geodesic curves. We cover the
+related theory, describe a practical optimization algorithm and empirically
+evaluate it on a collection of challenging optimisation benchmarks. Our
+proposed algorithm, using the third-order approximation of geodesics,
+outperforms standard Euclidean gradient-based counterparts, as well as an
+alternative Hessian-based optimisation routine, in terms of the number of
+iterations until convergence.
+
+
+
+
+
+
+ + ☆ Robust Bayesian Satisficing + + +
+ Distributional shifts pose a significant challenge to achieving robustness in +contemporary machine learning. To overcome this challenge, robust satisficing +(RS) seeks a robust solution to an unspecified distributional shift while +achieving a utility above a desired threshold. This paper focuses on the +problem of RS in contextual Bayesian optimization when there is a discrepancy +between the true and reference distributions of the context. We propose a novel +robust Bayesian satisficing algorithm called RoBOS for noisy black-box +optimization. Our algorithm guarantees sublinear lenient regret under certain +assumptions on the amount of distribution shift. In addition, we define a +weaker notion of regret called robust satisficing regret, in which our +algorithm achieves a sublinear upper bound independent of the amount of +distribution shift. To demonstrate the effectiveness of our method, we apply it +to various learning problems and compare it to other approaches, such as +distributionally robust optimization. + +
+
+
+
+
+ + ☆ DFedADMM: Dual Constraints Controlled Model Inconsistency for + Decentralized Federated Learning + + +
+ To address the communication burden issues associated with federated learning
+(FL), decentralized federated learning (DFL) discards the central server and
+establishes a decentralized communication network, where each client
+communicates only with neighboring clients. However, existing DFL methods still
+suffer from two major challenges: local inconsistency and local heterogeneous
+overfitting, which have not been fundamentally addressed. To tackle these
+issues, we propose novel DFL algorithms, DFedADMM and its enhanced version
+DFedADMM-SAM, to enhance the performance of DFL. The DFedADMM algorithm employs
+primal-dual optimization (ADMM) by utilizing dual variables to control the
+model inconsistency raised from the decentralized heterogeneous data
+distributions. The DFedADMM-SAM algorithm further improves on DFedADMM by
+employing a Sharpness-Aware Minimization (SAM) optimizer, which uses gradient
+perturbations to generate locally flat models and searches for models with
+uniformly low loss values to mitigate local heterogeneous overfitting.
+Theoretically, we derive convergence rates of $\small
+\mathcal{O}\Big(\frac{1}{\sqrt{KT}}+\frac{1}{KT(1-\psi)^2}\Big)$ and $\small
+\mathcal{O}\Big(\frac{1}{\sqrt{KT}}+\frac{1}{KT(1-\psi)^2}+
+\frac{1}{T^{3/2}K^{1/2}}\Big)$ in the non-convex setting for DFedADMM and
+DFedADMM-SAM, respectively, where $1 - \psi$ represents the spectral gap of the
+gossip matrix. Empirically, extensive experiments on the MNIST, CIFAR10 and
+CIFAR100 datasets demonstrate that our algorithms exhibit superior performance
+in terms of both generalization and convergence speed compared to existing
+state-of-the-art (SOTA) optimizers in DFL.
+
+
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ CARE: A Large Scale CT Image Dataset and Clinical Applicable Benchmark + Model for Rectal Cancer Segmentation + + +
+ Rectal cancer segmentation of CT image plays a crucial role in timely +clinical diagnosis, radiotherapy treatment, and follow-up. Although current +segmentation methods have shown promise in delineating cancerous tissues, they +still encounter challenges in achieving high segmentation precision. These +obstacles arise from the intricate anatomical structures of the rectum and the +difficulties in performing differential diagnosis of rectal cancer. +Additionally, a major obstacle is the lack of a large-scale, finely annotated +CT image dataset for rectal cancer segmentation. To address these issues, this +work introduces a novel large scale rectal cancer CT image dataset CARE with +pixel-level annotations for both normal and cancerous rectum, which serves as a +valuable resource for algorithm research and clinical application development. +Moreover, we propose a novel medical cancer lesion segmentation benchmark model +named U-SAM. The model is specifically designed to tackle the challenges posed +by the intricate anatomical structures of abdominal organs by incorporating +prompt information. U-SAM contains three key components: promptable information +(e.g., points) to aid in target area localization, a convolution module for +capturing low-level lesion details, and skip-connections to preserve and +recover spatial information during the encoding-decoding process. To evaluate +the effectiveness of U-SAM, we systematically compare its performance with +several popular segmentation methods on the CARE dataset. The generalization of +the model is further verified on the WORD dataset. Extensive experiments +demonstrate that the proposed U-SAM outperforms state-of-the-art methods on +these two datasets. These experiments can serve as the baseline for future +research and clinical application development. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ It Ain't That Bad: Understanding the Mysterious Performance Drop in OOD + Generalization for Generative Transformer Models + + +
+ Generative Transformer-based models have achieved remarkable proficiency on +solving diverse problems. However, their generalization ability is not fully +understood and not always satisfying. Researchers take basic mathematical tasks +like n-digit addition or multiplication as important perspectives for +investigating their generalization behaviors. Curiously, it is observed that +when training on n-digit operations (e.g., additions) in which both input +operands are n-digit in length, models generalize successfully on unseen +n-digit inputs (in-distribution (ID) generalization), but fail miserably and +mysteriously on longer, unseen cases (out-of-distribution (OOD) +generalization). Studies try to bridge this gap with workarounds such as +modifying position embedding, fine-tuning, and priming with more extensive or +instructive data. However, without addressing the essential mechanism, there is +hardly any guarantee regarding the robustness of these solutions. We bring this +unexplained performance drop into attention and ask whether it is purely from +random errors. Here we turn to the mechanistic line of research which has +notable successes in model interpretability. We discover that the strong ID +generalization stems from structured representations, while behind the +unsatisfying OOD performance, the models still exhibit clear learned algebraic +structures. Specifically, these models map unseen OOD inputs to outputs with +equivalence relations in the ID domain. These highlight the potential of the +models to carry useful information for improved generalization. + +
+
+
+
+
+ + ☆ Graph Relation Aware Continual Learning + + +
+ Continual graph learning (CGL) studies the problem of learning from an
+infinite stream of graph data, consolidating historical knowledge, and
+generalizing it to future tasks. At any point in time, only the current graph
+data are available. Although some recent attempts have been made to handle this
+task, we still face two potential challenges: 1) most existing works only
+manipulate the intermediate graph embedding and ignore intrinsic properties of
+graphs, and it is non-trivial to differentiate the transferred information
+across graphs; 2) recent attempts take a parameter-sharing policy to transfer
+knowledge across time steps or progressively expand a new architecture given
+the shifted graph distribution. Learning a single model could lose
+discriminative information for each graph task, while the model expansion
+scheme suffers from high model complexity. In this paper, we point out that
+latent relations behind graph edges can be attributed as an invariant factor
+for the evolving graphs, while the statistical information of latent relations
+evolves. Motivated by this, we design a relation-aware adaptive model, dubbed
+RAM-CG, that consists of a relation-discovery module to explore latent
+relations behind edges and a task-aware masking classifier to account for the
+shifted distribution. Extensive experiments show that RAM-CG provides
+significant accuracy improvements of 2.2%, 6.9% and 6.6% over the
+state-of-the-art results on the CitationNet, OGBN-arxiv and TWITCH datasets,
+respectively.
+
+
+
+
+
+
+ + ☆ Two Phases of Scaling Laws for Nearest Neighbor Classifiers + + +
+ A scaling law refers to the observation that the test performance of a model +improves as the number of training data increases. A fast scaling law implies +that one can solve machine learning problems by simply boosting the data and +the model sizes. Yet, in many cases, the benefit of adding more data can be +negligible. In this work, we study the rate of scaling laws of nearest neighbor +classifiers. We show that a scaling law can have two phases: in the first +phase, the generalization error depends polynomially on the data dimension and +decreases fast; whereas in the second phase, the error depends exponentially on +the data dimension and decreases slowly. Our analysis highlights the complexity +of the data distribution in determining the generalization error. When the data +distributes benignly, our result suggests that nearest neighbor classifier can +achieve a generalization error that depends polynomially, instead of +exponentially, on the data dimension. + +
+
+
+
+
+ + ☆ The Expressive Power of Graph Neural Networks: A Survey + + +
+ Graph neural networks (GNNs) are effective machine learning models for many
+graph-related applications. Despite their empirical success, many research
+efforts focus on the theoretical limitations of GNNs, i.e., the expressive
+power of GNNs. Early works in this domain mainly focus on studying the graph
+isomorphism recognition ability of GNNs, and recent works try to leverage
+properties such as subgraph counting and connectivity learning to characterize
+the expressive power of GNNs, which are more practical and closer to real-world
+settings. However, no survey paper or open-source repository comprehensively
+summarizes and discusses models in this important direction. To fill the gap,
+we conduct a first survey of models for enhancing expressive power under
+different forms of definition. Concretely, the models are reviewed based on
+three categories, i.e., Graph feature enhancement, Graph topology enhancement,
+and GNN architecture enhancement.
+
+
+
+
+
+
+ + ☆ Challenges and Opportunities of Using Transformer-Based Multi-Task + Learning in NLP Through ML Lifecycle: A Survey + + +
+ The increasing adoption of natural language processing (NLP) models across +industries has led to practitioners' need for machine learning systems to +handle these models efficiently, from training to serving them in production. +However, training, deploying, and updating multiple models can be complex, +costly, and time-consuming, mainly when using transformer-based pre-trained +language models. Multi-Task Learning (MTL) has emerged as a promising approach +to improve efficiency and performance through joint training, rather than +training separate models. Motivated by this, we first provide an overview of +transformer-based MTL approaches in NLP. Then, we discuss the challenges and +opportunities of using MTL approaches throughout typical ML lifecycle phases, +specifically focusing on the challenges related to data engineering, model +development, deployment, and monitoring phases. This survey focuses on +transformer-based MTL architectures and, to the best of our knowledge, is novel +in that it systematically analyses how transformer-based MTL in NLP fits into +ML lifecycle phases. Furthermore, we motivate research on the connection +between MTL and continual learning (CL), as this area remains unexplored. We +believe it would be practical to have a model that can handle both MTL and CL, +as this would make it easier to periodically re-train the model, update it due +to distribution shifts, and add new capabilities to meet real-world +requirements. + +
+
+
+
+
+ + ☆ SCQPTH: an efficient differentiable splitting method for convex + quadratic programming + + +
+ We present SCQPTH: a differentiable first-order splitting method for
+convex quadratic programs. The SCQPTH framework is based on the alternating
+direction method of multipliers (ADMM) and the software implementation is
+motivated by the state-of-the-art solver OSQP: an operator splitting solver
+for convex quadratic programs (QPs). The SCQPTH software is made available
+as an open-source Python package and contains many similar features,
+including efficient reuse of matrix factorizations, infeasibility
+detection, automatic scaling and parameter selection. The forward pass
+algorithm performs operator splitting in the dimension of the original
+problem space and is therefore suitable for large-scale QPs with
+$100-1000$ decision variables and thousands of constraints. Backpropagation
+is performed by implicit differentiation of the ADMM fixed-point mapping.
+Experiments demonstrate that for large-scale QPs, SCQPTH can provide a
+$1\times - 10\times$ improvement in computational efficiency in comparison
+to existing differentiable QP solvers.
+
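+ For readers unfamiliar with the underlying splitting scheme, the sketch
+below shows ADMM applied to a box-constrained convex QP, minimize
+(1/2) x'Px + q'x subject to l <= x <= u. This is a simplified illustration,
+not the SCQPTH or OSQP implementation; it only demonstrates the splitting
+pattern and the one-time matrix factorization that is reused across
+iterations.
+
+import numpy as np
+from scipy.linalg import cho_factor, cho_solve
+
+def admm_box_qp(P, q, l, u, rho=1.0, iters=500):
+    n = P.shape[0]
+    x = z = y = np.zeros(n)
+    chol = cho_factor(P + rho * np.eye(n))      # factor once, reuse each step
+    for _ in range(iters):
+        x = cho_solve(chol, rho * (z - y) - q)  # x-update: linear solve
+        z = np.clip(x + y, l, u)                # z-update: project onto box
+        y = y + x - z                           # scaled dual update
+    return z
+
+P = np.array([[4.0, 1.0], [1.0, 2.0]])
+q = np.array([1.0, 1.0])
+print(admm_box_qp(P, q, l=np.zeros(2), u=np.ones(2)))  # ~[0, 0] here
+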
+
+
+
+
+ + ☆ Exploring Winograd Convolution for Cost-effective Neural Network Fault + Tolerance + + +
+ Winograd convolution is generally utilized to optimize convolution
+performance and computational efficiency because of its reduced
+multiplication operations, but the reliability issues it brings are usually
+overlooked. In this work, we observe the great potential of Winograd
+convolution in improving neural network (NN) fault tolerance. Based on this
+observation, we evaluate the fault tolerance of Winograd convolution
+comprehensively, for the first time, at different granularities ranging
+from models and layers to operation types. Then, we explore the use of the
+inherent fault tolerance of Winograd convolution for cost-effective NN
+protection against soft errors. Specifically, we mainly investigate how
+Winograd convolution can be effectively incorporated with classical
+fault-tolerant design approaches including triple modular redundancy (TMR),
+fault-aware retraining, and constrained activation functions. According to
+our experiments, Winograd convolution can reduce the fault-tolerant design
+overhead by 55.77\% on average without any accuracy loss compared to
+standard convolution, and further reduce the computing overhead by 17.24\%
+when the inherent fault tolerance of Winograd convolution is considered.
+When it is applied to fault-tolerant neural networks enhanced with
+fault-aware retraining and constrained activation functions, the resulting
+model accuracy generally shows significant improvement in the presence of
+various faults.
+
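+ As background (standard material, not taken from the paper), the 1D
+Winograd algorithm F(2,3) below computes two outputs of a 3-tap filter with
+4 multiplications instead of the 6 required by direct computation, which is
+the source of the efficiency gains discussed above.
+
+import numpy as np
+
+def winograd_f23(d, g):
+    # d: input tile of length 4, g: filter of length 3 (cross-correlation)
+    m1 = (d[0] - d[2]) * g[0]
+    m2 = (d[1] + d[2]) * (g[0] + g[1] + g[2]) / 2
+    m3 = (d[2] - d[1]) * (g[0] - g[1] + g[2]) / 2
+    m4 = (d[1] - d[3]) * g[2]
+    return np.array([m1 + m2 + m3, m2 - m3 - m4])
+
+d = np.array([1.0, 2.0, 3.0, 4.0])
+g = np.array([0.5, -1.0, 2.0])
+direct = np.array([np.dot(d[0:3], g), np.dot(d[1:4], g)])
+print(winograd_f23(d, g), direct)   # the two results agree
+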
+
+
+
+
+ + ☆ Inherent Redundancy in Spiking Neural Networks ICCV2023 + + +
+ Spiking Neural Networks (SNNs) are well known as a promising
+energy-efficient alternative to conventional artificial neural networks.
+Owing to the preconceived impression that SNNs fire sparsely, the analysis
+and optimization of inherent redundancy in SNNs have been largely
+overlooked, and thus the potential advantages of spike-based neuromorphic
+computing in accuracy and energy efficiency are compromised. In this work,
+we pose and focus on three key questions regarding the inherent redundancy
+in SNNs. We argue that the redundancy is induced by the spatio-temporal
+invariance of SNNs, which enhances the efficiency of parameter utilization
+but also introduces many noise spikes. Further, we analyze the effect of
+spatio-temporal invariance on the spatio-temporal dynamics and spike firing
+of SNNs. Then, motivated by these analyses, we propose an Advance Spatial
+Attention (ASA) module to harness SNNs' redundancy, which can adaptively
+optimize their membrane potential distribution with a pair of individual
+spatial attention sub-modules. In this way, noise spike features are
+accurately regulated. Experimental results demonstrate that the proposed
+method can significantly reduce spike firing while achieving better
+performance than state-of-the-art SNN baselines. Our code is available at
+\url{https://github.com/BICLab/ASA-SNN}.
+
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ How To Overcome Confirmation Bias in Semi-Supervised Image + Classification By Active Learning ECML + + +
+ Do we need active learning? The rise of strong deep semi-supervised
+methods raises doubt about the usability of active learning in limited
+labeled data settings. This is caused by results showing that combining
+semi-supervised learning (SSL) methods with a random selection for labeling
+can outperform existing active learning (AL) techniques. However, these
+results are obtained from experiments on well-established benchmark
+datasets that can overestimate the external validity. Moreover, the
+literature lacks sufficient research on the performance of active
+semi-supervised learning methods in realistic data scenarios, leaving a
+notable gap in our understanding. Therefore, we present three data
+challenges common in real-world applications: between-class imbalance,
+within-class imbalance, and between-class similarity. These challenges can
+hurt SSL performance due to confirmation bias. We conduct experiments with
+SSL and AL on simulated data challenges and find that random sampling does
+not mitigate confirmation bias and, in some cases, leads to worse
+performance than supervised learning. In contrast, we demonstrate that AL
+can overcome confirmation bias in SSL in these realistic settings. Our
+results provide insights into the potential of combining active and
+semi-supervised learning in the presence of common real-world challenges,
+which is a promising direction for robust methods when learning with
+limited labeled data in real-world applications.
+
+
+ comment: Accepted @ ECML PKDD 2023. This is the author's version of the work. + The definitive Version of Record will be published in the Proceedings of ECML + PKDD 2023 +
+
+
+
+
+ + ☆ HyperSNN: A new efficient and robust deep learning model for resource + constrained control applications + + +
+ In light of the increasing adoption of edge computing in areas such as +intelligent furniture, robotics, and smart homes, this paper introduces +HyperSNN, an innovative method for control tasks that uses spiking neural +networks (SNNs) in combination with hyperdimensional computing. HyperSNN +substitutes expensive 32-bit floating point multiplications with 8-bit integer +additions, resulting in reduced energy consumption while enhancing robustness +and potentially improving accuracy. Our model was tested on AI Gym benchmarks, +including Cartpole, Acrobot, MountainCar, and Lunar Lander. HyperSNN achieves +control accuracies that are on par with conventional machine learning methods +but with only 1.36% to 9.96% of the energy expenditure. Furthermore, our +experiments showed increased robustness when using HyperSNN. We believe that +HyperSNN is especially suitable for interactive, mobile, and wearable devices, +promoting energy-efficient and robust system design. Furthermore, it paves the +way for the practical implementation of complex algorithms like model +predictive control (MPC) in real-world industrial scenarios. + +
+
+
+
+
+ + ☆ Epicure: Distilling Sequence Model Predictions into Patterns + + +
+ Most machine learning models predict a probability distribution over concrete +outputs and struggle to accurately predict names over high entropy sequence +distributions. Here, we explore finding abstract, high-precision patterns +intrinsic to these predictions in order to make abstract predictions that +usefully capture rare sequences. In this short paper, we present Epicure, a +method that distils the predictions of a sequence model, such as the output of +beam search, into simple patterns. Epicure maps a model's predictions into a +lattice that represents increasingly more general patterns that subsume the +concrete model predictions. + On the tasks of predicting a descriptive name of a function given the source +code of its body and detecting anomalous names given a function, we show that +Epicure yields accurate naming patterns that match the ground truth more often +compared to just the highest probability model prediction. For a false alarm +rate of 10%, Epicure predicts patterns that match 61% more ground-truth names +compared to the best model prediction, making Epicure well-suited for scenarios +that require high precision. + +
+
+
+
+
+ + ☆ DeSCo: Towards Generalizable and Scalable Deep Subgraph Counting + + +
+ Subgraph counting is the problem of counting the occurrences of a given query +graph in a large target graph. Large-scale subgraph counting is useful in +various domains, such as motif counting for social network analysis and loop +counting for money laundering detection on transaction networks. Recently, to +address the exponential runtime complexity of scalable subgraph counting, +neural methods are proposed. However, existing neural counting approaches fall +short in three aspects. Firstly, the counts of the same query can vary from +zero to millions on different target graphs, posing a much larger challenge +than most graph regression tasks. Secondly, current scalable graph neural +networks have limited expressive power and fail to efficiently distinguish +graphs in count prediction. Furthermore, existing neural approaches cannot +predict the occurrence position of queries in the target graph. + Here we design DeSCo, a scalable neural deep subgraph counting pipeline, +which aims to accurately predict the query count and occurrence position on any +target graph after one-time training. Firstly, DeSCo uses a novel canonical +partition and divides the large target graph into small neighborhood graphs. +The technique greatly reduces the count variation while guaranteeing no missing +or double-counting. Secondly, neighborhood counting uses an expressive +subgraph-based heterogeneous graph neural network to accurately perform +counting in each neighborhood. Finally, gossip propagation propagates +neighborhood counts with learnable gates to harness the inductive biases of +motif counts. DeSCo is evaluated on eight real-world datasets from various +domains. It outperforms state-of-the-art neural methods with 137x improvement +in the mean squared error of count prediction, while maintaining the polynomial +runtime complexity. + +
+
+ comment: 8 pages main text, 10 pages appendix +
+
+
+
+
+ + ☆ Endogenous Macrodynamics in Algorithmic Recourse + + +
+ Existing work on Counterfactual Explanations (CE) and Algorithmic
+Recourse (AR) has largely focused on single individuals in a static
+environment: given some estimated model, the goal is to find valid
+counterfactuals for an individual instance that fulfill various desiderata.
+The ability of such counterfactuals to handle dynamics like data and model
+drift remains a largely unexplored research challenge. There has also been
+surprisingly little work on the related question of how the actual
+implementation of recourse by one individual may affect other individuals.
+Through this work, we aim to close that gap. We first show that many of the
+existing methodologies can be collectively described by a generalized
+framework. We then argue that the existing framework does not account for a
+hidden external cost of recourse, which only reveals itself when studying
+the endogenous dynamics of recourse at the group level. Through simulation
+experiments involving various state-of-the-art counterfactual generators
+and several benchmark datasets, we generate large numbers of
+counterfactuals and study the resulting domain and model shifts. We find
+that the induced shifts are substantial enough to likely impede the
+applicability of Algorithmic Recourse in some situations. Fortunately, we
+find various strategies to mitigate these concerns. Our simulation
+framework for studying recourse dynamics is fast and open-sourced.
+
+
+ comment: 12 pages, 11 figures. Originally published at the 2023 IEEE + Conference on Secure and Trustworthy Machine Learning (SaTML). IEEE holds the + copyright +
+
+
+
+
+ + ☆ Accelerating Generic Graph Neural Networks via Architecture, Compiler, + Partition Method Co-Design + + +
+ Graph neural networks (GNNs) have shown significant accuracy improvements in +a variety of graph learning domains, sparking considerable research interest. +To translate these accuracy improvements into practical applications, it is +essential to develop high-performance and efficient hardware acceleration for +GNN models. However, designing GNN accelerators faces two fundamental +challenges: the high bandwidth requirement of GNN models and the diversity of +GNN models. Previous works have addressed the first challenge by using more +expensive memory interfaces to achieve higher bandwidth. For the second +challenge, existing works either support specific GNN models or have generic +designs with poor hardware utilization. + In this work, we tackle both challenges simultaneously. First, we identify a +new type of partition-level operator fusion, which we utilize to internally +reduce the high bandwidth requirement of GNNs. Next, we introduce +partition-level multi-threading to schedule the concurrent processing of graph +partitions, utilizing different hardware resources. To further reduce the extra +on-chip memory required by multi-threading, we propose fine-grained graph +partitioning to generate denser graph partitions. Importantly, these three +methods make no assumptions about the targeted GNN models, addressing the +challenge of model variety. We implement these methods in a framework called +SwitchBlade, consisting of a compiler, a graph partitioner, and a hardware +accelerator. Our evaluation demonstrates that SwitchBlade achieves an average +speedup of $1.85\times$ and energy savings of $19.03\times$ compared to the +NVIDIA V100 GPU. Additionally, SwitchBlade delivers performance comparable to +state-of-the-art specialized accelerators. + +
+
+
+
+
+ + ☆ Expressivity of Graph Neural Networks Through the Lens of Adversarial + Robustness + + +
+ We perform the first adversarial robustness study into Graph Neural Networks +(GNNs) that are provably more powerful than traditional Message Passing Neural +Networks (MPNNs). In particular, we use adversarial robustness as a tool to +uncover a significant gap between their theoretically possible and empirically +achieved expressive power. To do so, we focus on the ability of GNNs to count +specific subgraph patterns, which is an established measure of expressivity, +and extend the concept of adversarial robustness to this task. Based on this, +we develop efficient adversarial attacks for subgraph counting and show that +more powerful GNNs fail to generalize even to small perturbations to the +graph's structure. Expanding on this, we show that such architectures also fail +to count substructures on out-of-distribution graphs. + +
+
+ comment: Published in ${2}^{nd}$ AdvML Frontiers workshop at ${40}^{th}$ + International Conference on Machine Learning +
+
+
+
+
+ + ☆ AATCT-IDS: A Benchmark Abdominal Adipose Tissue CT Image Dataset for + Image Denoising, Semantic Segmentation, and Radiomics Evaluation + + +
+ Methods: In this study, a benchmark \emph{Abdominal Adipose Tissue CT
+Image Dataset} (AATTCT-IDS) containing 300 subjects is prepared and
+published. AATTCT-IDS releases 13,732 raw CT slices, and the researchers
+individually annotate the subcutaneous and visceral adipose tissue regions
+of 3,213 of those slices that have the same slice distance to validate
+denoising methods, train semantic segmentation models, and study radiomics.
+For different tasks, this paper compares and analyzes the performance of
+various methods on AATTCT-IDS by combining the visualization results and
+evaluation data, thereby verifying the research potential of this dataset
+for the above three types of tasks.
+ Results: In the comparative study of image denoising, algorithms using a
+smoothing strategy suppress mixed noise at the expense of image details and
+obtain better evaluation data. Methods such as BM3D preserve the original
+image structure better, although their evaluation data are slightly lower.
+The results show significant differences among the compared methods. In the
+comparative study of semantic segmentation of abdominal adipose tissue, the
+segmentation results of adipose tissue by each model show different
+structural characteristics. Among them, BiSeNet obtains segmentation
+results only slightly inferior to U-Net with the shortest training time and
+effectively separates small and isolated adipose tissue. In addition, the
+radiomics study based on AATTCT-IDS reveals three adipose distributions in
+the subject population.
+ Conclusion: AATTCT-IDS contains the ground truth of adipose tissue regions
+in abdominal CT slices. This open-source dataset can attract researchers to
+explore the multi-dimensional characteristics of abdominal adipose tissue
+and thus help physicians and patients in clinical practice. AATTCT-IDS is
+freely published for non-commercial purposes at:
+\url{https://figshare.com/articles/dataset/AATTCT-IDS/23807256}.
+
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ☆ A Quantum Approximation Scheme for k-Means + + +
+ We give a quantum approximation scheme (i.e., $(1 + +\varepsilon)$-approximation for every $\varepsilon > 0$) for the classical +$k$-means clustering problem in the QRAM model with a running time that has +only polylogarithmic dependence on the number of data points. More +specifically, given a dataset $V$ with $N$ points in $\mathbb{R}^d$ stored in +QRAM data structure, our quantum algorithm runs in time $\tilde{O} \left( +2^{\tilde{O}(\frac{k}{\varepsilon})} \eta^2 d\right)$ and with high probability +outputs a set $C$ of $k$ centers such that $cost(V, C) \leq (1+\varepsilon) +\cdot cost(V, C_{OPT})$. Here $C_{OPT}$ denotes the optimal $k$-centers, +$cost(.)$ denotes the standard $k$-means cost function (i.e., the sum of the +squared distance of points to the closest center), and $\eta$ is the aspect +ratio (i.e., the ratio of maximum distance to minimum distance). This is the +first quantum algorithm with a polylogarithmic running time that gives a +provable approximation guarantee of $(1+\varepsilon)$ for the $k$-means +problem. Also, unlike previous works on unsupervised learning, our quantum +algorithm does not require quantum linear algebra subroutines and has a running +time independent of parameters (e.g., condition number) that appear in such +procedures. + +
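+ For reference, the k-means cost $cost(V, C)$ appearing in the guarantee
+above is the sum of squared distances from each point to its nearest
+center; a minimal NumPy sketch of this (purely classical) quantity:
+
+import numpy as np
+
+def kmeans_cost(V, C):
+    # V: (N, d) data points, C: (k, d) centers
+    d2 = ((V[:, None, :] - C[None, :, :]) ** 2).sum(-1)  # squared distances
+    return d2.min(axis=1).sum()                          # nearest center each
+
+V = np.array([[0.0, 0.0], [1.0, 0.0], [10.0, 10.0]])
+C = np.array([[0.5, 0.0], [10.0, 10.0]])
+print(kmeans_cost(V, C))   # 0.25 + 0.25 + 0.0 = 0.5
+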
+
+
+
+
+ + ☆ Characteristics of networks generated by kernel growing neural gas + + +
+ This research aims to develop kernel GNG, a kernelized version of the growing +neural gas (GNG) algorithm, and to investigate the features of the networks +generated by the kernel GNG. The GNG is an unsupervised artificial neural +network that can transform a dataset into an undirected graph, thereby +extracting the features of the dataset as a graph. The GNG is widely used in +vector quantization, clustering, and 3D graphics. Kernel methods are often used +to map a dataset to feature space, with support vector machines being the most +prominent application. This paper introduces the kernel GNG approach and +explores the characteristics of the networks generated by kernel GNG. Five +kernels, including Gaussian, Laplacian, Cauchy, inverse multiquadric, and log +kernels, are used in this study. + +
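+ For concreteness, the five kernels named above admit the standard
+textbook forms sketched below; the exact parameterizations used in the
+paper may differ.
+
+import numpy as np
+
+def gaussian(x, y, sigma=1.0):
+    return np.exp(-np.sum((x - y) ** 2) / (2 * sigma ** 2))
+
+def laplacian(x, y, sigma=1.0):
+    return np.exp(-np.linalg.norm(x - y) / sigma)
+
+def cauchy(x, y, sigma=1.0):
+    return 1.0 / (1.0 + np.sum((x - y) ** 2) / sigma ** 2)
+
+def inverse_multiquadric(x, y, c=1.0):
+    return 1.0 / np.sqrt(np.sum((x - y) ** 2) + c ** 2)
+
+def log_kernel(x, y, d=2.0):
+    return -np.log(np.linalg.norm(x - y) ** d + 1.0)
+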
+
+
+
+
+ + ☆ Interpretability Benchmark for Evaluating Spatial Misalignment of + Prototypical Parts Explanations + + +
+ Prototypical parts-based networks are becoming increasingly popular due to +their faithful self-explanations. However, their similarity maps are calculated +in the penultimate network layer. Therefore, the receptive field of the +prototype activation region often depends on parts of the image outside this +region, which can lead to misleading interpretations. We name this undesired +behavior a spatial explanation misalignment and introduce an interpretability +benchmark with a set of dedicated metrics for quantifying this phenomenon. In +addition, we propose a method for misalignment compensation and apply it to +existing state-of-the-art models. We show the expressiveness of our benchmark +and the effectiveness of the proposed compensation methodology through +extensive empirical studies. + +
+
+          comment: Under review. Code will be released upon acceptance
+
+
+
+
+ + ☆ Benchmarking Adversarial Robustness of Compressed Deep Learning Models + + +
+ The increasing size of Deep Neural Networks (DNNs) poses a pressing need
+for model compression, particularly when employed on resource-constrained
+devices. Concurrently, the susceptibility of DNNs to adversarial attacks
+presents another significant hurdle. Despite substantial research on both
+model compression and adversarial robustness, their joint examination
+remains underexplored. Our study bridges this gap, seeking to understand
+the effect of adversarial inputs crafted for base models on their pruned
+versions. To examine this relationship, we have developed a comprehensive
+benchmark across diverse adversarial attacks and popular DNN models. We
+uniquely focus on models not previously exposed to adversarial training and
+apply pruning schemes optimized for accuracy and performance. Our findings
+reveal that while the benefits of pruning (enhanced generalizability,
+compression, and faster inference times) are preserved, adversarial
+robustness remains comparable to the base model. This suggests that model
+compression, while offering its unique advantages, does not undermine
+adversarial robustness.
+
+
+
+
+
+ + ☆ Deep Generative Imputation Model for Missing Not At Random Data + + +
+ Data analysis usually suffers from the Missing Not At Random (MNAR)
+problem, where the cause of the value missing is not fully observed.
+Compared to the naive Missing Completely At Random (MCAR) problem, it is
+more in line with realistic scenarios, while also being more complex and
+challenging. Existing statistical methods model the MNAR mechanism by
+different decompositions of the joint distribution of the complete data and
+the missing mask. However, we empirically find that directly incorporating
+these statistical methods into deep generative models is sub-optimal.
+Specifically, doing so neglects the confidence of the reconstructed mask
+during the MNAR imputation process, which leads to insufficient information
+extraction and less-guaranteed imputation quality. In this paper, we
+revisit the MNAR problem from a novel perspective in which the complete
+data and the missing mask are two modalities of incomplete data on an equal
+footing. Along this line, we put forward a generative-model-specific joint
+probability decomposition method, the conjunction model, to represent the
+distributions of the two modalities in parallel and extract sufficient
+information from both the complete data and the missing mask. Taking a step
+further, we exploit a deep generative imputation model, namely GNR, to
+process the real-world missing mechanism in the latent space and
+concurrently impute the incomplete data and reconstruct the missing mask.
+The experimental results show that our GNR surpasses state-of-the-art MNAR
+baselines by significant margins (average RMSE improvements ranging from
+9.9% to 18.8%) and always gives better mask reconstruction accuracy, which
+makes the imputation more principled.
+
+
+
+
+
+ + ☆ Sarcasm Detection in a Disaster Context + + +
+ During natural disasters, people often use social media platforms such as +Twitter to ask for help, to provide information about the disaster situation, +or to express contempt about the unfolding event or public policies and +guidelines. This contempt is in some cases expressed as sarcasm or irony. +Understanding this form of speech in a disaster-centric context is essential to +improving natural language understanding of disaster-related tweets. In this +paper, we introduce HurricaneSARC, a dataset of 15,000 tweets annotated for +intended sarcasm, and provide a comprehensive investigation of sarcasm +detection using pre-trained language models. Our best model is able to obtain +as much as 0.70 F1 on our dataset. We also demonstrate that the performance on +HurricaneSARC can be improved by leveraging intermediate task transfer +learning. We release our data and code at +https://github.com/tsosea2/HurricaneSarc. + +
+
+
+
+
+ + ☆ Hierarchical Topological Ordering with Conditional Independence Test for + Limited Time Series + + +
+ Learning directed acyclic graphs (DAGs) to identify causal relations +underlying observational data is crucial but also poses significant challenges. +Recently, topology-based methods have emerged as a two-step approach to +discovering DAGs by first learning the topological ordering of variables and +then eliminating redundant edges, while ensuring that the graph remains +acyclic. However, one limitation is that these methods would generate numerous +spurious edges that require subsequent pruning. To overcome this limitation, in +this paper, we propose an improvement to topology-based methods by introducing +limited time series data, consisting of only two cross-sectional records that +need not be adjacent in time and are subject to flexible timing. By +incorporating conditional instrumental variables as exogenous interventions, we +aim to identify descendant nodes for each variable. Following this line, we +propose a hierarchical topological ordering algorithm with conditional +independence test (HT-CIT), which enables the efficient learning of sparse DAGs +with a smaller search space compared to other popular approaches. The HT-CIT +algorithm greatly reduces the number of edges that need to be pruned. Empirical +results from synthetic and real-world datasets demonstrate the superiority of +the proposed HT-CIT algorithm. + +
+
+
+
+
+ + ☆ Online Control for Linear Dynamics: A Data-Driven Approach + + +
+ This paper considers an online control problem over a linear
+time-invariant system with unknown dynamics, bounded disturbance, and
+adversarial cost. We propose a data-driven strategy to reduce the regret of
+the controller. Unlike model-based methods, our algorithm does not identify
+the system model; instead, it leverages a single noise-free trajectory to
+calculate the accumulated disturbance and makes decisions using the
+accumulated-disturbance action controller we design, whose parameters are
+updated by online gradient descent. We prove that the regret of our
+algorithm is $\mathcal{O}(\sqrt{T})$ under mild assumptions, suggesting
+that its performance is on par with that of model-based methods.
+
+
+
+
+
+ + ☆ Microstructure-Empowered Stock Factor Extraction and Utilization + + +
+ High-frequency quantitative investment is a crucial aspect of stock
+investment. Notably, order flow data plays a critical role as it provides
+the most detailed level of information among high-frequency trading data,
+including comprehensive data from the order book and transaction records at
+the tick level. The order flow data is extremely valuable for market
+analysis as it equips traders with essential insights for making informed
+decisions. However, extracting and effectively utilizing order flow data
+present challenges due to the large volume of data involved and the
+limitations of traditional factor mining techniques, which are primarily
+designed for coarser-level stock data. To address these challenges, we
+propose a novel framework that aims to effectively extract essential
+factors from order flow data for diverse downstream tasks across different
+granularities and scenarios. Our method consists of a Context Encoder and a
+Factor Extractor. The Context Encoder learns an embedding for the current
+order flow data segment's context by considering both the expected and
+actual market state. In addition, the Factor Extractor uses unsupervised
+learning methods to select the important signals that are most distinct
+from the majority within the given context. The extracted factors are then
+utilized for downstream tasks. In empirical studies, our proposed framework
+efficiently handles an entire year of stock order flow data across diverse
+scenarios, offering a broader range of applications compared to existing
+tick-level approaches that are limited to only a few days of stock data. We
+demonstrate that our method extracts superior factors from order flow data,
+enabling significant improvements for stock trend prediction and order
+execution tasks at the second and minute levels.
+
+
+
+
+
+ + ☆ Is Self-Supervised Pretraining Good for Extrapolation in Molecular + Property Prediction? + + +
+ The prediction of material properties plays a crucial role in the
+development and discovery of materials in diverse applications, such as
+batteries, semiconductors, catalysts, and pharmaceuticals. Recently, there
+has been a growing interest in employing data-driven approaches by using
+machine learning technologies, in combination with conventional theoretical
+calculations. In material science, the prediction of unobserved values,
+commonly referred to as extrapolation, is particularly critical for
+property prediction as it enables researchers to gain insight into
+materials beyond the limits of available data. However, even with the
+recent advancements in powerful machine learning models, accurate
+extrapolation is still widely recognized as a significantly challenging
+problem. On the other hand, self-supervised pretraining is a machine
+learning technique where a model is first trained on unlabeled data using
+relatively simple pretext tasks before being trained on labeled data for
+target tasks. As self-supervised pretraining can effectively utilize
+material data without observed property values, it has the potential to
+improve the model's extrapolation ability. In this paper, we clarify how
+such self-supervised pretraining can enhance extrapolation performance. We
+propose an experimental framework for this demonstration and empirically
+reveal that while models were unable to accurately extrapolate absolute
+property values, self-supervised pretraining enables them to learn the
+relative tendencies of unobserved property values and improve extrapolation
+performance.
+
+
+
+
+
+ + ☆ How to Mask in Error Correction Code Transformer: Systematic and Double + Masking + + +
+ In communication and storage systems, error correction codes (ECCs) are +pivotal in ensuring data reliability. As deep learning's applicability has +broadened across diverse domains, there is a growing research focus on neural +network-based decoders that outperform traditional decoding algorithms. Among +these neural decoders, Error Correction Code Transformer (ECCT) has achieved +the state-of-the-art performance, outperforming other methods by large margins. +To further enhance the performance of ECCT, we propose two novel methods. +First, leveraging the systematic encoding technique of ECCs, we introduce a new +masking matrix for ECCT, aiming to improve the performance and reduce the +computational complexity. Second, we propose a novel transformer architecture +of ECCT called a double-masked ECCT. This architecture employs two different +mask matrices in a parallel manner to learn more diverse features of the +relationship between codeword bits in the masked self-attention blocks. +Extensive simulation results show that the proposed double-masked ECCT +outperforms the conventional ECCT, achieving the state-of-the-art decoding +performance with significant margins. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ S-Mixup: Structural Mixup for Graph Neural Networks CIKM 2023 + + +
+ Existing studies for applying the mixup technique on graphs mainly focus on +graph classification tasks, while the research in node classification is still +under-explored. In this paper, we propose a novel mixup augmentation for node +classification called Structural Mixup (S-Mixup). The core idea is to take into +account the structural information while mixing nodes. Specifically, S-Mixup +obtains pseudo-labels for unlabeled nodes in a graph along with their +prediction confidence via a Graph Neural Network (GNN) classifier. These serve +as the criteria for the composition of the mixup pool for both inter and +intra-class mixups. Furthermore, we utilize the edge gradient obtained from the +GNN training and propose a gradient-based edge selection strategy for selecting +edges to be attached to the nodes generated by the mixup. Through extensive +experiments on real-world benchmark datasets, we demonstrate the effectiveness +of S-Mixup evaluated on the node classification task. We observe that S-Mixup +enhances the robustness and generalization performance of GNNs, especially in +heterophilous situations. The source code of S-Mixup can be found at +\url{https://github.com/SukwonYun/S-Mixup} + +
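+ A highly simplified sketch of the node-mixing step (hypothetical helper,
+not the official S-Mixup code, which additionally distinguishes inter- and
+intra-class pairs and uses edge gradients): confident pseudo-labeled nodes
+form the mixup pool and their features are interpolated.
+
+import torch
+
+def mix_node_pairs(x, pseudo_labels, confidence, lam=0.5, threshold=0.9):
+    # x: (N, F) node features; pseudo_labels / confidence from a GNN classifier
+    pool = torch.nonzero(confidence > threshold).flatten()   # mixup pool
+    partner = pool[torch.randperm(pool.numel())]             # random pairing
+    mixed_x = lam * x[pool] + (1.0 - lam) * x[partner]       # feature mixup
+    return mixed_x, (pseudo_labels[pool], pseudo_labels[partner], lam)
+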
+
+ comment: CIKM 2023 (Short Paper) +
+
+
+
+
+ + ☆ Safety Filter Design for Neural Network Systems via Convex Optimization + + +
+ With the increase in data availability, it has been widely demonstrated that +neural networks (NN) can capture complex system dynamics precisely in a +data-driven manner. However, the architectural complexity and nonlinearity of +the NNs make it challenging to synthesize a provably safe controller. In this +work, we propose a novel safety filter that relies on convex optimization to +ensure safety for a NN system, subject to additive disturbances that are +capable of capturing modeling errors. Our approach leverages tools from NN +verification to over-approximate NN dynamics with a set of linear bounds, +followed by an application of robust linear MPC to search for controllers that +can guarantee robust constraint satisfaction. We demonstrate the efficacy of +the proposed framework numerically on a nonlinear pendulum system. + +
+
+ comment: This paper has been accepted to the 2023 62nd IEEE Conference on + Decision and Control (CDC) +
+
+
+
+
+ + ☆ Rigid Transformations for Stabilized Lower Dimensional Space to Support + Subsurface Uncertainty Quantification and Interpretation + + +
+ Subsurface datasets inherently possess big data characteristics such as
+vast volume, diverse features, and high sampling speeds, further compounded
+by the curse of dimensionality from various physical, engineering, and
+geological inputs. Among the existing dimensionality reduction (DR)
+methods, nonlinear dimensionality reduction (NDR) methods, especially
+metric multidimensional scaling (MDS), are preferred for subsurface
+datasets due to their inherent complexity. While MDS retains the intrinsic
+data structure and quantifies uncertainty, its limitations include
+solutions that are unique only up to Euclidean transformations and the
+absence of an out-of-sample point (OOSP) extension. To enhance subsurface
+inferential and machine learning workflows, datasets must be transformed
+into stable, reduced-dimension representations that accommodate OOSP.
+ Our solution employs rigid transformations to obtain a stabilized,
+Euclidean-invariant representation of the lower-dimensional space (LDS). By
+computing an MDS input dissimilarity matrix and applying rigid
+transformations on multiple realizations, we ensure transformation
+invariance and integrate OOSP. This process leverages a convex hull
+algorithm and incorporates a loss function and normalized stress for
+distortion quantification. We validate our approach with synthetic data,
+varying distance metrics, and real-world wells from the Duvernay Formation.
+Results confirm our method's efficacy in achieving consistent LDS
+representations. Furthermore, our proposed "stress ratio" (SR) metric
+provides insight into uncertainty, beneficial for model adjustments and
+inferential analysis. Consequently, our workflow promises enhanced
+repeatability and comparability in NDR for subsurface energy resource
+engineering and associated big data workflows.
+
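+ A minimal sketch of the rigid-alignment idea (assumed details, not the
+authors' code): two MDS realizations of the same samples are brought into a
+common frame with an orthogonal Procrustes rotation and a translation, so
+that downstream comparisons become invariant to Euclidean transformations.
+
+import numpy as np
+from scipy.linalg import orthogonal_procrustes
+
+def rigid_align(Y, Y_ref):
+    # Y, Y_ref: (n, p) embeddings of the same samples from two MDS runs
+    mu, mu_ref = Y.mean(axis=0), Y_ref.mean(axis=0)
+    R, _ = orthogonal_procrustes(Y - mu, Y_ref - mu_ref)  # best rotation/reflection
+    return (Y - mu) @ R + mu_ref                          # rigidly mapped onto the reference
+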
+
+ comment: 30 pages, 17 figures, Submitted to Computational Geosciences Journal +
+
+
+
+
+ + ♻ ☆ AI-Assisted Discovery of Quantitative and Formal Models in Social + Science + + +
+ In social science, formal and quantitative models, such as ones describing +economic growth and collective action, are used to formulate mechanistic +explanations, provide predictions, and uncover questions about observed +phenomena. Here, we demonstrate the use of a machine learning system to aid the +discovery of symbolic models that capture nonlinear and dynamical relationships +in social science datasets. By extending neuro-symbolic methods to find compact +functions and differential equations in noisy and longitudinal data, we show +that our system can be used to discover interpretable models from real-world +data in economics and sociology. Augmenting existing workflows with symbolic +regression can help uncover novel relationships and explore counterfactual +models during the scientific process. We propose that this AI-assisted +framework can bridge parametric and non-parametric models commonly employed in +social science research by systematically exploring the space of nonlinear +models and enabling fine-grained control over expressivity and +interpretability. + +
+
+ comment: 19 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Data Selection: A Surprisingly Effective and General Principle for + Building Small Interpretable Models + + +
+ We present convincing empirical evidence for an effective and general
+strategy for building accurate small models. Such models are attractive for
+interpretability and also find use in resource-constrained environments.
+The strategy is to learn the training distribution instead of using data
+from the test distribution. The distribution learning algorithm is not a
+contribution of this work; we highlight the broad usefulness of this simple
+strategy on a diverse set of tasks, and as such these rigorous empirical
+results are our contribution. We apply it to the tasks of (1) building
+cluster explanation trees, (2) prototype-based classification, and (3)
+classification using Random Forests, and show that it improves the accuracy
+of weak traditional baselines to the point that they are surprisingly
+competitive with specialized modern techniques.
+ This strategy is also versatile with respect to the notion of model size.
+In the first two tasks, model size is identified by the number of leaves in
+the tree and the number of prototypes, respectively. In the final task
+involving Random Forests, the strategy is shown to be effective even when
+model size is determined by more than one factor: the number of trees and
+their maximum depth.
+ Positive results using multiple datasets are presented that are shown to
+be statistically significant. These lead us to conclude that this strategy
+is both effective, i.e., it leads to significant improvements, and general,
+i.e., it is applicable to different tasks and model families, and therefore
+merits further attention in domains that require small accurate models.
+
+
+
+
+
+ + ♻ ☆ Decision-Focused Learning: Foundations, State of the Art, Benchmark and + Future Opportunities + + +
+ Decision-focused learning (DFL) is an emerging paradigm in machine learning +which trains a model to optimize decisions, integrating prediction and +optimization in an end-to-end system. This paradigm holds the promise to +revolutionize decision-making in many real-world applications which operate +under uncertainty, where the estimation of unknown parameters within these +decision models often becomes a substantial roadblock. This paper presents a +comprehensive review of DFL. It provides an in-depth analysis of the various +techniques devised to integrate machine learning and optimization models, +introduces a taxonomy of DFL methods distinguished by their unique +characteristics, and conducts an extensive empirical evaluation of these +methods proposing suitable benchmark dataset and tasks for DFL. Finally, the +study provides valuable insights into current and potential future avenues in +DFL research. + +
+
+ comment: Experimental Survey and Benchmarking +
+
+
+
+
+ + ♻ ☆ Large-Scale Traffic Congestion Prediction based on Multimodal Fusion and + Representation Mapping + + +
+ With the progress of the urbanisation process, the urban transportation +system is extremely critical to the development of cities and the quality of +life of the citizens. Among them, it is one of the most important tasks to +judge traffic congestion by analysing the congestion factors. Recently, various +traditional and machine-learning-based models have been introduced for +predicting traffic congestion. However, these models are either poorly +aggregated for massive congestion factors or fail to make accurate predictions +for every precise location in large-scale space. To alleviate these problems, a +novel end-to-end framework based on convolutional neural networks is proposed +in this paper. With learning representations, the framework proposes a novel +multimodal fusion module and a novel representation mapping module to achieve +traffic congestion predictions on arbitrary query locations on a large-scale +map, combined with various global reference information. The proposed framework +achieves significant results and efficient inference on real-world large-scale +datasets. + +
+
+
+
+
+ + ♻ ☆ Sensitivity-Aware Mixed-Precision Quantization and Width Optimization of + Deep Neural Networks Through Cluster-Based Tree-Structured Parzen Estimation + + +
+ As the complexity and computational demands of deep learning models rise, the +need for effective optimization methods for neural network designs becomes +paramount. This work introduces an innovative search mechanism for +automatically selecting the best bit-width and layer-width for individual +neural network layers. This leads to a marked enhancement in deep neural +network efficiency. The search domain is strategically reduced by leveraging +Hessian-based pruning, ensuring the removal of non-crucial parameters. +Subsequently, we detail the development of surrogate models for favorable and +unfavorable outcomes by employing a cluster-based tree-structured Parzen +estimator. This strategy allows for a streamlined exploration of architectural +possibilities and swift pinpointing of top-performing designs. Through rigorous +testing on well-known datasets, our method proves its distinct advantage over +existing methods. Compared to leading compression strategies, our approach +records an impressive 20% decrease in model size without compromising accuracy. +Additionally, our method boasts a 12x reduction in search time relative to the +best search-focused strategies currently available. As a result, our proposed +method represents a leap forward in neural network design optimization, paving +the way for quick model design and implementation in settings with limited +resources, thereby propelling the potential of scalable deep learning +solutions. + +
+
+
+
+
+ + ♻ ☆ Box$^2$EL: Concept and Role Box Embeddings for the Description Logic + EL++ + + +
+ Description logic (DL) ontologies extend knowledge graphs (KGs) with +conceptual information and logical background knowledge. In recent years, there +has been growing interest in inductive reasoning techniques for such +ontologies, which promise to complement classical deductive reasoning +algorithms. Similar to KG completion, several existing approaches learn +ontology embeddings in a latent space, while additionally ensuring that they +faithfully capture the logical semantics of the underlying DL. However, they +suffer from several shortcomings, mainly due to a limiting role representation. +We propose Box$^2$EL, which represents both concepts and roles as boxes (i.e., +axis-aligned hyperrectangles) and demonstrate how it overcomes the limitations +of previous methods. We theoretically prove the soundness of our model and +conduct an extensive experimental evaluation, achieving state-of-the-art +results across a variety of datasets. As part of our evaluation, we introduce a +novel benchmark for subsumption prediction involving both atomic and complex +concepts. + +
+
+
+
+
+ + ♻ ☆ Disentangled Representation Learning + + +
+ Disentangled Representation Learning (DRL) aims to learn a model capable
+of identifying and disentangling the underlying factors hidden in the
+observable data in representation form. The process of separating
+underlying factors of variation into variables with semantic meaning
+benefits the learning of explainable representations of data, which
+imitates the meaningful understanding process of humans when observing an
+object or relation. As a general learning strategy, DRL has demonstrated
+its power in improving model explainability, controllability, robustness,
+as well as generalization capacity in a wide range of scenarios such as
+computer vision, natural language processing, and data mining. In this
+article, we comprehensively review DRL from various aspects including
+motivations, definitions, methodologies, evaluations, applications and
+model designs. We discuss works on DRL based on two well-recognized
+definitions, i.e., the Intuitive Definition and the Group Theory
+Definition. We further categorize the methodologies for DRL into four main
+groups, i.e., Traditional Statistical Approaches, Variational Auto-encoder
+Based Approaches, Generative Adversarial Networks Based Approaches, and
+Hierarchical Approaches, along with Other Approaches. We also analyze
+principles to design different DRL models that may benefit different tasks
+in practical applications. Finally, we point out challenges in DRL as well
+as potential research directions deserving future investigation. We believe
+this work may provide insights for promoting DRL research in the community.
+
+
+ comment: 26 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Black Box Few-Shot Adaptation for Vision-Language models ICCV 2023 + + +
+ Vision-Language (V-L) models trained with contrastive learning to align
+the visual and language modalities have been shown to be strong few-shot
+learners. Soft prompt learning is the method of choice for few-shot
+downstream adaptation, aiming to bridge the modality gap caused by the
+distribution shift induced by the new domain. While parameter-efficient,
+prompt learning still requires access to the model weights and can be
+computationally infeasible for large models with billions of parameters. To
+address these shortcomings, in this work, we describe a black-box method
+for V-L few-shot adaptation that (a) operates on pre-computed image and
+text features and hence works without access to the model's weights, (b) is
+orders of magnitude faster at training time, (c) is amenable to both
+supervised and unsupervised training, and (d) can even be used to align
+image and text features computed from uni-modal models. To achieve this, we
+propose Linear Feature Alignment (LFA), a simple linear approach for V-L
+re-alignment in the target domain. LFA is initialized from a closed-form
+solution to a least-squares problem and then iteratively updated by
+minimizing a re-ranking loss. Despite its simplicity, our approach can even
+surpass soft-prompt learning methods, as shown by extensive experiments on
+11 image and 2 video datasets.
+
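+ A minimal sketch of the closed-form initialization described above
+(assumed details): a linear map W that aligns pre-computed image features
+with the text features of their classes via least squares, requiring no
+access to the model weights.
+
+import numpy as np
+
+def lfa_init(X, T, labels):
+    # X: (n, d) image features, T: (k, d) class text features, labels: (n,)
+    targets = T[labels]                               # target text feature per image
+    W, *_ = np.linalg.lstsq(X, targets, rcond=None)   # closed-form least squares
+    return W                                          # (d, d) alignment matrix
+
+def predict(X, T, W):
+    # classify by the most similar class text feature in the aligned space
+    return (X @ W @ T.T).argmax(axis=1)
+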
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ A Distributionally Robust Approach to Regret Optimal Control using the + Wasserstein Distance + + +
+ This paper proposes a distributionally robust approach to regret optimal +control of discrete-time linear dynamical systems with quadratic costs subject +to a stochastic additive disturbance on the state process. The underlying +probability distribution of the disturbance process is unknown, but assumed to +lie in a given ball of distributions defined in terms of the type-2 Wasserstein +distance. In this framework, strictly causal linear disturbance feedback +controllers are designed to minimize the worst-case expected regret. The regret +incurred by a controller is defined as the difference between the cost it +incurs in response to a realization of the disturbance process and the cost +incurred by the optimal noncausal controller which has perfect knowledge of the +disturbance process realization at the outset. Building on a well-established +duality theory for optimal transport problems, we derive a reformulation of the +minimax regret optimal control problem as a tractable semidefinite program. +Using the equivalent dual reformulation, we characterize a worst-case +distribution achieving the worst-case expected regret in relation to the +distribution at the center of the Wasserstein ball. We compare the minimax +regret optimal control design method with the distributionally robust optimal +control approach using an illustrative example and numerical experiments. + +
+
+ comment: 8 pages, 3 figures, to appear in the proceedings of the 2023 IEEE + Conference on Decision and Control (CDC) +
+
+
+
+
+ + ♻ ☆ RD-DPP: Rate-Distortion Theory Meets Determinantal Point Process to + Diversify Learning Data Samples + + +
+ In some practical learning tasks, such as traffic video analysis, the number +of available training samples is restricted by different factors, such as +limited communication bandwidth and computation power. Determinantal Point +Process (DPP) is a common method for selecting the most diverse samples to +enhance learning quality. However, the number of selected samples is restricted +to the rank of the kernel matrix implied by the dimensionality of data samples. +Secondly, it is not easily customizable to different learning tasks. In this +paper, we propose a new way of measuring task-oriented diversity based on the +Rate-Distortion (RD) theory, appropriate for multi-level classification. To +this end, we establish a fundamental relationship between DPP and RD theory. We +observe that the upper bound of the diversity of data selected by DPP has a +universal trend of $\textit{phase transition}$, which suggests that DPP is +beneficial only at the beginning of sample accumulation. This led to the design +of a bi-modal method, where RD-DPP is used in the first mode to select initial +data samples, then classification inconsistency (as an uncertainty measure) is +used to select the subsequent samples in the second mode. This phase transition +solves the limitation to the rank of the similarity matrix. Applying our method +to six different datasets and five benchmark models suggests that our method +consistently outperforms random selection, DPP-based methods, and alternatives +like uncertainty-based and coreset methods under all sampling budgets, while +exhibiting high generalizability to different learning tasks. + +
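+ As an illustrative aside (not the RD-DPP method itself), diversity-driven
+sample selection with a DPP-style objective can be sketched as greedy
+log-determinant maximization over a similarity kernel:
+
+import numpy as np
+
+def greedy_dpp(K, budget):
+    # K: (n, n) positive semi-definite similarity kernel
+    selected = []
+    for _ in range(budget):
+        best, best_val = None, -np.inf
+        for i in range(K.shape[0]):
+            if i in selected:
+                continue
+            idx = selected + [i]
+            val = np.linalg.slogdet(K[np.ix_(idx, idx)])[1]  # log-det of submatrix
+            if val > best_val:
+                best, best_val = i, val
+        selected.append(best)
+    return selected
+
+X = np.random.randn(50, 5)
+K = np.exp(-((X[:, None] - X[None]) ** 2).sum(-1))  # Gaussian similarity
+print(greedy_dpp(K, budget=5))
+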
+
+
+
+
+ + ♻ ☆ EfficientTrain: Exploring Generalized Curriculum Learning for Training + Visual Backbones ICCV 2023 + + +
+ The superior performance of modern deep networks usually comes with a costly +training procedure. This paper presents a new curriculum learning approach for +the efficient training of visual backbones (e.g., vision Transformers). Our +work is inspired by the inherent learning dynamics of deep networks: we +experimentally show that at an earlier training stage, the model mainly learns +to recognize some 'easier-to-learn' discriminative patterns within each +example, e.g., the lower-frequency components of images and the original +information before data augmentation. Driven by this phenomenon, we propose a +curriculum where the model always leverages all the training data at each +epoch, while the curriculum starts with only exposing the 'easier-to-learn' +patterns of each example, and introduces gradually more difficult patterns. To +implement this idea, we 1) introduce a cropping operation in the Fourier +spectrum of the inputs, which enables the model to learn from only the +lower-frequency components efficiently, 2) demonstrate that exposing the +features of original images amounts to adopting weaker data augmentation, and +3) integrate 1) and 2) and design a curriculum learning schedule with a +greedy-search algorithm. The resulting approach, EfficientTrain, is simple, +general, yet surprisingly effective. As an off-the-shelf method, it reduces the +wall-time training cost of a wide variety of popular models (e.g., ResNet, +ConvNeXt, DeiT, PVT, Swin, and CSWin) by >1.5x on ImageNet-1K/22K without +sacrificing accuracy. It is also effective for self-supervised learning (e.g., +MAE). Code is available at https://github.com/LeapLabTHU/EfficientTrain. + +
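+ A minimal sketch (assumed details) of the low-frequency cropping
+operation: only a centered window of the Fourier spectrum is kept, yielding
+a smaller image that preserves the 'easier-to-learn' low-frequency content.
+
+import torch
+
+def low_freq_crop(img, B):
+    # img: (C, H, W) tensor, B: size of the retained low-frequency band
+    spec = torch.fft.fftshift(torch.fft.fft2(img), dim=(-2, -1))
+    H, W = img.shape[-2:]
+    top, left = (H - B) // 2, (W - B) // 2
+    crop = spec[..., top:top + B, left:left + B]   # central low-frequency window
+    out = torch.fft.ifft2(torch.fft.ifftshift(crop, dim=(-2, -1)))
+    return out.real                                # (C, B, B) low-frequency image
+
+x = torch.randn(3, 224, 224)
+print(low_freq_crop(x, 160).shape)   # torch.Size([3, 160, 160])
+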
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ QBSD: Quartile-Based Seasonality Decomposition for Cost-Effective Time + Series Forecasting + + +
+ In the telecom domain, precise forecasting of time series patterns, such as +cell key performance indicators (KPIs), plays a pivotal role in enhancing +service quality and operational efficiency. State-of-the-art forecasting +approaches prioritize forecasting accuracy at the expense of computational +performance, rendering them less suitable for data-intensive applications +encompassing systems with a multitude of time series variables. To address this +issue, we introduce QBSD, a live forecasting approach tailored to optimize the +trade-off between accuracy and computational complexity. We have evaluated the +performance of QBSD against state-of-the-art forecasting approaches on publicly +available datasets. We have also extended this investigation to our curated +network KPI dataset, now publicly accessible, to showcase the effect of dynamic +operating ranges that varies with time. The results demonstrate that the +proposed method excels in runtime efficiency compared to the leading algorithms +available while maintaining competitive forecast accuracy. + +
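+ The following is an illustrative sketch of a quartile-based seasonal
+forecaster in the spirit described above (assumed form, not the QBSD
+algorithm itself): for each position in the seasonal cycle it predicts the
+historical median and reports the interquartile range as an operating band.
+
+import numpy as np
+
+def quartile_seasonal_forecast(y, period, horizon):
+    # y: 1-D history, period: seasonal period, horizon: steps to forecast
+    phases = np.arange(len(y)) % period
+    med, lo, hi = [], [], []
+    for h in range(horizon):
+        hist = y[phases == (len(y) + h) % period]    # same phase of the cycle
+        q1, q2, q3 = np.percentile(hist, [25, 50, 75])
+        med.append(q2); lo.append(q1); hi.append(q3)
+    return np.array(med), np.array(lo), np.array(hi)
+
+t = np.arange(24 * 14)                               # two weeks of hourly KPI
+y = 10 + 5 * np.sin(2 * np.pi * t / 24) + np.random.randn(t.size)
+print(quartile_seasonal_forecast(y, period=24, horizon=6)[0])
+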
+
+
+
+
+ + ♻ ☆ Latent Dynamical Implicit Diffusion Processes + + +
+ Latent dynamical models are commonly used to learn the distribution of a +latent dynamical process that represents a sequence of noisy data samples. +However, producing samples from such models with high fidelity is challenging +due to the complexity and variability of latent and observation dynamics. +Recent advances in diffusion-based generative models, such as DDPM and NCSN, +have shown promising alternatives to state-of-the-art latent generative models, +such as Neural ODEs, RNNs, and Normalizing flow networks, for generating +high-quality sequential samples from a prior distribution. However, their +application in modeling sequential data with latent dynamical models is yet to +be explored. Here, we propose a novel latent variable model named latent +dynamical implicit diffusion processes (LDIDPs), which utilizes implicit +diffusion processes to sample from dynamical latent processes and generate +sequential observation samples accordingly. We tested LDIDPs on synthetic and +simulated neural decoding problems. We demonstrate that LDIDPs can accurately +learn the dynamics over latent dimensions. Furthermore, the implicit sampling +method allows for the computationally efficient generation of high-quality +sequential data samples from the latent and observation spaces. + +
+
+          comment: I request a withdrawal because there are no experiments with
+ real-world datasets and the method section also requires major changes to be
+ mathematically sound
+
+
+
+
+ + ♻ ☆ LLM Cognitive Judgements Differ From Human + + +
+ Large Language Models (LLMs) have lately been in the spotlight of
+researchers, businesses, and consumers alike. While the linguistic
+capabilities of such models have been studied extensively, there is growing
+interest in investigating them as cognitive subjects. In the present work I
+examine the capabilities of GPT-3 and ChatGPT on a limited-data inductive
+reasoning task from the cognitive science literature. The results suggest
+that these models' cognitive judgements are not human-like.
+
+
+
+ comment: 7 pages, 1 figure. License changed to CC BY-NC-SA +
+
+
+
+
+ + ♻ ☆ Bluetooth and WiFi Dataset for Real World RF Fingerprinting of + Commercial Devices + + +
+ RF fingerprinting is emerging as a physical layer security scheme to
+identify illegitimate and/or unauthorized emitters sharing the RF spectrum.
+However, due to the lack of publicly accessible real-world datasets, most
+research focuses on generating synthetic waveforms with software-defined
+radios (SDRs), which are not suited for practical deployment settings. On the
+other hand, the limited datasets that are available focus only on chipsets
+that generate a single kind of waveform. Commercial off-the-shelf (COTS)
+combo chipsets that support two wireless standards (for example WiFi and
+Bluetooth) over a shared dual-band antenna, such as those found in laptops,
+adapters, wireless chargers, and Raspberry Pis, are becoming ubiquitous in
+the IoT realm. Hence, to keep up with the modern IoT environment, there is a
+pressing need for real-world open datasets capturing emissions from these
+combo chipsets transmitting heterogeneous communication protocols. To this
+end, we capture the first known emissions from COTS IoT chipsets transmitting
+WiFi and Bluetooth under two different time frames. The different time frames
+are essential to rigorously evaluate the generalization capability of the
+models. To ensure widespread use, each capture within the comprehensive 72 GB
+dataset is long enough (40 MSamples) to support diverse input tensor lengths
+and formats. Finally, the dataset also comprises emissions at varying signal
+powers to account for the feeble-to-high signal strength emissions
+encountered in a real-world setting.
+
+
+
+ comment: Revision Under Review +
+
+
+
+
+ + ♻ ☆ Deep Unrolling Networks with Recurrent Momentum Acceleration for + Nonlinear Inverse Problems + + +
+ Combining the strengths of model-based iterative algorithms and data-driven +deep learning solutions, deep unrolling networks (DuNets) have become a popular +tool to solve inverse imaging problems. While DuNets have been successfully +applied to many linear inverse problems, nonlinear problems tend to impair the +performance of the method. Inspired by momentum acceleration techniques that +are often used in optimization algorithms, we propose a recurrent momentum +acceleration (RMA) framework that uses a long short-term memory recurrent +neural network (LSTM-RNN) to simulate the momentum acceleration process. The +RMA module leverages the ability of the LSTM-RNN to learn and retain knowledge +from the previous gradients. We apply RMA to two popular DuNets -- the learned +proximal gradient descent (LPGD) and the learned primal-dual (LPD) methods, +resulting in LPGD-RMA and LPD-RMA respectively. We provide experimental results +on two nonlinear inverse problems: a nonlinear deconvolution problem, and an +electrical impedance tomography problem with limited boundary measurements. In +the first experiment we have observed that the improvement due to RMA largely +increases with respect to the nonlinearity of the problem. The results of the +second example further demonstrate that the RMA schemes can significantly +improve the performance of DuNets in strongly ill-posed problems. + +
+
+
+
+
+ + ♻ ☆ Echoes: Unsupervised Debiasing via Pseudo-bias Labeling in an Echo + Chamber + + +
+ Neural networks often learn spurious correlations when exposed to biased
+training data, leading to poor performance on out-of-distribution data. A
+biased dataset can be divided, according to biased features, into
+bias-aligned samples (i.e., with biased features) and bias-conflicting
+samples (i.e., without biased features). Recent debiasing works typically
+assume that no bias label is available during the training phase, as
+obtaining such information is challenging and labor-intensive. Following this
+unsupervised assumption, existing methods usually train two models: a biased
+model specialized to learn biased features and a target model that uses
+information from the biased model for debiasing. This paper first presents
+experimental analyses revealing that existing biased models overfit to
+bias-conflicting samples in the training data, which negatively impacts the
+debiasing performance of the target models. To address this issue, we propose
+a straightforward and effective method called Echoes, which trains a biased
+model and a target model with different strategies. We construct an "echo
+chamber" environment by reducing the weights of samples that are
+misclassified by the biased model, to ensure the biased model fully learns
+the biased features without overfitting to the bias-conflicting samples. The
+biased model then assigns lower weights to the bias-conflicting samples.
+Subsequently, we use the inverse of the sample weights of the biased model
+for training the target model. Experiments show that our approach achieves
+superior debiasing results compared to the existing baselines on both
+synthetic and real-world datasets. Our code is available at
+https://github.com/isruihu/Echoes.
+
+
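+
+ A small sketch of the reweighting described above (illustrative only, not
+ the released Echoes code; the decay factor and normalization are
+ assumptions):
+
+```python
+import numpy as np
+
+def update_bias_weights(weights: np.ndarray, biased_correct: np.ndarray,
+                        decay: float = 0.7) -> np.ndarray:
+    """Echo-chamber update: samples the biased model misclassifies are
+    treated as bias-conflicting, so their weight is decayed; the rest keep
+    their weight. `biased_correct` is a boolean array, one entry per sample."""
+    new_w = weights.copy()
+    new_w[~biased_correct] *= decay  # shrink bias-conflicting samples
+    return new_w
+
+def target_weights(bias_weights: np.ndarray, eps: float = 1e-6) -> np.ndarray:
+    """The target model up-weights exactly the samples the biased model
+    down-weighted, via the inverse of the biased model's sample weights."""
+    inv = 1.0 / (bias_weights + eps)
+    return inv / inv.mean()  # normalize to mean weight 1
+```
+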
+
+ comment: Accepted by ACM Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ SpecInfer: Accelerating Generative Large Language Model Serving with + Speculative Inference and Token Tree Verification + + +
+ The high computational and memory requirements of generative large language
+models (LLMs) make it challenging to serve them quickly and cheaply. This
+paper introduces SpecInfer, an LLM serving system that accelerates generative
+LLM inference with speculative inference and token tree verification. A key
+insight behind SpecInfer is to combine various collectively boost-tuned small
+language models to jointly predict the LLM's outputs; the predictions are
+organized as a token tree, whose nodes each represent a candidate token
+sequence. The correctness of all candidate token sequences represented by a
+token tree is verified against the LLM in parallel using a novel tree-based
+parallel decoding mechanism. SpecInfer uses an LLM as a token tree verifier
+instead of an incremental decoder, which significantly reduces the end-to-end
+latency and computational requirement for serving generative LLMs while
+provably preserving model quality. Our evaluation shows that SpecInfer
+outperforms existing LLM serving systems by 1.3-2.4x for distributed LLM
+inference and by 2.6-3.5x for offloading-based LLM inference, while
+preserving the same generative performance. SpecInfer is publicly available
+at https://github.com/flexflow/FlexFlow/tree/inference.
+
+
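+
+ To make the verification step concrete, here is a heavily simplified sketch:
+ a single candidate sequence with greedy acceptance, rather than SpecInfer's
+ token tree and tree-based parallel decoding. `target_logits` is a
+ hypothetical callable standing in for one forward pass of the large model:
+
+```python
+import numpy as np
+
+def verify_candidate(prefix, candidate, target_logits):
+    """Accept the longest prefix of `candidate` that the target LLM would
+    itself emit under greedy decoding, then append the target's own token at
+    the first disagreement. `target_logits(tokens)` returns an array of shape
+    (len(tokens), vocab_size): next-token logits at every position, computed
+    in a single forward pass."""
+    logits = target_logits(prefix + candidate)
+    accepted = []
+    for i, tok in enumerate(candidate):
+        pos = len(prefix) + i - 1  # logits at `pos` predict this token
+        best = int(np.argmax(logits[pos]))
+        if best == tok:
+            accepted.append(tok)
+        else:
+            accepted.append(best)  # fall back to the target model's choice
+            break
+    return accepted
+```
+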
+
+
+
+
+ + ♻ ☆ How does over-squashing affect the power of GNNs? + + +
+ Graph Neural Networks (GNNs) are the state-of-the-art model for machine +learning on graph-structured data. The most popular class of GNNs operate by +exchanging information between adjacent nodes, and are known as Message Passing +Neural Networks (MPNNs). Given their widespread use, understanding the +expressive power of MPNNs is a key question. However, existing results +typically consider settings with uninformative node features. In this paper, we +provide a rigorous analysis to determine which function classes of node +features can be learned by an MPNN of a given capacity. We do so by measuring +the level of pairwise interactions between nodes that MPNNs allow for. This +measure provides a novel quantitative characterization of the so-called +over-squashing effect, which is observed to occur when a large volume of +messages is aggregated into fixed-size vectors. Using our measure, we prove +that, to guarantee sufficient communication between pairs of nodes, the +capacity of the MPNN must be large enough, depending on properties of the input +graph structure, such as commute times. For many relevant scenarios, our +analysis results in impossibility statements in practice, showing that +over-squashing hinders the expressive power of MPNNs. We validate our +theoretical findings through extensive controlled experiments and ablation +studies. + +
+
+ comment: 37 pages +
+
+
+
+
+ + ♻ ☆ Bi-level Contrastive Learning for Knowledge-Enhanced Molecule + Representations + + +
+ Molecule representation learning underpins diverse downstream applications
+such as molecular property and side effect understanding and prediction. In
+this paper, we recognize the two-level structure of an individual molecule as
+having an intrinsic graph structure as well as being a node in a large
+molecule knowledge graph, and present GODE, a new approach that seamlessly
+integrates graph representations of individual molecules with multi-domain
+biomedical data from knowledge graphs. By pre-training two graph neural
+networks (GNNs) on different graph structures, combined with contrastive
+learning, GODE adeptly fuses molecular structures with their corresponding
+knowledge graph substructures. This fusion results in a more robust and
+informative representation, enhancing molecular property prediction by
+harnessing both chemical and biological information. Fine-tuned on 11
+chemical property tasks, our model surpasses benchmarks, achieving average
+ROC-AUC improvements of 14.5%, 9.8%, and 7.3% on the BBBP, SIDER, and Tox21
+datasets. In regression tasks on the ESOL and QM7 datasets, we achieve
+average improvements of 21.0% and 29.6% in RMSE and MAE, setting a new
+benchmark in the field.
+
+
+
+
+
+
+ + ♻ ☆ Text-only domain adaptation for end-to-end ASR using integrated + text-to-mel-spectrogram generator INTERSPEECH 2023 + + +
+ We propose an end-to-end Automatic Speech Recognition (ASR) system that can +be trained on transcribed speech data, text-only data, or a mixture of both. +The proposed model uses an integrated auxiliary block for text-based training. +This block combines a non-autoregressive multi-speaker text-to-mel-spectrogram +generator with a GAN-based enhancer to improve the spectrogram quality. The +proposed system can generate a mel-spectrogram dynamically during training. It +can be used to adapt the ASR model to a new domain by using text-only data from +this domain. We demonstrate that the proposed training method significantly +improves ASR accuracy compared to the system trained on transcribed speech +only. It also surpasses cascade TTS systems with the vocoder in the adaptation +quality and training speed. + +
+
+ comment: Accepted to INTERSPEECH 2023 +
+
+
+
+
+ + ♻ ☆ An ensemble of VisNet, Transformer-M, and pretraining models for + molecular property prediction in OGB Large-Scale Challenge @ NeurIPS 2022 + + +
+ In this technical report, we provide our solution for the OGB-LSC 2022 Graph
+Regression Task. The target of this task is to predict the quantum chemical
+property, the HOMO-LUMO gap, for a given molecule on the PCQM4Mv2 dataset. In
+the competition, we designed two kinds of models: Transformer-M-ViSNet, a
+geometry-enhanced graph neural network for fully connected molecular graphs,
+and Pretrained-3D-ViSNet, a pretrained ViSNet obtained by distilling
+geometric information from optimized structures. With an ensemble of 22
+models, the ViSNet Team achieved an MAE of 0.0723 eV on the test-challenge
+set, dramatically reducing the error by 39.75% compared with the best method
+in last year's competition.
+
+
+
+
+
+
+ + ♻ ☆ Unsafe Diffusion: On the Generation of Unsafe Images and Hateful Memes + From Text-To-Image Models + + +
+ State-of-the-art Text-to-Image models like Stable Diffusion and DALLE$\cdot$2 +are revolutionizing how people generate visual content. At the same time, +society has serious concerns about how adversaries can exploit such models to +generate unsafe images. In this work, we focus on demystifying the generation +of unsafe images and hateful memes from Text-to-Image models. We first +construct a typology of unsafe images consisting of five categories (sexually +explicit, violent, disturbing, hateful, and political). Then, we assess the +proportion of unsafe images generated by four advanced Text-to-Image models +using four prompt datasets. We find that these models can generate a +substantial percentage of unsafe images; across four models and four prompt +datasets, 14.56% of all generated images are unsafe. When comparing the four +models, we find different risk levels, with Stable Diffusion being the most +prone to generating unsafe content (18.92% of all generated images are unsafe). +Given Stable Diffusion's tendency to generate more unsafe content, we evaluate +its potential to generate hateful meme variants if exploited by an adversary to +attack a specific individual or community. We employ three image editing +methods, DreamBooth, Textual Inversion, and SDEdit, which are supported by +Stable Diffusion. Our evaluation result shows that 24% of the generated images +using DreamBooth are hateful meme variants that present the features of the +original hateful meme and the target individual/community; these generated +images are comparable to hateful meme variants collected from the real world. +Overall, our results demonstrate that the danger of large-scale generation of +unsafe images is imminent. We discuss several mitigating measures, such as +curating training data, regulating prompts, and implementing safety filters, +and encourage better safeguard tools to be developed to prevent unsafe +generation. + +
+
+ comment: To Appear in the ACM Conference on Computer and Communications + Security, November 26, 2023 +
+
+
+
+
+ + ♻ ☆ 3D-aware Blending with Generative NeRFs ICCV 2023 + + +
+ Image blending aims to combine multiple images seamlessly. It remains +challenging for existing 2D-based methods, especially when input images are +misaligned due to differences in 3D camera poses and object shapes. To tackle +these issues, we propose a 3D-aware blending method using generative Neural +Radiance Fields (NeRF), including two key components: 3D-aware alignment and +3D-aware blending. For 3D-aware alignment, we first estimate the camera pose of +the reference image with respect to generative NeRFs and then perform 3D local +alignment for each part. To further leverage 3D information of the generative +NeRF, we propose 3D-aware blending that directly blends images on the NeRF's +latent representation space, rather than raw pixel space. Collectively, our +method outperforms existing 2D baselines, as validated by extensive +quantitative and qualitative evaluations with FFHQ and AFHQ-Cat. + +
+
+ comment: ICCV 2023, Project page: https://blandocs.github.io/blendnerf +
+
+
+
+
+ + ♻ ☆ Active Learning for Optimal Intervention Design in Causal Models + + +
+ Sequential experimental design to discover interventions that achieve a +desired outcome is a key problem in various domains including science, +engineering and public policy. When the space of possible interventions is +large, making an exhaustive search infeasible, experimental design strategies +are needed. In this context, encoding the causal relationships between the +variables, and thus the effect of interventions on the system, is critical for +identifying desirable interventions more efficiently. Here, we develop a causal +active learning strategy to identify interventions that are optimal, as +measured by the discrepancy between the post-interventional mean of the +distribution and a desired target mean. The approach employs a Bayesian update +for the causal model and prioritizes interventions using a carefully designed, +causally informed acquisition function. This acquisition function is evaluated +in closed form, allowing for fast optimization. The resulting algorithms are +theoretically grounded with information-theoretic bounds and provable +consistency results for linear causal models with known causal graph. We apply +our approach to both synthetic data and single-cell transcriptomic data from +Perturb-CITE-seq experiments to identify optimal perturbations that induce a +specific cell state transition. The causally informed acquisition function +generally outperforms existing criteria allowing for optimal intervention +design with fewer but carefully selected samples. + +
+
+
+
+
+ + ♻ ☆ Editing Language Model-based Knowledge Graph Embeddings + + +
+ Recent decades have witnessed the empirical success of framing Knowledge
+Graph (KG) embeddings via language models. However, language model-based KG
+embeddings are usually deployed as static artifacts, making them difficult to
+modify after deployment without re-training. To address this issue, we
+propose a new task of editing language model-based KG embeddings in this
+paper. This task is designed to facilitate rapid, data-efficient updates to
+KG embeddings without compromising performance on other aspects. We build
+four new datasets: E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and
+evaluate several knowledge editing baselines, demonstrating the limited
+ability of previous models to handle this challenging task. We further
+propose a simple yet strong baseline dubbed KGEditor, which utilizes
+additional parametric layers of a hypernetwork to edit/add facts. Our
+comprehensive experimental results reveal that KGEditor excels in updating
+specific facts without impacting the overall performance, even when faced
+with limited training resources. Code and datasets are available at
+https://github.com/zjunlp/PromptKG/tree/main/deltaKG.
+
+
+
+ comment: Work in progress and the project website is + https://zjunlp.github.io/project/KGE_Editing/ +
+
+
+
+
+ + ♻ ☆ Neural radiance fields in the industrial and robotics domain: + applications, research opportunities and use cases + + +
+ The proliferation of technologies, such as extended reality (XR), has
+increased the demand for high-quality three-dimensional (3D) graphical
+representations. Industrial 3D applications encompass computer-aided design
+(CAD), finite element analysis (FEA), scanning, and robotics. However,
+current methods employed for industrial 3D representations suffer from high
+implementation costs and reliance on manual human input for accurate 3D
+modeling. To address these challenges, neural radiance fields (NeRFs) have
+emerged as a promising approach for learning 3D scene representations based
+on provided training 2D images. Despite a growing interest in NeRFs, their
+potential applications in various industrial subdomains are still
+unexplored. In this paper, we deliver a comprehensive examination of NeRF
+industrial applications while also providing direction for future research
+endeavors. We also present a series of proof-of-concept experiments that
+demonstrate the potential of NeRFs in the industrial domain. These
+experiments include NeRF-based video compression techniques and using NeRFs
+for 3D motion estimation in the context of collision avoidance. In the video
+compression experiment, our results show compression savings of up to 48\%
+and 74\% for resolutions of 1920x1080 and 300x168, respectively. The motion
+estimation experiment used a 3D animation of a robotic arm to train
+Dynamic-NeRF (D-NeRF) and achieved an average peak signal-to-noise ratio
+(PSNR) of 23 dB on the disparity maps and a structural similarity index
+measure (SSIM) of 0.97.
+
+
+
+
+
+
+ + ♻ ☆ STS-GAN: Can We Synthesize Solid Texture with High Fidelity from + Arbitrary 2D Exemplar? + + +
+ Solid texture synthesis (STS), an effective way to extend a 2D exemplar to a +3D solid volume, exhibits advantages in computational photography. However, +existing methods generally fail to accurately learn arbitrary textures, which +may result in the failure to synthesize solid textures with high fidelity. In +this paper, we propose a novel generative adversarial nets-based framework +(STS-GAN) to extend the given 2D exemplar to arbitrary 3D solid textures. In +STS-GAN, multi-scale 2D texture discriminators evaluate the similarity between +the given 2D exemplar and slices from the generated 3D texture, promoting the +3D texture generator synthesizing realistic solid textures. Finally, +experiments demonstrate that the proposed method can generate high-fidelity +solid textures with similar visual characteristics to the 2D exemplar. + +
+
+
+
+
+ + ♻ ☆ Relevant Entity Selection: Knowledge Graph Bootstrapping via Zero-Shot + Analogical Pruning + + +
+ Knowledge Graph Construction (KGC) can be seen as an iterative process +starting from a high quality nucleus that is refined by knowledge extraction +approaches in a virtuous loop. Such a nucleus can be obtained from knowledge +existing in an open KG like Wikidata. However, due to the size of such generic +KGs, integrating them as a whole may entail irrelevant content and scalability +issues. We propose an analogy-based approach that starts from seed entities of +interest in a generic KG, and keeps or prunes their neighboring entities. We +evaluate our approach on Wikidata through two manually labeled datasets that +contain either domain-homogeneous or -heterogeneous seed entities. We +empirically show that our analogy-based approach outperforms LSTM, Random +Forest, SVM, and MLP, with a drastically lower number of parameters. We also +evaluate its generalization potential in a transfer learning setting. These +results advocate for the further integration of analogy-based inference in +tasks related to the KG lifecycle. + +
+
+
+
+
+ + ♻ ☆ Beyond the Meta: Leveraging Game Design Parameters for Patch-Agnostic + Esport Analytics + + +
+ Esport games comprise a sizeable fraction of the global games market and are
+the fastest growing segment in games. This has given rise to the domain of
+esports analytics, which uses telemetry data from games to inform players,
+coaches, broadcasters and other stakeholders. Compared to traditional sports,
+esport titles change rapidly, in terms of mechanics as well as rules. Due to
+these frequent changes to the parameters of the game, esport analytics models
+can have a short life-span, a problem which is largely ignored within the
+literature. This paper extracts information from game design (i.e. patch
+notes) and utilises clustering techniques to propose a new form of character
+representation. As a case study, a neural network model is trained to predict
+the number of kills in a Dota 2 match utilising this novel character
+representation technique. The performance of this model is then evaluated
+against two distinct baselines, including conventional techniques. Not only
+did the model significantly outperform the baselines in terms of accuracy
+(85% AUC), but it also maintained its accuracy in two newer iterations of the
+game that introduced one new character and a brand new character type. Such
+changes to the design of the game would typically break conventional
+techniques that are commonly used within the literature. Therefore, the
+proposed methodology for representing characters can increase the life-span
+of machine learning models as well as contribute to higher performance when
+compared to traditional techniques typically employed within the literature.
+
+
+
+
+
+
+ + ♻ ☆ Beyond Individual Input for Deep Anomaly Detection on Tabular Data + + +
+ Anomaly detection is crucial in various domains, such as finance,
+healthcare, and cybersecurity. In this paper, we propose a novel deep anomaly
+detection method for tabular data that leverages Non-Parametric Transformers
+(NPTs), a model initially proposed for supervised tasks, to capture both
+feature-feature and sample-sample dependencies. In a reconstruction-based
+framework, we train the NPT to reconstruct masked features of normal samples.
+In a non-parametric fashion, we leverage the whole training set during
+inference and use the model's ability to reconstruct the masked features to
+generate an anomaly score. To the best of our knowledge, our proposed method
+is the first to successfully combine feature-feature and sample-sample
+dependencies for anomaly detection on tabular datasets. We evaluate our
+method on an extensive benchmark of 31 tabular datasets and demonstrate that
+our approach outperforms existing state-of-the-art methods based on the
+F1-score and AUROC by a significant margin.
+
+
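+
+ A sketch of the scoring step described above: mask each feature in turn, let
+ the already-trained reconstruction model fill it back in while attending
+ over the training set, and aggregate the reconstruction error into one
+ anomaly score per sample. `reconstruct` is a placeholder for the trained
+ NPT, and masking one feature at a time is an illustrative simplification:
+
+```python
+import numpy as np
+
+def anomaly_scores(x_test, x_train, reconstruct, mask_value=0.0):
+    """x_test: (n, d) samples to score; x_train: (m, d) normal samples kept
+    available at inference so the non-parametric model can attend over them.
+    reconstruct(batch, train_set, mask) -> (n, d) array with the masked
+    entries filled in. Returns one score per test sample (higher = more
+    anomalous)."""
+    n, d = x_test.shape
+    errors = np.zeros((n, d))
+    for j in range(d):
+        masked = x_test.copy()
+        masked[:, j] = mask_value
+        mask = np.zeros((n, d), dtype=bool)
+        mask[:, j] = True
+        recon = reconstruct(masked, x_train, mask)
+        errors[:, j] = (recon[:, j] - x_test[:, j]) ** 2
+    return errors.mean(axis=1)
+```
+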
+
+
+
+
+ + ♻ ☆ Architecture-Preserving Provable Repair of Deep Neural Networks + + +
+ Deep neural networks (DNNs) are becoming increasingly important components
+of software, and are considered the state-of-the-art solution for a number of
+problems, such as image recognition. However, DNNs are far from infallible,
+and incorrect behavior of DNNs can have disastrous real-world consequences.
+This paper addresses the problem of architecture-preserving V-polytope
+provable repair of DNNs. A V-polytope defines a convex bounded polytope using
+its vertex representation. V-polytope provable repair guarantees that the
+repaired DNN satisfies the given specification on the infinite set of points
+in the given V-polytope. An architecture-preserving repair only modifies the
+parameters of the DNN, without modifying its architecture. The repair has the
+flexibility to modify multiple layers of the DNN, and runs in polynomial
+time. It supports DNNs with activation functions that have some linear
+pieces, as well as fully-connected, convolutional, pooling and residual
+layers. To the best of our knowledge, this is the first provable repair
+approach that has all of these features. We implement our approach in a tool
+called APRNN. Using MNIST, ImageNet, and ACAS Xu DNNs, we show that it has
+better efficiency, scalability, and generalization compared to PRDNN and
+REASSURE, prior provable repair methods that are not architecture preserving.
+
+
+
+ comment: Accepted paper at PLDI 2023. Tool is available at + https://github.com/95616ARG/APRNN/ +
+
+
+
+
+ + ♻ ☆ Are demographically invariant models and representations in medical + imaging fair? + + +
+ Medical imaging models have been shown to encode information about patient +demographics such as age, race, and sex in their latent representation, raising +concerns about their potential for discrimination. Here, we ask whether +requiring models not to encode demographic attributes is desirable. We point +out that marginal and class-conditional representation invariance imply the +standard group fairness notions of demographic parity and equalized odds, +respectively, while additionally requiring risk distribution matching, thus +potentially equalizing away important group differences. Enforcing the +traditional fairness notions directly instead does not entail these strong +constraints. Moreover, representationally invariant models may still take +demographic attributes into account for deriving predictions. The latter can be +prevented using counterfactual notions of (individual) fairness or invariance. +We caution, however, that properly defining medical image counterfactuals with +respect to demographic attributes is highly challenging. Finally, we posit that +encoding demographic attributes may even be advantageous if it enables learning +a task-specific encoding of demographic features that does not rely on social +constructs such as 'race' and 'gender.' We conclude that demographically +invariant representations are neither necessary nor sufficient for fairness in +medical imaging. Models may need to encode demographic attributes, lending +further urgency to calls for comprehensive model fairness assessments in terms +of predictive performance across diverse patient groups. + +
+
+
+
+
+ + ♻ ☆ SciRE-Solver: Efficient Sampling of Diffusion Probabilistic Models by + Score-integrand Solver with Recursive Derivative Estimation + + +
+ Diffusion probabilistic models (DPMs) are a powerful class of generative
+models known for their ability to generate high-fidelity image samples. A
+major challenge in the implementation of DPMs is the slow sampling process.
+In this work, we present a high-efficiency sampler for DPMs. Specifically, we
+propose a score-based exact solution paradigm for the diffusion ODEs
+corresponding to the sampling process of DPMs, which introduces a new
+perspective on developing numerical algorithms for solving diffusion ODEs. To
+achieve an efficient sampler, we propose a recursive derivative estimation
+(RDE) method to reduce the estimation error. With our proposed solution
+paradigm and RDE method, we propose the score-integrand solver with a
+convergence-order guarantee (SciRE-Solver) for solving diffusion ODEs. The
+SciRE-Solver attains state-of-the-art (SOTA) sampling performance with a
+limited number of score function evaluations (NFE) on both discrete-time and
+continuous-time DPMs in comparison to existing training-free sampling
+algorithms. For example, we achieve $3.48$ FID with $12$ NFE and $2.42$ FID
+with $20$ NFE for continuous-time DPMs on CIFAR10, respectively. Different
+from other samplers, SciRE-Solver has the promising potential to surpass the
+FIDs achieved in the original papers of some pre-trained models with a small
+number of NFEs. For example, we reach an SOTA value of $2.40$ FID with $100$
+NFE for continuous-time DPM and of $3.15$ FID with $84$ NFE for discrete-time
+DPM on CIFAR-10, as well as of $2.17$ ($2.02$) FID with $18$ ($50$) NFE for
+discrete-time DPM on CelebA 64$\times$64.
+
+
+
+
+
+
+ + ♻ ☆ Stochastic Constrained DRO with a Complexity Independent of Sample Size + + +
+ Distributionally Robust Optimization (DRO), as a popular method to train +robust models against distribution shift between training and test sets, has +received tremendous attention in recent years. In this paper, we propose and +analyze stochastic algorithms that apply to both non-convex and convex losses +for solving Kullback Leibler divergence constrained DRO problem. Compared with +existing methods solving this problem, our stochastic algorithms not only enjoy +competitive if not better complexity independent of sample size but also just +require a constant batch size at every iteration, which is more practical for +broad applications. We establish a nearly optimal complexity bound for finding +an $\epsilon$ stationary solution for non-convex losses and an optimal +complexity for finding an $\epsilon$ optimal solution for convex losses. +Empirical studies demonstrate the effectiveness of the proposed algorithms for +solving non-convex and convex constrained DRO problems. + +
+
+ comment: 37 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Non-linear Embeddings in Hilbert Simplex Geometry + + +
+ A key technique of machine learning and computer vision is to embed discrete +weighted graphs into continuous spaces for further downstream processing. +Embedding discrete hierarchical structures in hyperbolic geometry has proven +very successful since it was shown that any weighted tree can be embedded in +that geometry with arbitrary low distortion. Various optimization methods for +hyperbolic embeddings based on common models of hyperbolic geometry have been +studied. In this paper, we consider Hilbert geometry for the standard simplex +which is isometric to a vector space equipped with the variation polytope norm. +We study the representation power of this Hilbert simplex geometry by embedding +distance matrices of graphs. Our findings demonstrate that Hilbert simplex +geometry is competitive to alternative geometries such as the Poincar\'e +hyperbolic ball or the Euclidean geometry for embedding tasks while being fast +and numerically robust. + +
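+
+ For points in the interior of the standard simplex, the Hilbert metric has a
+ simple closed form: the difference between the largest and smallest
+ coordinate-wise log-ratios. A small sketch (independent of the authors'
+ code):
+
+```python
+import numpy as np
+
+def hilbert_simplex_distance(p, q, eps=1e-12):
+    """Hilbert metric between two points of the open standard simplex
+    (strictly positive coordinates summing to one)."""
+    p = np.asarray(p, dtype=float) + eps
+    q = np.asarray(q, dtype=float) + eps
+    log_ratio = np.log(p) - np.log(q)
+    return float(log_ratio.max() - log_ratio.min())
+
+# Example: distance between two categorical distributions on three outcomes.
+d = hilbert_simplex_distance([0.7, 0.2, 0.1], [0.3, 0.3, 0.4])
+```
+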
+
+ comment: 21 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using + Pre-trained Diffusion Models ICCV2023 + + +
+ Given a classifier, the inherent property of semantic Out-of-Distribution +(OOD) samples is that their contents differ from all legal classes in terms of +semantics, namely semantic mismatch. There is a recent work that directly +applies it to OOD detection, which employs a conditional Generative Adversarial +Network (cGAN) to enlarge semantic mismatch in the image space. While achieving +remarkable OOD detection performance on small datasets, it is not applicable to +ImageNet-scale datasets due to the difficulty in training cGANs with both input +images and labels as conditions. As diffusion models are much easier to train +and amenable to various conditions compared to cGANs, in this work, we propose +to directly use pre-trained diffusion models for semantic mismatch-guided OOD +detection, named DiffGuard. Specifically, given an OOD input image and the +predicted label from the classifier, we try to enlarge the semantic difference +between the reconstructed OOD image under these conditions and the original +input image. We also present several test-time techniques to further strengthen +such differences. Experimental results show that DiffGuard is effective on both +Cifar-10 and hard cases of the large-scale ImageNet, and it can be easily +combined with existing OOD detection techniques to achieve state-of-the-art OOD +detection results. + +
+
+ comment: Accepted by ICCV2023, with supplementary materials +
+
+
+
+
+ + ♻ ☆ Learning Ability of Interpolating Deep Convolutional Neural Networks + + +
+ It is frequently observed that overparameterized neural networks generalize
+well. Regarding such phenomena, existing theoretical work is mainly devoted
+to linear settings or fully-connected neural networks. This paper studies the
+learning ability of an important family of deep neural networks, deep
+convolutional neural networks (DCNNs), under both underparameterized and
+overparameterized settings. We establish the first learning rates of
+underparameterized DCNNs without the parameter or function variable structure
+restrictions presented in the literature. We also show that by adding
+well-defined layers to a non-interpolating DCNN, we can obtain some
+interpolating DCNNs that maintain the good learning rates of the
+non-interpolating DCNN. This result is achieved by a novel network deepening
+scheme designed for DCNNs. Our work provides theoretical verification of how
+overfitted DCNNs generalize well.
+
+
+
+
+
+
+ + ♻ ☆ A Scalable Test Problem Generator for Sequential Transfer Optimization + + +
+ Sequential transfer optimization (STO), which aims to improve the +optimization performance on a task at hand by exploiting the knowledge captured +from several previously-solved optimization tasks stored in a database, has +been gaining increasing research attention over the years. However, despite +remarkable advances in algorithm design, the development of a systematic +benchmark suite for comprehensive comparisons of STO algorithms received far +less attention. Existing test problems are either simply generated by +assembling other benchmark functions or extended from specific practical +problems with limited variations. The relationships between the optimal +solutions of the source and target tasks in these problems are always manually +configured, limiting their ability to model different relationships presented +in real-world problems. Consequently, the good performance achieved by an +algorithm on these problems might be biased and could not be generalized to +other problems. In light of the above, in this study, we first introduce four +rudimentary concepts for characterizing STO problems (STOPs) and present an +important problem feature, namely similarity distribution, which quantitatively +delineates the relationship between the optima of the source and target tasks. +Then, we propose the general design guidelines and a problem generator with +superior scalability. Specifically, the similarity distribution of an STOP can +be easily customized, enabling a continuous spectrum of representation of the +diverse similarity relationships of real-world problems. Lastly, a benchmark +suite with 12 STOPs featured by a variety of customized similarity +relationships is developed using the proposed generator, which would serve as +an arena for STO algorithms and provide more comprehensive evaluation results. +The source code of the problem generator is available at +https://github.com/XmingHsueh/STOP-G. + +
+
+
+
+
+ + ♻ ☆ Capturing the Diffusive Behavior of the Multiscale Linear Transport + Equations by Asymptotic-Preserving Convolutional DeepONets + + +
+ In this paper, we introduce two types of novel Asymptotic-Preserving +Convolutional Deep Operator Networks (APCONs) designed to address the +multiscale time-dependent linear transport problem. We observe that the vanilla +physics-informed DeepONets with modified MLP may exhibit instability in +maintaining the desired limiting macroscopic behavior. Therefore, this +necessitates the utilization of an asymptotic-preserving loss function. Drawing +inspiration from the heat kernel in the diffusion equation, we propose a new +architecture called Convolutional Deep Operator Networks, which employ multiple +local convolution operations instead of a global heat kernel, along with +pooling and activation operations in each filter layer. Our APCON methods +possess a parameter count that is independent of the grid size and are capable +of capturing the diffusive behavior of the linear transport problem. Finally, +we validate the effectiveness of our methods through several numerical +examples. + +
+
+
+
+
+ + ♻ ☆ CUTS+: High-dimensional Causal Discovery from Irregular Time-series AAAI-24 + + +
+ Causal discovery in time-series is a fundamental problem in the machine
+learning community, enabling causal reasoning and decision-making in complex
+scenarios. Recently, researchers have successfully discovered causality by
+combining neural networks with Granger causality, but their performance
+degrades largely when encountering high-dimensional data because of highly
+redundant network designs and huge causal graphs. Moreover, missing entries
+in the observations further hamper causal structural learning. To overcome
+these limitations, we propose CUTS+, which is built on the
+Granger-causality-based causal discovery method CUTS and raises the
+scalability by introducing a technique called coarse-to-fine discovery (C2FD)
+and leveraging a message-passing-based graph neural network (MPGNN). Compared
+to previous methods on simulated, quasi-real, and real datasets, we show that
+CUTS+ largely improves the causal discovery performance on high-dimensional
+data with different types of irregular sampling.
+
+
+
+ comment: Submit to AAAI-24 +
+
+
+
+
+ + ♻ ☆ ST-former for short-term passenger flow prediction during COVID-19 in + urban rail transit system + + +
+ Accurate passenger flow prediction for urban rail transit is essential for
+improving the performance of intelligent transportation systems, especially
+during an epidemic. How to dynamically model the complex spatiotemporal
+dependencies of passenger flow is the main issue in achieving accurate
+passenger flow prediction during an epidemic. To solve this issue, this paper
+proposes a brand-new transformer-based architecture called ST-former under
+the encoder-decoder framework, specifically for COVID-19. Concretely, we
+develop a modified self-attention mechanism named Causal-Convolution
+ProbSparse Self-Attention (CPSA) to model the multiple temporal dependencies
+of passenger flow with low computational costs. To capture the complex and
+dynamic spatial dependencies, we introduce a novel Adaptive Multi-Graph
+Convolution Network (AMGCN) that leverages multiple graphs in a self-adaptive
+manner. Additionally, the Multi-source Data Fusion block fuses the passenger
+flow data, COVID-19 confirmed case data, and relevant social media data to
+study the impact of COVID-19 on passenger flow. Experiments on real-world
+passenger flow datasets demonstrate the superiority of ST-former over eleven
+state-of-the-art methods. Several ablation studies are carried out to verify
+the effectiveness and reliability of our model structure. The results can
+provide critical insights for the operation of URT systems.
+
+
+
+ comment: There are some errors in this version that might mislead readers.
+ There is no new version right now
+
+
+
+
+
+ + ♻ ☆ Spatial-Temporal Attention Fusion Network for short-term passenger flow + prediction on holidays in urban rail transit systems + + +
+ Short-term passenger flow prediction for urban rail transit systems is of
+great significance for traffic operation and management. Emerging deep
+learning-based models provide effective methods to improve prediction
+accuracy. However, most of the existing models mainly predict the passenger
+flow on general weekdays or weekends. Only a few studies focus on predicting
+the passenger flow on holidays, which is a significantly challenging task for
+traffic management because of its suddenness and irregularity. To this end,
+we propose a deep learning-based model named Spatial-Temporal Attention
+Fusion Network (STAFN), comprising a novel Multi-Graph Attention Network, a
+Conv-Attention Block, and a Feature Fusion Block, for short-term passenger
+flow prediction on holidays. The multi-graph attention network is applied to
+dynamically extract the complex spatial dependencies of passenger flow, and
+the conv-attention block is applied to extract the temporal dependencies of
+passenger flow from global and local perspectives. Moreover, in addition to
+historical passenger flow data, social media data, which have been shown to
+effectively reflect the evolution of passenger flow under events, are also
+fused into the feature fusion block of STAFN. STAFN is tested on two
+large-scale urban rail transit AFC datasets from China on the New Year
+holiday, and its prediction performance is compared with that of several
+conventional prediction models. Results demonstrate its robustness and
+advantages over benchmark methods, which can provide strong support for
+practical applications of short-term passenger flow prediction on holidays.
+
+
+
+ comment: There are some errors in this version that might mislead readers.
+ There is no new version right now
+
+
+
+
+
+ + ♻ ☆ STG-GAN: A spatiotemporal graph generative adversarial networks for + short-term passenger flow prediction in urban rail transit systems + + +
+ Short-term passenger flow prediction is an important but challenging task
+for better managing urban rail transit (URT) systems. Some emerging deep
+learning models provide good insights to improve short-term prediction
+accuracy. However, there exist many complex spatiotemporal dependencies in
+URT systems. Most previous methods only consider the absolute error between
+ground truth and predictions as the optimization objective, which fails to
+account for spatial and temporal constraints on the predictions. Furthermore,
+a large number of existing prediction models introduce complex neural network
+layers to improve accuracy while ignoring their training efficiency and
+memory occupancy, decreasing their chances of being applied to the real
+world. To overcome these limitations, we propose a novel deep learning-based
+spatiotemporal graph generative adversarial network (STG-GAN) model with
+higher prediction accuracy, higher efficiency, and lower memory occupancy to
+predict short-term passenger flows of the URT network. Our model consists of
+two major parts, which are optimized in an adversarial learning manner: (1) a
+generator network including gated temporal convolutional networks (TCN) and
+weight-sharing graph convolution networks (GCN) to capture structural
+spatiotemporal dependencies and generate predictions with a relatively small
+computational burden; (2) a discriminator network including a spatial
+discriminator and a temporal discriminator to enhance the spatial and
+temporal constraints of the predictions. STG-GAN is evaluated on two
+large-scale real-world datasets from the Beijing Subway. A comparison with
+several state-of-the-art models illustrates its superiority and robustness.
+This study can provide critical experience in conducting short-term passenger
+flow predictions, especially from the perspective of real-world applications.
+
+
+
+ comment: There are some errors in this version that might mislead readers.
+ There is no new version right now
+
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning with Multitask Episodic Memory Based on + Task-Conditioned Hypernetwork + + +
+ Deep reinforcement learning algorithms are usually impeded by sampling
+inefficiency, heavily depending on multiple interactions with the environment
+to acquire accurate decision-making capabilities. In contrast, humans rely on
+their hippocampus to retrieve relevant information from past experiences of
+relevant tasks, which guides their decision-making when learning a new task,
+rather than exclusively depending on environmental interactions.
+Nevertheless, designing a hippocampus-like module for an agent to incorporate
+past experiences into established reinforcement learning algorithms presents
+two challenges. The first challenge involves selecting the most relevant past
+experiences for the current task, and the second challenge is integrating
+such experiences into the decision network. To address these challenges, we
+propose a novel method that utilizes a retrieval network based on a
+task-conditioned hypernetwork, which adapts the retrieval network's
+parameters depending on the task. At the same time, a dynamic modification
+mechanism enhances the collaborative efforts between the retrieval and
+decision networks. We evaluate the proposed method on the MiniGrid
+environment. The experimental results demonstrate that our proposed method
+significantly outperforms strong baselines.
+
+
+
+
+
+
+ + ♻ ☆ Interaction-Aware Personalized Vehicle Trajectory Prediction Using + Temporal Graph Neural Networks + + +
+ Accurate prediction of vehicle trajectories is vital for advanced driver +assistance systems and autonomous vehicles. Existing methods mainly rely on +generic trajectory predictions derived from large datasets, overlooking the +personalized driving patterns of individual drivers. To address this gap, we +propose an approach for interaction-aware personalized vehicle trajectory +prediction that incorporates temporal graph neural networks. Our method +utilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to +model the spatio-temporal interactions between target vehicles and their +surrounding traffic. To personalize the predictions, we establish a pipeline +that leverages transfer learning: the model is initially pre-trained on a +large-scale trajectory dataset and then fine-tuned for each driver using their +specific driving data. We employ human-in-the-loop simulation to collect +personalized naturalistic driving trajectories and corresponding surrounding +vehicle trajectories. Experimental results demonstrate the superior performance +of our personalized GCN-LSTM model, particularly for longer prediction +horizons, compared to its generic counterpart. Moreover, the personalized model +outperforms individual models created without pre-training, emphasizing the +significance of pre-training on a large dataset to avoid overfitting. By +incorporating personalization, our approach enhances trajectory prediction +accuracy. + +
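+
+ The personalization pipeline described above is standard transfer learning:
+ pre-train once on the large trajectory dataset, then fine-tune a copy of the
+ model per driver at a smaller learning rate. A generic PyTorch sketch (the
+ loss, learning rate, and data format are illustrative, not the paper's exact
+ setup):
+
+```python
+import copy
+import torch
+
+def personalize(pretrained_model, driver_loader, lr=1e-4, epochs=5):
+    """Fine-tune a copy of the pre-trained trajectory predictor on a single
+    driver's data; the pre-trained weights serve as the initialization that
+    keeps the small personal dataset from causing overfitting."""
+    model = copy.deepcopy(pretrained_model)
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
+    loss_fn = torch.nn.MSELoss()
+    model.train()
+    for _ in range(epochs):
+        for history, future in driver_loader:
+            optimizer.zero_grad()
+            pred = model(history)          # predicted future trajectory
+            loss = loss_fn(pred, future)
+            loss.backward()
+            optimizer.step()
+    return model
+```
+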
+
+
+
+
+ + ♻ ☆ Neuro-Dynamic State Estimation for Networked Microgrids + + +
+ We devise neuro-dynamic state estimation (Neuro-DSE), a learning-based +dynamic state estimation (DSE) algorithm for networked microgrids (NMs) under +unknown subsystems. Our contributions include: 1) a data-driven Neuro-DSE +algorithm for NMs DSE with partially unidentified dynamic models, which +incorporates the neural-ordinary-differential-equations (ODE-Net) into Kalman +filters; 2) a self-refining Neuro-DSE algorithm (Neuro-DSE+) which enables +data-driven DSE under limited and noisy measurements by establishing an +automatic filtering, augmenting and correcting framework; 3) a +Neuro-KalmanNet-DSE algorithm which further integrates KalmanNet with Neuro-DSE +to relieve the model mismatch of both neural- and physics-based dynamic models; +and 4) an augmented Neuro-DSE for joint estimation of NMs states and unknown +parameters (e.g., inertia). Extensive case studies demonstrate the efficacy of +Neuro-DSE and its variants under different noise levels, control modes, power +sources, observabilities and model knowledge, respectively. + +
+
+ comment: This paper needs to be withdrawn by the author. In Section II, Part
+ C, there is a lack of a procedure for achieving parameter estimation with
+ the proposed model. In Section V, Part E, the experiment parameter settings
+ are missing. The noise for the inertia estimation case needs to be reset for
+ the simulation. Additional tests need to be added. These two parts need to
+ be rewritten
+
+
+
+
+
+ + ♻ ☆ DyTed: Disentangled Representation Learning for Discrete-time Dynamic + Graph + + +
+ Unsupervised representation learning for dynamic graphs has attracted a lot +of research attention in recent years. Compared with static graph, the dynamic +graph is a comprehensive embodiment of both the intrinsic stable +characteristics of nodes and the time-related dynamic preference. However, +existing methods generally mix these two types of information into a single +representation space, which may lead to poor explanation, less robustness, and +a limited ability when applied to different downstream tasks. To solve the +above problems, in this paper, we propose a novel disenTangled representation +learning framework for discrete-time Dynamic graphs, namely DyTed. We specially +design a temporal-clips contrastive learning task together with a structure +contrastive learning to effectively identify the time-invariant and +time-varying representations respectively. To further enhance the +disentanglement of these two types of representation, we propose a +disentanglement-aware discriminator under an adversarial learning framework +from the perspective of information theory. Extensive experiments on Tencent +and five commonly used public datasets demonstrate that DyTed, as a general +framework that can be applied to existing methods, achieves state-of-the-art +performance on various downstream tasks, as well as be more robust against +noise. + +
+
+
+
+
+ + ♻ ☆ Explainable Machine Learning for Categorical and Mixed Data with + Lossless Visualization + + +
+ Building accurate and interpretable Machine Learning (ML) models for +heterogeneous/mixed data is a long-standing challenge for algorithms designed +for numeric data. This work focuses on developing numeric coding schemes for +non-numeric attributes for ML algorithms to support accurate and explainable ML +models, methods for lossless visualization of n-D non-numeric categorical data +with visual rule discovery in these visualizations, and accurate and +explainable ML models for categorical data. This study proposes a +classification of mixed data types and analyzes their important role in Machine +Learning. It presents a toolkit for enforcing interpretability of all internal +operations of ML algorithms on mixed data with a visual data exploration on +mixed data. A new Sequential Rule Generation (SRG) algorithm for explainable +rule generation with categorical data is proposed and successfully evaluated in +multiple computational experiments. This work is one of the steps to the full +scope ML algorithms for mixed data supported by lossless visualization of n-D +data in General Line Coordinates beyond Parallel Coordinates. + +
+
+ comment: 46 pages, 32 figures, 29 tables. arXiv admin note: substantial text + overlap with arXiv:2206.06476 +
+
+
+
+
+ + ♻ ☆ Prompting the Hidden Talent of Web-Scale Speech Models for Zero-Shot + Task Generalization + + +
+ We investigate the emergent abilities of the recently proposed web-scale +speech model Whisper, by adapting it to unseen tasks with prompt engineering. +We selected three tasks: audio-visual speech recognition (AVSR), code-switched +speech recognition (CS-ASR), and speech translation (ST) on unseen language +pairs. We design task-specific prompts, by either leveraging another +large-scale model, or simply manipulating the special tokens in the default +prompts. Experiments show that compared to the default prompts, our proposed +prompts improve performance by 10% to 45% on the three zero-shot tasks, and +even outperform SotA supervised models on some datasets. In addition, our +experiments reveal many interesting properties of Whisper, including its +robustness to prompts, bias on accents, and the multilingual understanding in +its latent space. Code is available at +https://github.com/jasonppy/PromptingWhisper + +
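+
+ The basic mechanism of injecting text into Whisper's decoder context can be
+ tried with the open-source openai-whisper package via its `initial_prompt`
+ argument; a hedged sketch (the prompt string and file name are illustrative,
+ not one of the paper's prompts):
+
+```python
+import whisper
+
+# Any checkpoint size works for this sketch.
+model = whisper.load_model("base")
+
+# Seed the decoder context with mixed-language text to bias decoding toward
+# code-switched speech (illustrative prompt, not taken from the paper).
+result = model.transcribe(
+    "audio.wav",
+    task="transcribe",
+    initial_prompt="以下是普通话和 English 混合的句子。",
+)
+print(result["text"])
+```
+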
+
+ comment: Interspeech 2023 +
+
+
+
+
+ + ♻ ☆ Ablating Concepts in Text-to-Image Diffusion Models ICCV 2023 + + +
+ Large-scale text-to-image diffusion models can generate high-fidelity images +with powerful compositional ability. However, these models are typically +trained on an enormous amount of Internet data, often containing copyrighted +material, licensed images, and personal photos. Furthermore, they have been +found to replicate the style of various living artists or memorize exact +training samples. How can we remove such copyrighted concepts or images without +retraining the model from scratch? To achieve this goal, we propose an +efficient method of ablating concepts in the pretrained model, i.e., preventing +the generation of a target concept. Our algorithm learns to match the image +distribution for a target style, instance, or text prompt we wish to ablate to +the distribution corresponding to an anchor concept. This prevents the model +from generating target concepts given its text condition. Extensive experiments +show that our method can successfully prevent the generation of the ablated +concept while preserving closely related concepts in the model. + +
+
+ comment: ICCV 2023. Project website: https://www.cs.cmu.edu/~concept-ablation/ +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech + Separation + + +
+ The integration of different modalities, such as audio and visual +information, plays a crucial role in human perception of the surrounding +environment. Recent research has made significant progress in designing fusion +modules for audio-visual speech separation. However, they predominantly focus +on multi-modal fusion architectures situated either at the top or bottom +positions, rather than comprehensively considering multi-modal fusion at +various hierarchical positions within the network. In this paper, we propose a +novel model called self- and cross-attention network (SCANet), which leverages +the attention mechanism for efficient audio-visual feature fusion. SCANet +consists of two types of attention blocks: self-attention (SA) and +cross-attention (CA) blocks, where the CA blocks are distributed at the top +(TCA), middle (MCA) and bottom (BCA) of SCANet. These blocks maintain the +ability to learn modality-specific features and enable the extraction of +different semantics from audio-visual features. Comprehensive experiments on +three standard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2) +demonstrate the effectiveness of SCANet, outperforming existing +state-of-the-art (SOTA) methods while maintaining comparable inference time. + +
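+
+ A minimal PyTorch sketch of one cross-attention (CA) block of the kind
+ described above, where features from one modality query the other; the
+ dimensions are illustrative and this is not the authors' implementation:
+
+```python
+import torch
+import torch.nn as nn
+
+class CrossAttentionBlock(nn.Module):
+    """One modality (e.g., audio) attends to the other (e.g., visual)."""
+    def __init__(self, dim: int = 256, heads: int = 4):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.norm = nn.LayerNorm(dim)
+
+    def forward(self, query_feats, context_feats):
+        # query_feats: (B, T_q, dim); context_feats: (B, T_kv, dim)
+        fused, _ = self.attn(query_feats, context_feats, context_feats)
+        return self.norm(query_feats + fused)  # residual connection + norm
+
+audio = torch.randn(2, 100, 256)  # (batch, audio frames, feature dim)
+video = torch.randn(2, 25, 256)   # (batch, video frames, feature dim)
+fused_audio = CrossAttentionBlock()(audio, video)
+```
+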
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ☆ Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme + Detection ACM MM + + +
+ Hateful meme detection is a challenging multimodal task that requires +comprehension of both vision and language, as well as cross-modal interactions. +Recent studies have tried to fine-tune pre-trained vision-language models +(PVLMs) for this task. However, with increasing model sizes, it becomes +important to leverage powerful PVLMs more efficiently, rather than simply +fine-tuning them. Recently, researchers have attempted to convert meme images +into textual captions and prompt language models for predictions. This approach +has shown good performance but suffers from non-informative image captions. +Considering the two factors mentioned above, we propose a probing-based +captioning approach to leverage PVLMs in a zero-shot visual question answering +(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful +content-related questions and use the answers as image captions (which we call +Pro-Cap), so that the captions contain information critical for hateful content +detection. The good performance of models with Pro-Cap on three benchmarks +validates the effectiveness and generalization of the proposed method. + +
+
+ comment: Camera-ready for ACM MM 2023
+
+
+
+
+
+ + ♻ ☆ Understanding User Behavior in Volumetric Video Watching: Dataset, + Analysis and Prediction ACM MM'23 + + +
+ Volumetric video has emerged as an attractive new video paradigm in recent years
+since it provides an immersive and interactive 3D viewing experience with six
+degrees of freedom (DoF). Unlike traditional 2D or panoramic videos, volumetric
+videos require dense point clouds, voxels, meshes, or huge neural models to
+depict volumetric scenes, which results in a prohibitively high bandwidth
+burden for video delivery. User behavior analysis, especially viewport
+and gaze analysis, therefore plays a significant role in prioritizing the content
+streaming within users' viewport and degrading the remaining content to
+maximize user QoE with limited bandwidth. Although understanding user behavior
+is crucial, to the best of our knowledge, there are no available 3D
+volumetric video viewing datasets containing fine-grained user interactivity
+features, not to mention further analysis and behavior prediction. In this
+paper, we release, for the first time, a volumetric video viewing behavior
+dataset, with a large scale, multiple dimensions, and diverse conditions. We
+conduct an in-depth analysis to understand user behaviors when viewing
+volumetric videos. Interesting findings on user viewport, gaze, and motion
+preference related to different videos and users are revealed. We finally
+design a transformer-based viewport prediction model that fuses the features of
+both gaze and motion, which is able to achieve high accuracy under various
+conditions. Our prediction model is expected to further benefit volumetric
+video streaming optimization. Our dataset, along with the corresponding
+visualization tools, is accessible at
+https://cuhksz-inml.github.io/user-behavior-in-vv-watching/
+
+
+ comment: Accepted by ACM MM'23 +
+
+
+
+
+ + ♻ ☆ Seeing through the Brain: Image Reconstruction of Visual Perception from + Human Brain Signals + + +
+ Seeing is believing; however, the underlying mechanism of how human visual
+perceptions are intertwined with our cognition is still a mystery. Thanks to
+recent advances in both neuroscience and artificial intelligence, we have
+been able to record visually evoked brain activities and mimic the visual
+perception ability through computational approaches. In this paper, we focus on
+visual stimuli reconstruction, i.e., reconstructing the observed
+images from portably accessible brain signals, namely electroencephalography
+(EEG) data. Since EEG signals are dynamic time series and notoriously noisy,
+processing and extracting useful information from them requires
+dedicated effort. In this paper, we propose a comprehensive pipeline,
+named NeuroImagen, for reconstructing visual stimuli images from EEG signals.
+Specifically, we incorporate a novel multi-level perceptual information
+decoding to draw multi-grained outputs from the given EEG data. A latent
+diffusion model then leverages the extracted information to reconstruct
+high-resolution visual stimuli images. The experimental results illustrate
+the effectiveness of image reconstruction and the superior quantitative
+performance of our proposed method.
+
+
+ comment: A preprint version of an ongoing work +
+
+
+
+
+ + ♻ ☆ VoxBlink: X-Large Speaker Verification Dataset on Camera ICASSP2023 + + +
+ In this paper, we contribute a novel and extensive dataset for speaker
+verification, which contains a noisy set of 38k identities/1.45M utterances
+(VoxBlink) and a relatively clean set of 18k identities/1.02M utterances
+(VoxBlink-Clean) for training. Firstly, we collect a list of 60K+ users along
+with their avatars and download their short videos from YouTube. Then, an
+automatic pipeline is devised to extract the target user's speech segments and
+videos, which is efficient and scalable. To the best of our knowledge, the
+VoxBlink dataset is the largest speaker recognition dataset. Secondly, we
+develop a series of experiments based on VoxBlink-Clean together with
+VoxCeleb2. Our findings highlight a notable improvement in performance, ranging
+from 15% to 30%, across different backbone architectures, upon integrating our
+dataset for training. The dataset will be released soon.
+
+
+ comment: Submitted to ICASSP2023
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 57 + +
+
+
+ + ☆ RAVEN: In-Context Learning with Retrieval Augmented Encoder-Decoder + Language Models + + +
+ In this paper, we investigate the in-context learning ability of +retrieval-augmented encoder-decoder language models. We first conduct a +comprehensive analysis of the state-of-the-art ATLAS model and identify its +limitations in in-context learning, primarily due to a mismatch between +pretraining and testing, as well as a restricted context length. To address +these issues, we propose RAVEN, a model that combines retrieval-augmented +masked language modeling and prefix language modeling. We further introduce +Fusion-in-Context Learning to enhance the few-shot performance by enabling the +model to leverage more in-context examples without requiring additional +training or model modifications. Through extensive experiments, we demonstrate +that RAVEN significantly outperforms ATLAS and achieves results comparable to +the most advanced language models in certain scenarios, despite having +substantially fewer parameters. Our work underscores the potential of +retrieval-augmented encoder-decoder language models for in-context learning and +encourages further research in this direction. + +
+
+
+
+
+ + ☆ Solving Challenging Math Word Problems Using GPT-4 Code Interpreter with + Code-based Self-Verification + + +
+ Recent progress in large language models (LLMs) like GPT-4 and PaLM-2 has +brought significant advancements in addressing math reasoning problems. In +particular, OpenAI's latest version of GPT-4, known as GPT-4 Code Interpreter, +shows remarkable performance on challenging math datasets. In this paper, we +explore the effect of code on enhancing LLMs' reasoning capability by +introducing different constraints on the \textit{Code Usage Frequency} of GPT-4 +Code Interpreter. We found that its success can be largely attributed to its +powerful skills in generating and executing code, evaluating the output of code +execution, and rectifying its solution when receiving unreasonable outputs. +Based on this insight, we propose a novel and effective prompting method, +explicit \uline{c}ode-based \uline{s}elf-\uline{v}erification~(CSV), to further +boost the mathematical reasoning potential of GPT-4 Code Interpreter. This +method employs a zero-shot prompt on GPT-4 Code Interpreter to encourage it to +use code to self-verify its answers. In instances where the verification state +registers as ``False'', the model shall automatically amend its solution, +analogous to our approach of rectifying errors during a mathematics +examination. Furthermore, we recognize that the states of the verification +result indicate the confidence of a solution, which can improve the +effectiveness of majority voting. With GPT-4 Code Interpreter and CSV, we +achieve an impressive zero-shot accuracy on MATH dataset \textbf{(53.9\% $\to$ +84.3\%)}. + +
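The verification-conditioned voting idea can be illustrated with a small sketch; the verification states and the weights attached to them below are assumptions for demonstration, not the values used in the paper.

```python
# Illustrative sketch of verification-weighted majority voting over sampled answers.
# The verification states and their weights are assumptions, not the paper's values.
from collections import defaultdict

def weighted_vote(samples):
    """samples: list of (answer, verification_state) with state in {'True', 'Uncertain', 'False'}."""
    weight = {"True": 1.0, "Uncertain": 0.5, "False": 0.2}   # assumed confidence weights
    scores = defaultdict(float)
    for answer, state in samples:
        scores[answer] += weight.get(state, 0.0)
    return max(scores, key=scores.get)

samples = [("84", "True"), ("84", "True"), ("72", "False"), ("84", "Uncertain")]
print(weighted_vote(samples))  # -> "84"
```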
+
+ comment: Solving Challenging Math Word Problems Using GPT-4 Code Interpreter + with Code-based Self-Verification +
+
+
+
+
+ + ☆ Through the Lens of Core Competency: Survey on Evaluation of Large + Language Models + + +
+ From pre-trained language models (PLMs) to large language models (LLMs), the
+field of natural language processing (NLP) has witnessed steep performance
+gains and wide practical uses. The evaluation of a research field guides its
+direction of improvement. However, LLMs are extremely hard to evaluate
+thoroughly for two reasons. First, traditional NLP tasks have become inadequate
+due to the excellent performance of LLMs. Second, existing evaluation tasks
+struggle to keep up with the wide range of applications in real-world
+scenarios. To tackle these problems, existing works have proposed various
+benchmarks to better evaluate LLMs. To clarify the numerous evaluation tasks in
+both academia and industry, we investigate multiple papers concerning LLM
+evaluations. We summarize four core competencies of LLMs: reasoning,
+knowledge, reliability, and safety. For every competency, we introduce its
+definition, corresponding benchmarks, and metrics. Under this competency
+architecture, similar tasks are combined to reflect the corresponding ability,
+while new tasks can also be easily added into the system. Finally, we give our
+suggestions on the future direction of LLM evaluation.
+
+
+
+
+
+ + ☆ The Regular Expression Inference Challenge + + +
+ We propose \emph{regular expression inference (REI)} as a challenge for +code/language modelling, and the wider machine learning community. REI is a +supervised machine learning (ML) and program synthesis task, and poses the +problem of finding minimal regular expressions from examples: Given two finite +sets of strings $P$ and $N$ and a cost function $\text{cost}(\cdot)$, the task +is to generate an expression $r$ that accepts all strings in $P$ and rejects +all strings in $N$, while no other such expression $r'$ exists with +$\text{cost}(r')<\text{cost}(r)$. + REI has advantages as a challenge problem: (i) regular expressions are +well-known, widely used, and a natural idealisation of code; (ii) REI's +asymptotic worst-case complexity is well understood; (iii) REI has a small +number of easy to understand parameters (e.g.~$P$ or $N$ cardinality, string +lengths of examples, or the cost function); this lets us easily finetune +REI-hardness; (iv) REI is an unsolved problem for deep learning based ML. + Recently, an REI solver was implemented on GPUs, using program synthesis +techniques. This enabled, for the first time, fast generation of minimal +expressions for complex REI instances. Building on this advance, we generate +and publish the first large-scale datasets for REI, and devise and evaluate +several initial heuristic and machine learning baselines. + We invite the community to participate and explore ML methods that learn to +solve REI problems. We believe that progress in REI directly translates to +code/language modelling. + +
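Checking a candidate expression against an REI instance is straightforward to prototype; in the sketch below, the length-based cost function is an illustrative assumption, since the challenge permits arbitrary cost functions.

```python
# Check whether a candidate regular expression solves an REI instance (P, N)
# and compute a toy cost; the cost function here (expression length) is an
# illustrative assumption -- the challenge allows arbitrary cost functions.
import re

def accepts_all_rejects_all(r: str, P: set[str], N: set[str]) -> bool:
    pattern = re.compile(rf"^(?:{r})$")
    return all(pattern.match(p) for p in P) and not any(pattern.match(n) for n in N)

def cost(r: str) -> int:
    return len(r)

P = {"ab", "aab", "aaab"}
N = {"b", "ba", ""}
candidate = "a+b"
print(accepts_all_rejects_all(candidate, P, N), cost(candidate))  # True 3
```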
+
+ comment: 7 pages, 3 pages appendix, 6 tables +
+
+
+
+
+ + ☆ Link-Context Learning for Multimodal LLMs + + +
+ The ability to learn from context with novel concepts and deliver
+appropriate responses is essential in human conversations. Despite current
+Multimodal Large Language Models (MLLMs) and Large Language Models (LLMs) being
+trained on mega-scale datasets, recognizing unseen images or understanding
+novel concepts in a training-free manner remains a challenge. In-Context
+Learning (ICL) explores training-free few-shot learning, where models are
+encouraged to "learn to learn" from limited tasks and generalize to unseen
+tasks. In this work, we propose link-context learning (LCL), which emphasizes
+"reasoning from cause and effect" to augment the learning capabilities of
+MLLMs. LCL goes beyond traditional ICL by explicitly strengthening the causal
+relationship between the support set and the query set. By providing
+demonstrations with causal links, LCL guides the model to discern not only the
+analogy but also the underlying causal associations between data points, which
+empowers MLLMs to recognize unseen images and understand novel concepts more
+effectively. To facilitate the evaluation of this novel approach, we introduce
+the ISEKAI dataset, comprising exclusively unseen generated image-label
+pairs designed for link-context learning. Extensive experiments show that our
+LCL-MLLM exhibits stronger link-context learning capabilities on novel concepts
+than vanilla MLLMs. Code and data will be released at
+https://github.com/isekai-portal/Link-Context-Learning.
+
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ A Comprehensive Study on Knowledge Graph Embedding over Relational + Patterns Based on Rule Learning ISWC 2023 + + +
+ Knowledge Graph Embedding (KGE) has proven to be an effective approach to
+solving the Knowledge Graph Completion (KGC) task. Relational patterns, which
+refer to relations with specific semantics exhibiting graph patterns, are an
+important factor in the performance of KGE models. Although KGE models'
+capabilities have been analyzed over different relational patterns in theory,
+and a rough connection between better relational-pattern modeling and better
+KGC performance has been established, a comprehensive quantitative analysis of
+KGE models over relational patterns remains absent, so it is uncertain how the
+theoretical support of KGE for a relational pattern contributes to the
+performance of triples associated with that pattern. To address this
+challenge, we evaluate the performance of 7 KGE models over 4 common relational
+patterns on 2 benchmarks, then conduct an analysis from three aspects, namely
+theory, entity frequency, and part-to-whole, and reach some counterintuitive
+conclusions. Finally, we introduce a training-free method, Score-based Patterns
+Adaptation (SPA), to enhance KGE models' performance over various relational
+patterns. This approach is simple yet effective and can be applied to KGE
+models without additional training. Our experimental results demonstrate that
+our method generally enhances performance over specific relational patterns.
+Our source code is available from GitHub at
+https://github.com/zjukg/Comprehensive-Study-over-Relational-Patterns.
+
+
+ comment: This paper is accepted by ISWC 2023 +
+
+
+
+
+ + ☆ Synthesizing Political Zero-Shot Relation Classification via Codebook + Knowledge, NLI, and ChatGPT + + +
+ Recent supervised models for event coding vastly outperform pattern-matching +methods. However, their reliance solely on new annotations disregards the vast +knowledge within expert databases, hindering their applicability to +fine-grained classification. To address these limitations, we explore zero-shot +approaches for political event ontology relation classification, by leveraging +knowledge from established annotation codebooks. Our study encompasses both +ChatGPT and a novel natural language inference (NLI) based approach named ZSP. +ZSP adopts a tree-query framework that deconstructs the task into context, +modality, and class disambiguation levels. This framework improves +interpretability, efficiency, and adaptability to schema changes. By conducting +extensive experiments on our newly curated datasets, we pinpoint the +instability issues within ChatGPT and highlight the superior performance of +ZSP. ZSP achieves an impressive 40% improvement in F1 score for fine-grained +Rootcode classification. ZSP demonstrates competitive performance compared to +supervised BERT models, positioning it as a valuable tool for event record +validation and ontology development. Our work underscores the potential of +leveraging transfer learning and existing expertise to enhance the efficiency +and scalability of research in the field. + +
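The NLI-based zero-shot classification that underlies this kind of approach can be tried with an off-the-shelf pipeline; the checkpoint and the candidate labels below are placeholders, not the paper's ontology, templates, or models.

```python
# Generic NLI-based zero-shot relation classification; the model name and the
# labels are illustrative placeholders, not the paper's ontology or checkpoint.
from transformers import pipeline

classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")  # any NLI-trained model works

text = "Government forces shelled the northern district overnight."
candidate_labels = ["armed conflict", "diplomatic meeting", "humanitarian aid"]

result = classifier(text, candidate_labels)
print(result["labels"][0], round(result["scores"][0], 3))
```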
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Emotion Embeddings $\unicode{x2014}$ Learning Stable and Homogeneous + Abstractions from Heterogeneous Affective Datasets + + +
+ Human emotion is expressed in many communication modalities and media formats +and so their computational study is equally diversified into natural language +processing, audio signal analysis, computer vision, etc. Similarly, the large +variety of representation formats used in previous research to describe +emotions (polarity scales, basic emotion categories, dimensional approaches, +appraisal theory, etc.) have led to an ever proliferating diversity of +datasets, predictive models, and software tools for emotion analysis. Because +of these two distinct types of heterogeneity, at the expressional and +representational level, there is a dire need to unify previous work on +increasingly diverging data and label types. This article presents such a +unifying computational model. We propose a training procedure that learns a +shared latent representation for emotions, so-called emotion embeddings, +independent of different natural languages, communication modalities, media or +representation label formats, and even disparate model architectures. +Experiments on a wide range of heterogeneous affective datasets indicate that +this approach yields the desired interoperability for the sake of reusability, +interpretability and flexibility, without penalizing prediction quality. Code +and data are archived under https://doi.org/10.5281/zenodo.7405327 . + +
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ☆ Informed Named Entity Recognition Decoding for Generative Language + Models + + +
+ Ever-larger language models with ever-increasing capabilities are by now +well-established text processing tools. Alas, information extraction tasks such +as named entity recognition are still largely unaffected by this progress as +they are primarily based on the previous generation of encoder-only transformer +models. Here, we propose a simple yet effective approach, Informed Named Entity +Recognition Decoding (iNERD), which treats named entity recognition as a +generative process. It leverages the language understanding capabilities of +recent generative models in a future-proof manner and employs an informed +decoding scheme incorporating the restricted nature of information extraction +into open-ended text generation, improving performance and eliminating any risk +of hallucinations. We coarse-tune our model on a merged named entity corpus to +strengthen its performance, evaluate five generative language models on eight +named entity recognition datasets, and achieve remarkable results, especially +in an environment with an unknown entity class set, demonstrating the +adaptability of the approach. + +
+
+ comment: 12 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ Enhancing Visually-Rich Document Understanding via Layout Structure + Modeling + + +
+ In recent years, the use of multi-modal pre-trained Transformers has led to +significant advancements in visually-rich document understanding. However, +existing models have mainly focused on features such as text and vision while +neglecting the importance of layout relationship between text nodes. In this +paper, we propose GraphLayoutLM, a novel document understanding model that +leverages the modeling of layout structure graph to inject document layout +knowledge into the model. GraphLayoutLM utilizes a graph reordering algorithm +to adjust the text sequence based on the graph structure. Additionally, our +model uses a layout-aware multi-head self-attention layer to learn document +layout knowledge. The proposed model enables the understanding of the spatial +arrangement of text elements, improving document comprehension. We evaluate our +model on various benchmarks, including FUNSD, XFUND and CORD, and achieve +state-of-the-art results among these datasets. Our experimental results +demonstrate that our proposed method provides a significant improvement over +existing approaches and showcases the importance of incorporating layout +information into document understanding models. We also conduct an ablation +study to investigate the contribution of each component of our model. The +results show that both the graph reordering algorithm and the layout-aware +multi-head self-attention layer play a crucial role in achieving the best +performance. + +
+
+
+
+
+ + ☆ Backward Reasoning in Large Language Models for Verification + + +
+ Chain-of-Thought (CoT) prompting has shown promising performance in various
+reasoning tasks. Recently, Self-Consistency \citep{wang2023selfconsistency}
+proposed sampling a diverse set of reasoning chains, which may lead to
+different answers, and selecting the answer that receives the most votes. In
+this paper, we propose a novel method to use backward reasoning in verifying
+candidate answers. We mask a token in the question by ${\bf x}$ and ask the LLM
+to predict the masked token when a candidate answer is provided by \textit{a
+simple template}, i.e., ``\textit{\textbf{If we know the answer of the above
+question is \{a candidate answer\}, what is the value of unknown variable ${\bf
+x}$?}}'' Intuitively, the LLM is expected to predict the masked token
+successfully if the provided candidate answer is correct. We further propose
+FOBAR to combine forward and backward reasoning for estimating the probability
+of candidate answers. We conduct extensive experiments on six datasets and
+three LLMs. Experimental results demonstrate that FOBAR achieves
+state-of-the-art performance on various reasoning benchmarks.
+
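A rough mock-up of this backward-verification step, assuming a generic llm() callable and a simple first-number masking heuristic (both illustrative, not the paper's exact procedure), could look like this:

```python
# Sketch of backward verification: mask one number in the question with "x",
# append the template from the abstract, and check the model's recovered value.
# llm() is a placeholder for an arbitrary chat-model call; the masking heuristic
# (replace the first number) is an illustrative simplification.
import re

TEMPLATE = (" If we know the answer of the above question is {answer}, "
            "what is the value of unknown variable x?")

def backward_check(question: str, candidate_answer: str, llm) -> bool:
    numbers = re.findall(r"\d+", question)
    if not numbers:
        return False
    masked_value = numbers[0]
    masked_question = question.replace(masked_value, "x", 1)
    prompt = masked_question + TEMPLATE.format(answer=candidate_answer)
    predicted = llm(prompt).strip()
    return predicted == masked_value     # a correct answer should let x be recovered

# Example with a stub model that always answers "3":
print(backward_check("Tom has 3 apples and buys 5 more. How many now?", "8", lambda p: "3"))
```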
+
+ comment: Preprint +
+
+
+
+
+ + ☆ SPM: Structured Pretraining and Matching Architectures for Relevance + Modeling in Meituan Search CIKM '23 + + +
+ In e-commerce search, relevance between query and documents is an essential
+requirement for a satisfying user experience. Different from traditional
+e-commerce platforms that offer products, users search on life service
+platforms such as Meituan mainly for product providers, which usually have
+abundant structured information, e.g., name, address, category, and thousands
+of products. Modeling search relevance with these rich structured contents is
+challenging due to the following issues: (1) there is a language distribution
+discrepancy among different fields of a structured document, making it
+difficult to directly adopt off-the-shelf pretrained language model based
+methods like BERT; (2) different fields usually have different importance and
+their lengths vary greatly, making it difficult to extract document information
+helpful for relevance matching.
+ To tackle these issues, in this paper we propose a novel two-stage
+pretraining and matching architecture for relevance matching with rich
+structured documents. At the pretraining stage, we propose an effective
+pretraining method that employs both query and multiple fields of document as
+inputs, including an effective information compression method for lengthy
+fields. At the relevance matching stage, a novel matching method is proposed by
+leveraging domain knowledge in search query to generate more effective document
+representations for relevance scoring. Extensive offline experiments and online
+A/B tests on millions of users verify that the proposed architectures
+effectively improve the performance of relevance modeling. The model has
+already been deployed online, serving the search traffic of Meituan for over a
+year.
+
+
+ comment: Accepted by CIKM '23 +
+
+
+
+
+ + ☆ Exploring Transfer Learning in Medical Image Segmentation using + Vision-Language Models + + +
+ Medical Image Segmentation is crucial in various clinical applications within
+the medical domain. While state-of-the-art segmentation models have proven
+effective, integrating textual guidance to enhance visual features for this
+task remains an area with limited progress. Existing segmentation models that
+utilize textual guidance are primarily trained on open-domain images, raising
+concerns about their direct applicability in the medical domain without manual
+intervention or fine-tuning.
+ To address these challenges, we propose using multimodal vision-language
+models for capturing semantic information from image descriptions and images,
+enabling the segmentation of diverse medical images. This study comprehensively
+evaluates existing vision-language models across multiple datasets to assess
+their transferability from the open domain to the medical field. Furthermore,
+we introduce variations of image descriptions for previously unseen images in
+the dataset, revealing notable variations in model performance based on the
+generated prompts.
+ Our findings highlight the distribution shift between open-domain images
+and the medical domain and show that segmentation models trained on
+open-domain images are not directly transferable to the medical field, but
+their performance can be improved by fine-tuning them on medical datasets.
+We report the zero-shot and fine-tuned segmentation performance of 4 Vision
+Language Models (VLMs) on 11 medical datasets using 9 types of prompts derived
+from 14 attributes.
+
+
+ comment: 25 pages, 9 figures +
+
+
+
+
+ + ☆ Better Zero-Shot Reasoning with Role-Play Prompting + + +
+ Modern large language models (LLMs), such as ChatGPT, exhibit a remarkable +capacity for role-playing, enabling them to embody not only human characters +but also non-human entities like a Linux terminal. This versatility allows them +to simulate complex human-like interactions and behaviors within various +contexts, as well as to emulate specific objects or systems. While these +capabilities have enhanced user engagement and introduced novel modes of +interaction, the influence of role-playing on LLMs' reasoning abilities remains +underexplored. In this study, we introduce a strategically designed role-play +prompting methodology and assess its performance under the zero-shot setting +across twelve diverse reasoning benchmarks, encompassing arithmetic, +commonsense reasoning, symbolic reasoning, and more. Leveraging models such as +ChatGPT and Llama 2, our empirical results illustrate that role-play prompting +consistently surpasses the standard zero-shot approach across most datasets. +Notably, accuracy on AQuA rises from 53.5% to 63.8%, and on Last Letter from +23.8% to 84.2%. Beyond enhancing contextual understanding, we posit that +role-play prompting serves as an implicit Chain-of-Thought (CoT) trigger, +thereby improving the quality of reasoning. By comparing our approach with the +Zero-Shot-CoT technique, which prompts the model to "think step by step", we +further demonstrate that role-play prompting can generate a more effective CoT. +This highlights its potential to augment the reasoning capabilities of LLMs. + +
+
+
+
+
+ + ☆ Attention Is Not All You Need Anymore + + +
+ In recent years, the popular Transformer architecture has achieved great +success in many application areas, including natural language processing and +computer vision. Many existing works aim to reduce the computational and memory +complexity of the self-attention mechanism in the Transformer by trading off +performance. However, performance is key for the continuing success of the +Transformer. In this paper, a drop-in replacement for the self-attention +mechanism in the Transformer, called the Extractor, is proposed. Experimental +results show that replacing the self-attention mechanism with the Extractor +improves the performance of the Transformer. Furthermore, the proposed +Extractor has the potential to run faster than the self-attention since it has +a much shorter critical path of computation. Additionally, the sequence +prediction problem in the context of text generation is formulated using +variable-length discrete-time Markov chains, and the Transformer is reviewed +based on our understanding. + +
+
+
+
+
+ + ☆ SEER: Super-Optimization Explorer for HLS using E-graph Rewriting with + MLIR + + +
+ High-level synthesis (HLS) is a process that automatically translates a +software program in a high-level language into a low-level hardware +description. However, the hardware designs produced by HLS tools still suffer +from a significant performance gap compared to manual implementations. This is +because the input HLS programs must still be written using hardware design +principles. + Existing techniques either leave the program source unchanged or perform a +fixed sequence of source transformation passes, potentially missing +opportunities to find the optimal design. We propose a super-optimization +approach for HLS that automatically rewrites an arbitrary software program into +efficient HLS code that can be used to generate an optimized hardware design. +We developed a toolflow named SEER, based on the e-graph data structure, to +efficiently explore equivalent implementations of a program at scale. SEER +provides an extensible framework, orchestrating existing software compiler +passes and hardware synthesis optimizers. + Our work is the first attempt to exploit e-graph rewriting for large software +compiler frameworks, such as MLIR. Across a set of open-source benchmarks, we +show that SEER achieves up to 38x the performance within 1.4x the area of the +original program. Via an Intel-provided case study, SEER demonstrates the +potential to outperform manually optimized designs produced by hardware +experts. + +
+
+
+
+
+ + ☆ Steering Language Generation: Harnessing Contrastive Expert Guidance and + Negative Prompting for Coherent and Diverse Synthetic Data Generation + + +
+ Large Language Models (LLMs) hold immense potential to generate synthetic +data of high quality and utility, which has numerous applications from +downstream model training to practical data utilisation. However, contemporary +models, despite their impressive capacities, consistently struggle to produce +both coherent and diverse data. To address the coherency issue, we introduce +contrastive expert guidance, where the difference between the logit +distributions of fine-tuned and base language models is emphasised to ensure +domain adherence. In order to ensure diversity, we utilise existing real and +synthetic examples as negative prompts to the model. We deem this dual-pronged +approach to logit reshaping as STEER: Semantic Text Enhancement via Embedding +Repositioning. STEER operates at inference-time and systematically guides the +LLMs to strike a balance between adherence to the data distribution (ensuring +semantic fidelity) and deviation from prior synthetic examples or existing real +datasets (ensuring diversity and authenticity). This delicate balancing act is +achieved by dynamically moving towards or away from chosen representations in +the latent space. STEER demonstrates improved performance over previous +synthetic data generation techniques, exhibiting better balance between data +diversity and coherency across three distinct tasks: hypothesis generation, +toxic and non-toxic comment generation, and commonsense reasoning task +generation. We demonstrate how STEER allows for fine-tuned control over the +diversity-coherency trade-off via its hyperparameters, highlighting its +versatility. + +
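The contrastive expert guidance component can be pictured as a simple recombination of next-token logits; gamma is an assumed guidance strength and the negative-prompt repulsion term is omitted for brevity, so this is a sketch of the idea rather than the authors' implementation.

```python
# Sketch of contrastive expert guidance at decoding time: amplify the direction in
# which the fine-tuned (expert) model's next-token logits differ from the base
# model's. gamma is an assumed guidance strength; negative-prompt repulsion is omitted.
import torch

def contrastive_guidance(expert_logits: torch.Tensor,
                         base_logits: torch.Tensor,
                         gamma: float = 1.5) -> torch.Tensor:
    # expert_logits, base_logits: (batch, vocab) next-token logits
    return base_logits + gamma * (expert_logits - base_logits)

expert = torch.randn(1, 32000)
base = torch.randn(1, 32000)
probs = torch.softmax(contrastive_guidance(expert, base), dim=-1)
print(probs.shape)  # torch.Size([1, 32000])
```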
+
+
+
+
+ + ☆ LLM-Mini-CEX: Automatic Evaluation of Large Language Model for + Diagnostic Conversation + + +
+ There is increasing interest in developing LLMs for medical diagnosis to
+improve diagnosis efficiency. Despite their alluring technological potential,
+there is no unified and comprehensive evaluation criterion, leading to the
+inability to evaluate the quality and potential risks of medical LLMs, further
+hindering the application of LLMs in medical treatment scenarios. Besides,
+current evaluations heavily rely on labor-intensive interactions with LLMs to
+obtain diagnostic dialogues and on human evaluation of the quality of diagnosis
+dialogues. To tackle the lack of a unified and comprehensive evaluation
+criterion, we first establish an evaluation criterion, termed LLM-specific
+Mini-CEX, to assess the diagnostic capabilities of LLMs effectively, based on
+the original Mini-CEX. To address the labor-intensive interaction problem, we
+develop a patient simulator to engage in automatic conversations with LLMs, and
+utilize ChatGPT for evaluating diagnosis dialogues automatically. Experimental
+results show that the LLM-specific Mini-CEX is adequate and necessary to
+evaluate medical diagnosis dialogue. Besides, ChatGPT can replace manual
+evaluation on the metrics of humanistic qualities and provides reproducible and
+automated comparisons between different LLMs.
+
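The automatic-interaction setup can be sketched as a two-agent loop followed by an LLM grader; the chat-model calls are placeholders and the prompts are illustrative, not the LLM-specific Mini-CEX rubric.

```python
# Sketch of the automatic evaluation loop: a patient-simulator LLM converses with the
# doctor LLM under test, and a grader LLM scores the transcript. The chat callables are
# placeholders for any chat-model API; prompts are illustrative, not the paper's rubric.
def simulate_consultation(doctor_chat, patient_chat, case_description, max_turns=6):
    transcript, patient_msg = [], case_description
    for _ in range(max_turns):
        doctor_msg = doctor_chat(patient_msg)              # model under evaluation
        patient_msg = patient_chat(
            f"Case: {case_description}\nDoctor said: {doctor_msg}")
        transcript.append((doctor_msg, patient_msg))
    return transcript

def grade(grader_chat, transcript):
    dialogue = "\n".join(f"Doctor: {d}\nPatient: {p}" for d, p in transcript)
    return grader_chat(
        f"Rate the doctor's questioning and humanistic quality (1-9):\n{dialogue}")
```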
+
+
+
+
+ + ☆ A Survey on Model Compression for Large Language Models + + +
+ Large Language Models (LLMs) have revolutionized natural language processing +tasks with remarkable success. However, their formidable size and computational +demands present significant challenges for practical deployment, especially in +resource-constrained environments. As these challenges become increasingly +pertinent, the field of model compression has emerged as a pivotal research +area to alleviate these limitations. This paper presents a comprehensive survey +that navigates the landscape of model compression techniques tailored +specifically for LLMs. Addressing the imperative need for efficient deployment, +we delve into various methodologies, encompassing quantization, pruning, +knowledge distillation, and more. Within each of these techniques, we highlight +recent advancements and innovative approaches that contribute to the evolving +landscape of LLM research. Furthermore, we explore benchmarking strategies and +evaluation metrics that are essential for assessing the effectiveness of +compressed LLMs. By providing insights into the latest developments and +practical implications, this survey serves as an invaluable resource for both +researchers and practitioners. As LLMs continue to evolve, this survey aims to +facilitate enhanced efficiency and real-world applicability, establishing a +foundation for future advancements in the field. + +
+
+
+
+
+ + ☆ LogPrompt: Prompt Engineering Towards Zero-Shot and Interpretable Log + Analysis + + +
+ Automated log analysis is crucial in modern software-intensive systems for +ensuring reliability and resilience throughout software maintenance and +engineering life cycles. Existing methods perform tasks such as log parsing and +log anomaly detection by providing a single prediction value without +interpretation. However, given the increasing volume of system events, the +limited interpretability of analysis results hinders analysts' trust and their +ability to take appropriate actions. Moreover, these methods require +substantial in-domain training data, and their performance declines sharply (by +up to 62.5%) in online scenarios involving unseen logs from new domains, a +common occurrence due to rapid software updates. In this paper, we propose +LogPrompt, a novel zero-shot and interpretable log analysis approach. LogPrompt +employs large language models (LLMs) to perform zero-shot log analysis tasks +via a suite of advanced prompt strategies tailored for log tasks, which +enhances LLMs' performance by up to 107.5% compared with simple prompts. +Experiments on nine publicly available evaluation datasets across two tasks +demonstrate that LogPrompt, despite using no training data, outperforms +existing approaches trained on thousands of logs by up to around 50%. We also +conduct a human evaluation of LogPrompt's interpretability, with six +practitioners possessing over 10 years of experience, who highly rated the +generated content in terms of usefulness and readability (averagely 4.42/5). +LogPrompt also exhibits remarkable compatibility with open-source and +smaller-scale LLMs, making it flexible for practical deployment. + +
+
+
+
+
+ + ☆ VBD-MT Chinese-Vietnamese Translation Systems for VLSP 2022 + + +
+ We present our systems submitted to the VLSP 2022 machine translation
+shared task. In the shared task this year, we participated in both translation
+tasks, i.e., Chinese-Vietnamese and Vietnamese-Chinese translation. We build
+our systems based on the neural Transformer model with the powerful
+multilingual denoising pre-trained model mBART. The systems are enhanced by a
+sampling method for backtranslation, which leverages large-scale available
+monolingual data. Additionally, several other methods are applied to improve
+the translation quality, including ensembling and postprocessing. We achieve
+38.9 BLEU on Chinese-Vietnamese and 38.0 BLEU on Vietnamese-Chinese on the
+public test sets, which outperforms several strong baselines.
+
+
+
+
+
+ + ☆ A User-Centered Evaluation of Spanish Text Simplification + + +
+ We present an evaluation of text simplification (TS) in Spanish for a +production system, by means of two corpora focused in both complex-sentence and +complex-word identification. We compare the most prevalent Spanish-specific +readability scores with neural networks, and show that the latter are +consistently better at predicting user preferences regarding TS. As part of our +analysis, we find that multilingual models underperform against equivalent +Spanish-only models on the same task, yet all models focus too often on +spurious statistical features, such as sentence length. We release the corpora +in our evaluation to the broader community with the hopes of pushing forward +the state-of-the-art in Spanish natural language processing. + +
+
+ comment: Data at https://github.com/microsoft/BrevE-CLaro +
+
+
+
+
+ + ☆ CALYPSO: LLMs as Dungeon Masters' Assistants + + +
+ The role of a Dungeon Master, or DM, in the game Dungeons & Dragons is to +perform multiple tasks simultaneously. The DM must digest information about the +game setting and monsters, synthesize scenes to present to other players, and +respond to the players' interactions with the scene. Doing all of these tasks +while maintaining consistency within the narrative and story world is no small +feat of human cognition, making the task tiring and unapproachable to new +players. Large language models (LLMs) like GPT-3 and ChatGPT have shown +remarkable abilities to generate coherent natural language text. In this paper, +we conduct a formative evaluation with DMs to establish the use cases of LLMs +in D&D and tabletop gaming generally. We introduce CALYPSO, a system of +LLM-powered interfaces that support DMs with information and inspiration +specific to their own scenario. CALYPSO distills game context into bite-sized +prose and helps brainstorm ideas without distracting the DM from the game. When +given access to CALYPSO, DMs reported that it generated high-fidelity text +suitable for direct presentation to players, and low-fidelity ideas that the DM +could develop further while maintaining their creative agency. We see CALYPSO +as exemplifying a paradigm of AI-augmented tools that provide synchronous +creative assistance within established game worlds, and tabletop gaming more +broadly. + +
+
+ comment: 11 pages, 4 figures. AIIDE 2023 +
+
+
+
+
+ + ☆ Finding Stakeholder-Material Information from 10-K Reports using + Fine-Tuned BERT and LSTM Models + + +
+ All public companies are required by federal securities law to disclose their
+business and financial activities in their annual 10-K reports. Each report
+typically spans hundreds of pages, making it difficult for human readers to
+identify and extract the material information efficiently. To solve the
+problem, I have fine-tuned BERT models and RNN models with LSTM layers to
+identify stakeholder-material information, defined as statements that carry
+information about a company's influence on its stakeholders, including
+customers, employees, investors, and the community and natural environment. The
+existing practice uses keyword search to identify such information, which is my
+baseline model. Using business expert-labeled training data of nearly 6,000
+sentences from 62 10-K reports published in 2022, the best model achieved
+an accuracy of 0.904 and an F1 score of 0.899 on test data, significantly above
+the baseline model's 0.781 and 0.749, respectively. Furthermore, the same work
+was replicated on more granular taxonomies, based on which four distinct groups
+of stakeholders (i.e., customers, investors, employees, and the community and
+natural environment) were tested separately. Similarly, fine-tuned BERT models
+outperformed LSTM and the baseline. The implications for industry application
+and ideas for future extensions are discussed.
+
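A generic fine-tuning recipe for such a binary sentence classifier can be sketched with Hugging Face Transformers; the checkpoint, hyperparameters, and toy examples below are placeholders rather than the author's actual configuration or data.

```python
# Generic BERT fine-tuning sketch for binary sentence classification.
# Checkpoint, hyperparameters, and the toy examples are illustrative placeholders.
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)
from datasets import Dataset

sentences = ["We reduced emissions at all plants.", "The fiscal year ends in December."]
labels = [1, 0]  # 1 = stakeholder-material, 0 = not

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           num_labels=2)

dataset = Dataset.from_dict({"text": sentences, "label": labels})
dataset = dataset.map(lambda ex: tokenizer(ex["text"], truncation=True,
                                           padding="max_length", max_length=64),
                      batched=True)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="out", num_train_epochs=1,
                           per_device_train_batch_size=2),
    train_dataset=dataset,
)
trainer.train()
```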
+
+
+
+
+ + ☆ Data Race Detection Using Large Language Models + + +
+ Large language models (LLMs) are demonstrating significant promise as an
+alternate strategy to facilitate analyses and optimizations of high-performance
+computing programs, circumventing the need for resource-intensive manual tool
+creation. In this paper, we explore a novel LLM-based data race detection
+approach combining prompt engineering and fine-tuning techniques. We create
+a dedicated dataset named DRB-ML, which is derived from DataRaceBench, with
+fine-grained labels showing the presence of data race pairs and their
+associated variables, line numbers, and read/write information. DRB-ML is then
+used to evaluate representative LLMs and fine-tune open-source ones. Our
+experiment shows that LLMs can be a viable approach to data race detection.
+However, they still cannot compete with traditional data race detection tools
+when we need detailed information about variable pairs causing data races.
+
+
+
+
+
+ + ☆ The Costly Dilemma: Generalization, Evaluation and Cost-Optimal + Deployment of Large Language Models + + +
+ When deploying machine learning models in production for any
+product/application, there are three properties that are commonly desired.
+First, the models should be generalizable, in that we can extend them to
+further use cases as our knowledge of the domain area develops. Second, they
+should be evaluable, so that there are clear metrics for performance and the
+calculation of those metrics in production settings is feasible. Finally, the
+deployment should be cost-optimal as far as possible. In this paper, we propose
+that these three objectives (i.e., generalization, evaluation and
+cost-optimality) can often be relatively orthogonal and that, for large
+language models, despite their performance over conventional NLP models,
+enterprises need to carefully assess all three factors before making
+substantial investments in this technology. We propose a framework for
+generalization, evaluation and cost-modeling specifically tailored to large
+language models, offering insights into the intricacies of development,
+deployment and management of these large language models.
+
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ DiagGPT: An LLM-based Chatbot with Automatic Topic Management for + Task-Oriented Dialogue + + +
+ Large Language Models (LLMs), such as ChatGPT, are becoming increasingly +sophisticated, demonstrating capabilities that closely resemble those of +humans. These AI models are playing an essential role in assisting humans with +a wide array of tasks in daily life. A significant application of AI is its use +as a chat agent, responding to human inquiries across various domains. Current +LLMs have shown proficiency in answering general questions. However, basic +question-answering dialogue often falls short in complex diagnostic scenarios, +such as legal or medical consultations. These scenarios typically necessitate +Task-Oriented Dialogue (TOD), wherein an AI chat agent needs to proactively +pose questions and guide users towards specific task completion. Previous +fine-tuning models have underperformed in TOD, and current LLMs do not +inherently possess this capability. In this paper, we introduce DiagGPT +(Dialogue in Diagnosis GPT), an innovative method that extends LLMs to TOD +scenarios. Our experiments reveal that DiagGPT exhibits outstanding performance +in conducting TOD with users, demonstrating its potential for practical +applications. + +
+
+
+
+
+ + ☆ Using Artificial Populations to Study Psychological Phenomena in Neural + Models + + +
+ The recent proliferation of research into transformer-based natural language
+processing has led to a number of studies which attempt to detect the presence
+of human-like cognitive behavior in the models. We contend that, as is true of
+human psychology, the investigation of cognitive behavior in language models
+must be conducted in an appropriate population of an appropriate size for the
+results to be meaningful. We leverage work in uncertainty estimation in a novel
+approach to efficiently construct experimental populations. The resultant tool,
+PopulationLM, has been made open source. We provide theoretical grounding in
+the uncertainty estimation literature and motivation from current cognitive
+work regarding language models. We discuss the methodological lessons from
+other scientific communities and attempt to demonstrate their application to
+two artificial population studies. Through population-based experimentation, we
+find that language models exhibit behavior consistent with typicality effects
+among categories highly represented in training. However, we find that language
+models do not tend to exhibit structural priming effects. Generally, our
+results show that single models tend to overestimate the presence of cognitive
+behaviors in neural models.
+
+
+
+
+
+ + ☆ End-to-End Open Vocabulary Keyword Search With Multilingual Neural + Representations + + +
+ Conventional keyword search systems operate on automatic speech recognition +(ASR) outputs, which causes them to have a complex indexing and search +pipeline. This has led to interest in ASR-free approaches to simplify the +search procedure. We recently proposed a neural ASR-free keyword search model +which achieves competitive performance while maintaining an efficient and +simplified pipeline, where queries and documents are encoded with a pair of +recurrent neural network encoders and the encodings are combined with a +dot-product. In this article, we extend this work with multilingual pretraining +and detailed analysis of the model. Our experiments show that the proposed +multilingual training significantly improves the model performance and that +despite not matching a strong ASR-based conventional keyword search system for +short queries and queries comprising in-vocabulary words, the proposed model +outperforms the ASR-based system for long queries and queries that do not +appear in the training data. + +
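The dot-product combination of query and document encodings can be sketched in PyTorch as follows; the feature sizes and the choice of GRU encoders are illustrative assumptions, not the exact architecture of the article.

```python
# Dual-encoder keyword-search sketch: a query encoder and a document encoder
# produce fixed-size embeddings that are combined with a dot product.
# Feature sizes and the use of GRUs are illustrative assumptions.
import torch
import torch.nn as nn

class SeqEncoder(nn.Module):
    def __init__(self, in_dim: int, hid_dim: int = 128):
        super().__init__()
        self.rnn = nn.GRU(in_dim, hid_dim, batch_first=True, bidirectional=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        _, h = self.rnn(x)                       # h: (2, B, hid_dim)
        return torch.cat([h[0], h[1]], dim=-1)   # (B, 2 * hid_dim)

query_enc, doc_enc = SeqEncoder(40), SeqEncoder(80)
query = torch.randn(4, 20, 40)                   # e.g. phone-feature frames of the query
doc = torch.randn(4, 300, 80)                    # e.g. acoustic frames of a document window
score = (query_enc(query) * doc_enc(doc)).sum(dim=-1)   # dot-product detection score
print(torch.sigmoid(score))                      # per-pair occurrence probability
```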
+
+ comment: Accepted by IEEE/ACM Transactions on Audio, Speech and Language + Processing (TASLP), 2023 +
+
+
+
+
+ + ☆ Anaphoric Structure Emerges Between Neural Networks + + +
+ Pragmatics is core to natural language, enabling speakers to communicate +efficiently with structures like ellipsis and anaphora that can shorten +utterances without loss of meaning. These structures require a listener to +interpret an ambiguous form - like a pronoun - and infer the speaker's intended +meaning - who that pronoun refers to. Despite potential to introduce ambiguity, +anaphora is ubiquitous across human language. In an effort to better understand +the origins of anaphoric structure in natural language, we look to see if +analogous structures can emerge between artificial neural networks trained to +solve a communicative task. We show that: first, despite the potential for +increased ambiguity, languages with anaphoric structures are learnable by +neural models. Second, anaphoric structures emerge between models 'naturally' +without need for additional constraints. Finally, introducing an explicit +efficiency pressure on the speaker increases the prevalence of these +structures. We conclude that certain pragmatic structures straightforwardly +emerge between neural networks, without explicit efficiency pressures, but that +the competing needs of speakers and listeners conditions the degree and nature +of their emergence. + +
+
+ comment: Published as a conference paper at the Annual Meeting of the + Cognitive Science Society 2023: 6 Pages, 3 Figures, code available at + https://github.com/hcoxec/emerge +
+
+
+
+
+ + ☆ "Beware of deception": Detecting Half-Truth and Debunking it through + Controlled Claim Editing + + +
+ The prevalence of half-truths, which are statements containing some truth but +that are ultimately deceptive, has risen with the increasing use of the +internet. To help combat this problem, we have created a comprehensive pipeline +consisting of a half-truth detection model and a claim editing model. Our +approach utilizes the T5 model for controlled claim editing; "controlled" here +means precise adjustments to select parts of a claim. Our methodology achieves +an average BLEU score of 0.88 (on a scale of 0-1) and a disinfo-debunk score of +85% on edited claims. Significantly, our T5-based approach outperforms other +Language Models such as GPT2, RoBERTa, PEGASUS, and Tailor, with average +improvements of 82%, 57%, 42%, and 23% in disinfo-debunk scores, respectively. +By extending the LIAR PLUS dataset, we achieve an F1 score of 82% for the +half-truth detection model, setting a new benchmark in the field. While +previous attempts have been made at half-truth detection, our approach is, to +the best of our knowledge, the first to attempt to debunk half-truths. + +
+
+
+
+
+ + ☆ MultiSChuBERT: Effective Multimodal Fusion for Scholarly Document + Quality Prediction + + +
+ Automatic assessment of the quality of scholarly documents is a difficult
+task with high potential impact. Multimodality, in particular the addition of
+visual information next to text, has been shown to improve the performance on
+scholarly document quality prediction (SDQP) tasks. We propose the multimodal
+predictive model MultiSChuBERT. It combines a textual model based on chunking
+full paper text and aggregating computed BERT chunk-encodings (SChuBERT), with
+a visual model based on Inception V3. Our work contributes to the current
+state-of-the-art in SDQP in three ways. First, we show that the method of
+combining visual and textual embeddings can substantially influence the
+results. Second, we demonstrate that gradual unfreezing of the weights of the
+visual sub-model reduces its tendency to overfit the data, improving results.
+Third, we show the retained benefit of multimodality when replacing standard
+BERT$_{\textrm{BASE}}$ embeddings with more recent state-of-the-art text
+embedding models.
+ Using BERT$_{\textrm{BASE}}$ embeddings, on the (log) number of citations
+prediction task with the ACL-BiblioMetry dataset, our MultiSChuBERT
+(text+visual) model obtains an $R^{2}$ score of 0.454 compared to 0.432 for the
+SChuBERT (text only) model. Similar improvements are obtained on the PeerRead
+accept/reject prediction task. In our experiments using SciBERT, scincl,
+SPECTER and SPECTER2.0 embeddings, we show that each of these tailored
+embeddings adds further improvements over the standard BERT$_{\textrm{BASE}}$
+embeddings, with the SPECTER2.0 embeddings performing best.
+
+
+
+
+
+ + ☆ Teach LLMs to Personalize -- An Approach inspired by Writing Education + + +
+ Personalized text generation is an emerging research area that has attracted +much attention in recent years. Most studies in this direction focus on a +particular domain by designing bespoke features or models. In this work, we +propose a general approach for personalized text generation using large +language models (LLMs). Inspired by the practice of writing education, we +develop a multistage and multitask framework to teach LLMs for personalized +generation. In writing instruction, the task of writing from sources is often +decomposed into multiple steps that involve finding, evaluating, summarizing, +synthesizing, and integrating information. Analogously, our approach to +personalized text generation consists of multiple stages: retrieval, ranking, +summarization, synthesis, and generation. In addition, we introduce a multitask +setting that helps the model improve its generation ability further, which is +inspired by the observation in education that a student's reading proficiency +and writing ability are often correlated. We evaluate our approach on three +public datasets, each of which covers a different and representative domain. +Our results show significant improvements over a variety of baselines. + +
+
+
+
+
+ + ☆ Improving CTC-AED model with integrated-CTC and auxiliary loss + regularization + + +
+ Connectionist temporal classification (CTC) and attention-based encoder +decoder (AED) joint training has been widely applied in automatic speech +recognition (ASR). Unlike most hybrid models that separately calculate the CTC +and AED losses, our proposed integrated-CTC utilizes the attention mechanism of +AED to guide the output of CTC. In this paper, we employ two fusion methods, +namely direct addition of logits (DAL) and preserving the maximum probability +(PMP). We achieve dimensional consistency by adaptively affine transforming the +attention results to match the dimensions of CTC. To accelerate model +convergence and improve accuracy, we introduce auxiliary loss regularization +for accelerated convergence. Experimental results demonstrate that the DAL +method performs better in attention rescoring, while the PMP method excels in +CTC prefix beam search and greedy search. + +
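The direct addition of logits (DAL) fusion can be sketched as below; the affine projection matches the attention output to the CTC vocabulary size as described, while the shapes and the assumption that the two streams are time-aligned are simplifications for illustration.

```python
# Sketch of the DAL fusion: project the AED attention output to the CTC vocabulary
# dimension with an affine layer and add it to the CTC logits. The shapes and the
# time alignment of the two streams are illustrative assumptions.
import torch
import torch.nn as nn

class DALFusion(nn.Module):
    def __init__(self, attn_dim: int = 256, ctc_vocab: int = 5000):
        super().__init__()
        self.affine = nn.Linear(attn_dim, ctc_vocab)   # adaptive affine transform

    def forward(self, ctc_logits: torch.Tensor, attn_out: torch.Tensor) -> torch.Tensor:
        # ctc_logits: (B, T, ctc_vocab); attn_out: (B, T, attn_dim), assumed aligned
        return ctc_logits + self.affine(attn_out)      # attention guides the CTC output

fusion = DALFusion()
print(fusion(torch.randn(2, 50, 5000), torch.randn(2, 50, 256)).shape)
```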
+
+
+
+
+ + ♻ ☆ GripRank: Bridging the Gap between Retrieval and Generation via the + Generative Knowledge Improved Passage Ranking CIKM 2023 + + +
+ Retrieval-enhanced text generation has shown remarkable progress on
+knowledge-intensive language tasks, such as open-domain question answering and
+knowledge-enhanced dialogue generation, by leveraging passages retrieved from a
+large passage corpus for delivering a proper answer given the input query.
+However, the retrieved passages are not ideal for guiding answer generation
+because of the discrepancy between retrieval and generation, i.e., the
+candidate passages are all treated equally during the retrieval procedure
+without considering their potential to generate a proper answer. This
+discrepancy makes a passage retriever deliver a sub-optimal collection of
+candidate passages to generate the answer. In this paper, we propose the
+GeneRative Knowledge Improved Passage Ranking (GripRank) approach, addressing
+the above challenge by distilling knowledge from a generative passage estimator
+(GPE) to a passage ranker, where the GPE is a generative language model used to
+measure how likely the candidate passages are to generate the proper answer. We
+realize the distillation procedure by teaching the passage ranker to learn the
+passage ranking given by the GPE. Furthermore, we improve the distillation
+quality by devising a curriculum knowledge distillation mechanism, which allows
+the knowledge provided by the GPE to be progressively distilled into the ranker
+through an easy-to-hard curriculum, enabling the passage ranker to correctly
+recognize the provenance of the answer from many plausible candidates. We
+conduct extensive experiments on four datasets across three knowledge-intensive
+language tasks. Experimental results show advantages over the state-of-the-art
+methods for both passage ranking and answer generation on the KILT benchmark.
+
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Stack More Layers Differently: High-Rank Training Through Low-Rank + Updates + + +
+ Despite the dominance and effectiveness of scaling, resulting in large +networks with hundreds of billions of parameters, the necessity to train +overparametrized models remains poorly understood, and alternative approaches +do not necessarily make it cheaper to train high-performance models. In this +paper, we explore low-rank training techniques as an alternative approach to +training large neural networks. We introduce a novel method called ReLoRA, +which utilizes low-rank updates to train high-rank networks. We apply ReLoRA to +pre-training transformer language models with up to 350M parameters and +demonstrate comparable performance to regular neural network training. +Furthermore, we observe that the efficiency of ReLoRA increases with model +size, making it a promising approach for training multi-billion-parameter +networks efficiently. Our findings shed light on the potential of low-rank +training techniques and their implications for scaling laws. + +
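The merge-and-restart idea behind low-rank updates can be sketched in a few lines of PyTorch; the rank, the reset schedule, and the omission of optimizer-state resets are illustrative simplifications, not the released ReLoRA implementation.

```python
# Sketch of periodic low-rank updates: train small factors A and B, fold B @ A into
# the frozen full-rank weight, then re-initialise the factors. Rank, schedule, and
# optimizer-reset details are illustrative assumptions.
import torch
import torch.nn as nn

class LowRankLinear(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, rank: int = 8):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_dim, in_dim) * 0.02,
                                   requires_grad=False)       # frozen full-rank weight
        self.A = nn.Parameter(torch.randn(rank, in_dim) * 0.02)  # trainable factor
        self.B = nn.Parameter(torch.zeros(out_dim, rank))         # trainable factor

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x @ (self.weight + self.B @ self.A).t()

    @torch.no_grad()
    def merge_and_restart(self):
        self.weight += self.B @ self.A    # fold the low-rank update into the base weight
        self.A.normal_(std=0.02)          # restart the factors for the next phase
        self.B.zero_()

layer = LowRankLinear(64, 64)
y = layer(torch.randn(4, 64))
layer.merge_and_restart()                 # called every few thousand steps in practice
```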
+
+
+
+
+ + ♻ ☆ PoetryDiffusion: Towards Joint Semantic and Metrical Manipulation in + Poetry Generation + + +
+ Controllable text generation is a challenging and meaningful field in natural
+language generation (NLG). In particular, poetry generation is a typical case
+with well-defined and strict conditions for text generation, which makes it an
+ideal playground for the assessment of current methodologies. While prior works
+succeeded in controlling either semantic or metrical aspects of poetry
+generation, simultaneously addressing both remains a challenge. In this paper,
+we pioneer the use of the Diffusion model for generating sonnets and Chinese
+SongCi poetry to tackle such challenges. In terms of semantics, our
+PoetryDiffusion model, built upon the Diffusion model, generates entire
+sentences or poems by comprehensively considering the entirety of sentence
+information. This approach enhances semantic expression, distinguishing it from
+autoregressive and large language models (LLMs). For metrical control, the
+separation feature of diffusion generation and its constraint control module
+enable us to flexibly incorporate a novel metrical controller to manipulate and
+evaluate metrics (format and rhythm). The denoising process in PoetryDiffusion
+allows for gradual enhancement of semantics and flexible integration of the
+metrical controller, which can calculate and impose penalties on states that
+stray significantly from the target control distribution. Experimental results
+on two datasets demonstrate that our model outperforms existing models in
+automatic evaluation of semantic, metrical, and overall performance as well as
+in human evaluation.
+
+
+ comment: 9 Pages +
+
+
+
+
+ + ♻ ☆ A Framework For Refining Text Classification and Object Recognition from + Academic Articles + + +
+ With the widespread use of the internet, it has become increasingly crucial
+to efficiently extract specific information from vast amounts of academic
+articles. Data mining techniques are generally employed to solve this issue.
+However, data mining for academic articles is challenging since it requires
+automatically extracting specific patterns from documents with complex and
+unstructured layouts. Current data mining methods for academic articles employ
+rule-based (RB) or machine learning (ML) approaches. However, rule-based
+methods incur a high coding cost for articles with complex typesetting. On the
+other hand, simply using machine learning methods requires annotation work for
+complex content types within the paper, which can be costly. Furthermore, using
+machine learning alone can lead to cases where patterns easily recognized by
+rule-based methods are mistakenly extracted. To overcome these issues, we
+emphasize applying methods tailored to the specific characteristics of academic
+articles, based on an analysis of the standard layout and typesetting used in
+the targeted publication. We have developed a novel Text Block Refinement
+Framework (TBRF), a hybrid of machine learning and rule-based schemes. We used
+the well-known ACL proceedings articles as experimental data for the validation
+experiment. The experiment shows that our approach achieved over 95%
+classification accuracy and 90% detection accuracy for tables and figures.
+
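+ A minimal sketch of the hybrid idea of combining high-precision layout rules
+with a learned fallback classifier; the block fields, rules, and thresholds
+below are illustrative assumptions, not the TBRF rule set:
+
+def classify_block(block, ml_classifier):
+    """Label a text block extracted from an article page. `block` is assumed
+    to be a dict with 'text' and 'font_size' keys; `ml_classifier` is any
+    trained model mapping a block to a label."""
+    text = block["text"].strip()
+    if text.lower().startswith(("table", "figure")) and len(text) < 120:
+        return "caption"                     # cheap, high-precision layout rule
+    if block.get("font_size", 0) > 14 and len(text.split()) < 15:
+        return "heading"                     # another simple typesetting rule
+    return ml_classifier(block)              # fall back to the learned model
+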
+
+ comment: This paper has been accepted at 'The International Symposium on + Innovations in Intelligent Systems and Applications 2023 (INISTA 2023)' +
+
+
+
+
+ + ♻ ☆ ANTONIO: Towards a Systematic Method of Generating NLP Benchmarks for + Verification + + +
+ Verification of machine learning models used in Natural Language Processing +(NLP) is known to be a hard problem. In particular, many known neural network +verification methods that work for computer vision and other numeric datasets +do not work for NLP. Here, we study technical reasons that underlie this +problem. Based on this analysis, we propose practical methods and heuristics +for preparing NLP datasets and models in a way that renders them amenable to +known verification methods based on abstract interpretation. We implement these +methods as a Python library called ANTONIO that links to the neural network +verifiers ERAN and Marabou. We perform evaluation of the tool using an NLP +dataset R-U-A-Robot suggested as a benchmark for verifying legally critical NLP +applications. We hope that, thanks to its general applicability, this work will +open novel possibilities for including NLP verification problems into neural +network verification competitions, and will popularise NLP problems within this +community. + +
+
+ comment: To appear in proceedings of 6th Workshop on Formal Methods for + ML-Enabled Autonomous Systems (Affiliated with CAV 2023) +
+
+
+
+
+ + ♻ ☆ BatGPT: A Bidirectional Autoregressive Talker from Generative Pre-trained Transformer + + +
+ BatGPT is a large-scale language model designed and trained jointly by Wuhan
+University and Shanghai Jiao Tong University. It is capable of generating
+highly natural and fluent text in response to various types of input, including
+text prompts, images, and audio. At the modeling level, we employ a
+bidirectional autoregressive architecture that allows the model to efficiently
+capture the complex dependencies of natural language, making it highly
+effective in tasks such as language generation, dialog systems, and question
+answering. Moreover, the bidirectional autoregressive modeling not only
+operates from left to right but also from right to left, effectively reducing
+fixed memory effects and alleviating model hallucinations.
+ For training, we propose a novel parameter expansion method that leverages
+the pre-training of smaller models, and we employ reinforcement learning from
+both AI and human feedback, aimed at improving the model's alignment
+performance. Overall, these approaches significantly improve the effectiveness
+of BatGPT, and the model can be utilized for a wide range of natural language
+applications.
+
+
+
+
+
+ + ♻ ☆ SuS-X: Training-Free Name-Only Transfer of Vision-Language Models ICCV2023 + + +
+ Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet +effective way to train large-scale vision-language models. CLIP demonstrates +impressive zero-shot classification and retrieval on diverse downstream tasks. +However, to leverage its full potential, fine-tuning still appears to be +necessary. Fine-tuning the entire CLIP model can be resource-intensive and +unstable. Moreover, recent methods that aim to circumvent this need for +fine-tuning still require access to images from the target distribution. In +this paper, we pursue a different approach and explore the regime of +training-free "name-only transfer" in which the only knowledge we possess about +the downstream task comprises the names of downstream target categories. We +propose a novel method, SuS-X, consisting of two key building blocks -- SuS and +TIP-X, that requires neither intensive fine-tuning nor costly labelled data. +SuS-X achieves state-of-the-art zero-shot classification results on 19 +benchmark datasets. We further show the utility of TIP-X in the training-free +few-shot setting, where we again achieve state-of-the-art results over strong +training-free baselines. Code is available at +https://github.com/vishaal27/SuS-X. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ LaFiCMIL: Rethinking Large File Classification from the Perspective of + Correlated Multiple Instance Learning + + +
+ Transformer-based models, such as BERT, have revolutionized various language +tasks, but still struggle with large file classification due to their input +limit (e.g., 512 tokens). Despite several attempts to alleviate this +limitation, no method consistently excels across all benchmark datasets, +primarily because they can only extract partial essential information from the +input file. Additionally, they fail to adapt to the varied properties of +different types of large files. In this work, we tackle this problem from the +perspective of correlated multiple instance learning. The proposed approach, +LaFiCMIL, serves as a versatile framework applicable to various large file +classification tasks covering binary, multi-class, and multi-label +classification tasks, spanning various domains including Natural Language +Processing, Programming Language Processing, and Android Analysis. To evaluate +its effectiveness, we employ eight benchmark datasets pertaining to Long +Document Classification, Code Defect Detection, and Android Malware Detection. +Leveraging BERT-family models as feature extractors, our experimental results +demonstrate that LaFiCMIL achieves new state-of-the-art performance across all +benchmark datasets. This is largely attributable to its capability of scaling +BERT up to nearly 20K tokens, running on a single Tesla V-100 GPU with 32G of +memory. + +
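+ A minimal sketch of the multiple-instance-learning view of large file
+classification: the file is split into encoder-sized chunks, each chunk is
+embedded by a BERT-family model, and a small head aggregates the chunk
+embeddings into one bag-level prediction. The gated attention pooling below is
+a common MIL aggregator used for illustration and is not the exact LaFiCMIL
+architecture:
+
+import torch
+import torch.nn as nn
+
+class AttentionMILHead(nn.Module):
+    """Aggregate per-chunk [CLS] embeddings into a single file-level prediction."""
+    def __init__(self, hidden=768, num_classes=2):
+        super().__init__()
+        self.attn = nn.Sequential(nn.Linear(hidden, 128), nn.Tanh(), nn.Linear(128, 1))
+        self.cls = nn.Linear(hidden, num_classes)
+
+    def forward(self, chunk_embeddings):             # (num_chunks, hidden)
+        weights = torch.softmax(self.attn(chunk_embeddings), dim=0)
+        bag = (weights * chunk_embeddings).sum(dim=0)
+        return self.cls(bag)                          # (num_classes,)
+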
+
+ comment: 12 pages; update results; manuscript revision +
+
+
+
+
+ + ♻ ☆ Large Language Models for Information Retrieval: A Survey + + +
+ As a primary means of information acquisition, information retrieval (IR) +systems, such as search engines, have integrated themselves into our daily +lives. These systems also serve as components of dialogue, question-answering, +and recommender systems. The trajectory of IR has evolved dynamically from its +origins in term-based methods to its integration with advanced neural models. +While the neural models excel at capturing complex contextual signals and +semantic nuances, thereby reshaping the IR landscape, they still face +challenges such as data scarcity, interpretability, and the generation of +contextually plausible yet potentially inaccurate responses. This evolution +requires a combination of both traditional methods (such as term-based sparse +retrieval methods with rapid response) and modern neural architectures (such as +language models with powerful language understanding capacity). Meanwhile, the +emergence of large language models (LLMs), typified by ChatGPT and GPT-4, has +revolutionized natural language processing due to their remarkable language +understanding, generation, generalization, and reasoning abilities. +Consequently, recent research has sought to leverage LLMs to improve IR +systems. Given the rapid evolution of this research trajectory, it is necessary +to consolidate existing methodologies and provide nuanced insights through a +comprehensive overview. In this survey, we delve into the confluence of LLMs +and IR systems, including crucial aspects such as query rewriters, retrievers, +rerankers, and readers. Additionally, we explore promising directions within +this expanding field. + +
+
+
+
+
+ + ♻ ☆ PromptStyler: Prompt-driven Style Generation for Source-free Domain + Generalization ICCV 2023 + + +
+ In a joint vision-language space, a text feature (e.g., from "a photo of a +dog") could effectively represent its relevant image features (e.g., from dog +photos). Also, a recent study has demonstrated the cross-modal transferability +phenomenon of this joint space. From these observations, we propose +PromptStyler which simulates various distribution shifts in the joint space by +synthesizing diverse styles via prompts without using any images to deal with +source-free domain generalization. The proposed method learns to generate a +variety of style features (from "a S* style of a") via learnable style word +vectors for pseudo-words S*. To ensure that learned styles do not distort +content information, we force style-content features (from "a S* style of a +[class]") to be located nearby their corresponding content features (from +"[class]") in the joint vision-language space. After learning style word +vectors, we train a linear classifier using synthesized style-content features. +PromptStyler achieves the state of the art on PACS, VLCS, OfficeHome and +DomainNet, even though it does not require any images for training. + +
+
+ comment: Accepted to ICCV 2023, Project Page: https://promptstyler.github.io/ +
+
+
+
+
+ + ♻ ☆ SynJax: Structured Probability Distributions for JAX + + +
+ The development of deep learning software libraries enabled significant
+progress in the field by allowing users to focus on modeling, while letting the
+library take care of the tedious and time-consuming task of optimizing
+execution for modern hardware accelerators. However, this has benefited only
+particular types of deep learning models, such as Transformers, whose
+primitives map easily to vectorized computation. Models that explicitly
+account for structured objects, such as trees and segmentations, did not
+benefit equally because they require custom algorithms that are difficult to
+implement in a vectorized form.
+ SynJax directly addresses this problem by providing an efficient vectorized
+implementation of inference algorithms for structured distributions covering
+alignment, tagging, segmentation, constituency trees and spanning trees. With
+SynJax we can build large-scale differentiable models that explicitly model
+structure in the data. The code is available at
+https://github.com/deepmind/synjax.
+
+
+
+
+
+ + ♻ ☆ SGL-PT: A Strong Graph Learner with Graph Prompt Tuning + + +
+ Recently, much effort has been devoted to designing graph self-supervised
+methods that yield generalized pre-trained models, which are then adapted to
+downstream tasks through fine-tuning. However, there exists an inherent gap
+between pretext and downstream graph tasks, which limits how fully the ability
+of pre-trained models can be exploited and can even lead to negative transfer.
+Meanwhile, prompt tuning has seen emerging success in natural language
+processing by aligning pre-training and fine-tuning with consistent training
+objectives. In this paper, we identify the challenges for graph prompt tuning:
+the first is the lack of a strong and universal pre-training task across the
+diverse pre-training methods in the graph domain; the second lies in the
+difficulty of designing a consistent training objective for both pre-training
+and downstream tasks. To overcome the above obstacles, we propose a novel
+framework named SGL-PT which follows the learning strategy ``Pre-train, Prompt,
+and Predict''. Specifically, we introduce a strong and universal pre-training
+task, coined SGL, that combines the complementary merits of generative and
+contrastive self-supervised graph learning. Aiming at the graph classification
+task, we unify pre-training and fine-tuning by designing a novel
+verbalizer-free prompting function, which reformulates the downstream task in
+the same format as the pretext task. Empirical results show that our method
+surpasses other baselines under the unsupervised setting, and our prompt tuning
+method greatly improves models on biological datasets compared to fine-tuning
+methods.
+
+
+
+
+
+ + ♻ ☆ #InsTag: Instruction Tagging for Analyzing Supervised Fine-tuning of + Large Language Models + + +
+ Foundation language models obtain the instruction-following ability through +supervised fine-tuning (SFT). Diversity and complexity are considered critical +factors of a successful SFT dataset, while their definitions remain obscure and +lack quantitative analyses. In this work, we propose InsTag, an open-set +fine-grained tagger, to tag samples within SFT datasets based on semantics and +intentions and define instruction diversity and complexity regarding tags. We +obtain 6.6K tags to describe comprehensive user queries. Then we analyze +popular open-sourced SFT datasets and find that the model ability grows with +more diverse and complex data. Based on this observation, we propose a data +selector based on InsTag to select 6K diverse and complex samples from +open-source datasets and fine-tune models on InsTag-selected data. The +resulting models, TagLM, outperform open-source models based on considerably +larger SFT data evaluated by MT-Bench, echoing the importance of query +diversity and complexity. We open-source InsTag in +https://github.com/OFA-Sys/InsTag. + +
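+ A minimal sketch of a tag-driven data selector in the spirit described above,
+where complexity is approximated by the number of tags per query and diversity
+by coverage of unseen tags; the two-pass greedy procedure is an illustrative
+assumption, not the exact InsTag selector:
+
+def select_samples(samples, budget=6000):
+    """Each sample is assumed to be a dict with a set of tags under 'tags'."""
+    chosen, covered = [], set()
+    pool = sorted(samples, key=lambda s: len(s["tags"]), reverse=True)  # complex first
+    # Pass 1 (diversity): take complex samples that contribute unseen tags.
+    for s in pool:
+        if len(chosen) >= budget:
+            return chosen
+        if s["tags"] - covered:
+            chosen.append(s)
+            covered |= s["tags"]
+    # Pass 2 (complexity): fill any remaining budget with the most complex leftovers.
+    for s in pool:
+        if len(chosen) >= budget:
+            break
+        if s not in chosen:
+            chosen.append(s)
+    return chosen
+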
+
+
+
+
+ + ♻ ☆ MC-DRE: Multi-Aspect Cross Integration for Drug Event/Entity Extraction CIKM 2023 + + +
+ Extracting meaningful drug-related information chunks, such as adverse drug
+events (ADE), is crucial for preventing morbidity and saving many lives. Most
+ADEs are reported in unstructured conversations within a medical context, so a
+general entity recognition approach is not sufficient. In this paper, we
+propose a new multi-aspect cross-integration framework for drug entity/event
+detection that captures and aligns different context, language, and knowledge
+properties from drug-related documents. We first construct multi-aspect
+encoders to describe semantic, syntactic, and medical document contextual
+information by conducting three slot tagging tasks: main drug entity/event
+detection, part-of-speech tagging, and general medical named entity
+recognition. Then, each encoder cross-integrates with the other contextual
+information in three ways: key-value cross, attention cross, and feedforward
+cross, so that the multiple encoders are integrated in depth. Our model
+outperforms all SOTA methods on two widely used tasks, flat entity detection
+and discontinuous event extraction.
+
+
+ comment: Accepted at CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Style Over Substance: Evaluation Biases for Large Language Models + + +
+ As large language models (LLMs) continue to advance, accurately and
+comprehensively evaluating their performance becomes increasingly challenging.
+Human evaluations are conventionally considered the gold standard in natural
+language generation, but recent advancements incorporate state-of-the-art LLMs
+as proxies for human judges in evaluation processes. However, the extent to
+which humans and LLMs are capable evaluators remains uncertain. This study
+investigates the behavior of crowd-sourced and expert annotators, as well as
+LLMs, when comparing outputs from different models. To achieve this, we curate
+a dataset of intentionally flawed machine-generated answers. Our findings
+reveal a concerning bias in the evaluation process, as answers with factual
+errors are rated more favorably than answers that are too short or contain
+grammatical errors. To address this issue, we propose independently evaluating
+machine-generated text across multiple dimensions, rather than merging all the
+evaluation aspects into a single score. We instantiate this idea with the Elo
+rating system, resulting in the Multi-Elo Rating System. Empirical results from
+our study reveal that this proposed approach significantly enhances the quality
+of LLM-based evaluations, particularly in terms of factual accuracy. However,
+there is no significant improvement in crowd-sourced-based evaluations,
+indicating the need for further investigation and refinement.
+
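+ A minimal sketch of keeping an independent Elo rating per evaluation
+dimension and updating only the dimension that was judged in a given pairwise
+comparison; the K-factor, initial rating, and dimension names are illustrative
+assumptions rather than the exact Multi-Elo Rating System implementation:
+
+def elo_update(rating_a, rating_b, outcome, k=32):
+    """Standard Elo update; outcome is 1.0 if A wins, 0.0 if B wins, 0.5 for a tie."""
+    expected_a = 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400))
+    delta = k * (outcome - expected_a)
+    return rating_a + delta, rating_b - delta
+
+def multi_elo_update(ratings, model_a, model_b, dimension, outcome):
+    """`ratings` maps a dimension (e.g. 'accuracy', 'helpfulness', 'language')
+    to a dict of per-model ratings; only the judged dimension is updated."""
+    table = ratings.setdefault(dimension, {})
+    ra, rb = table.get(model_a, 1000.0), table.get(model_b, 1000.0)
+    table[model_a], table[model_b] = elo_update(ra, rb, outcome)
+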
+
+ comment: Work in progress, 17 pages, 4 tables, 12 figures +
+
+
+
+
+ + ♻ ☆ Adaptive Contextual Biasing for Transducer Based Streaming Speech + Recognition + + +
+ By incorporating additional contextual information, deep biasing methods have +emerged as a promising solution for speech recognition of personalized words. +However, for real-world voice assistants, always biasing on such personalized +words with high prediction scores can significantly degrade the performance of +recognizing common words. To address this issue, we propose an adaptive +contextual biasing method based on Context-Aware Transformer Transducer (CATT) +that utilizes the biased encoder and predictor embeddings to perform streaming +prediction of contextual phrase occurrences. Such prediction is then used to +dynamically switch the bias list on and off, enabling the model to adapt to +both personalized and common scenarios. Experiments on Librispeech and internal +voice assistant datasets show that our approach can achieve up to 6.7% and +20.7% relative reduction in WER and CER compared to the baseline respectively, +mitigating up to 96.7% and 84.9% of the relative WER and CER increase for +common cases. Furthermore, our approach has a minimal performance impact in +personalized scenarios while maintaining a streaming inference pipeline with +negligible RTF increase. + +
+
+
+
+
+ + ♻ ☆ Thresh: A Unified, Customizable and Deployable Platform for Fine-Grained + Text Evaluation + + +
+ Fine-grained, span-level human evaluation has emerged as a reliable and +robust method for evaluating text generation tasks such as summarization, +simplification, machine translation and news generation, and the derived +annotations have been useful for training automatic metrics and improving +language models. However, existing annotation tools implemented for these +evaluation frameworks lack the adaptability to be extended to different domains +or languages, or modify annotation settings according to user needs. And the +absence of a unified annotated data format inhibits the research in multi-task +learning. In this paper, we introduce Thresh, a unified, customizable and +deployable platform for fine-grained evaluation. By simply creating a YAML +configuration file, users can build and test an annotation interface for any +framework within minutes -- all in one web browser window. To facilitate +collaboration and sharing, Thresh provides a community hub that hosts a +collection of fine-grained frameworks and corresponding annotations made and +collected by the community, covering a wide range of NLP tasks. For deployment, +Thresh offers multiple options for any scale of annotation projects from small +manual inspections to large crowdsourcing ones. Additionally, we introduce a +Python library to streamline the entire process from typology design and +deployment to annotation processing. Thresh is publicly accessible at +https://thresh.tools. + +
+
+
+
+
+ + ♻ ☆ Deep Learning-Based Knowledge Injection for Metaphor Detection: A + Comprehensive Review + + +
+ The history of metaphor research also marks the evolution of knowledge
+infusion research. With the continued advancement of deep learning techniques
+in recent years, the natural language processing community has shown great
+interest in injecting knowledge to achieve strong results on metaphor
+recognition tasks. Although the number of approaches involving knowledge
+injection in the field of metaphor recognition has gradually increased, there
+is still no complete review of knowledge injection-based approaches. Therefore,
+the goal of this paper is to provide a comprehensive review of research
+advances in the application of deep learning for knowledge injection in
+metaphor recognition tasks. In this paper, we systematically summarize and
+generalize the mainstream knowledge and knowledge injection principles, and
+review the datasets, evaluation metrics, and benchmark models used in metaphor
+recognition tasks. Finally, we explore the current issues facing knowledge
+injection methods and provide an outlook on future research directions.
+
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ LLM Self Defense: By Self Examination, LLMs Know They Are Being Tricked + + +
+ Large language models (LLMs) have skyrocketed in popularity in recent years +due to their ability to generate high-quality text in response to human +prompting. However, these models have been shown to have the potential to +generate harmful content in response to user prompting (e.g., giving users +instructions on how to commit crimes). There has been a focus in the literature +on mitigating these risks, through methods like aligning models with human +values through reinforcement learning. However, it has been shown that even +aligned language models are susceptible to adversarial attacks that bypass +their restrictions on generating harmful text. We propose a simple approach to +defending against these attacks by having a large language model filter its own +responses. Our current results show that even if a model is not fine-tuned to +be aligned with human values, it is possible to stop it from presenting harmful +content to users by validating the content using a language model. + +
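+ A minimal sketch of the self-filtering idea described above: after a response
+is generated, a language model (possibly the same one) is asked whether the
+text is harmful, and the response is withheld if so. The `generate` callable,
+the filtering prompt, and the refusal message are illustrative assumptions, not
+the paper's exact wording:
+
+def respond_with_self_defense(generate, user_prompt):
+    """`generate(prompt) -> str` stands in for any LLM completion call."""
+    draft = generate(user_prompt)
+    verdict = generate(
+        "Does the following text contain harmful, dangerous, or illegal "
+        "content? Answer strictly 'yes' or 'no'.\n\n" + draft
+    )
+    if verdict.strip().lower().startswith("yes"):
+        return "I can't help with that."
+    return draft
+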
+
+
+
+
+ + ♻ ☆ Metacognitive Prompting Improves Understanding in Large Language Models + + +
+ In Large Language Models (LLMs), there have been consistent advancements in +task-specific performance, largely influenced by effective prompt design. While +recent research on prompting has enhanced the reasoning capabilities of LLMs, a +gap remains in further improving their understanding abilities. In this study, +we introduce Metacognitive Prompting (MP), a strategy inspired by human +introspective reasoning processes. Using MP, LLMs undergo a systematic series +of structured, self-aware evaluations, drawing on both their vast inherent +knowledge and new insights. Our experiments involve five prevalent LLMs: +Llama2, Vicuna, PaLM, GPT-3.5, and GPT-4, all of which span various general +natural language understanding (NLU) tasks from the GLUE and SuperGLUE +benchmarks. Results indicate that, although GPT-4 consistently excels in most +tasks, PaLM, when equipped with MP, approaches its performance level. +Furthermore, across models and datasets, MP consistently outperforms existing +prompting methods, including standard and chain-of-thought prompting. This +study underscores the potential to amplify the understanding abilities of LLMs +and highlights the benefits of mirroring human introspective reasoning in NLU +tasks. + +
+
+ comment: 9 pages, in submission +
+
+
+
+
+ + ♻ ☆ When Good and Reproducible Results are a Giant with Feet of Clay: The + Importance of Software Quality in NLP + + +
+ Despite its crucial role in research experiments, code correctness is often +presumed only on the basis of the perceived quality of results. This assumption +comes with the risk of erroneous outcomes and potentially misleading findings. +To address this issue, we posit that the current focus on reproducibility +should go hand in hand with the emphasis on software quality. We present a case +study in which we identify and fix three bugs in widely used implementations of +the state-of-the-art Conformer architecture. Through experiments on speech +recognition and translation in various languages, we demonstrate that the +presence of bugs does not prevent the achievement of good and reproducible +results, which however can lead to incorrect conclusions that potentially +misguide future research. As a countermeasure, we propose a Code-quality +Checklist and release pangoliNN, a library dedicated to testing neural models, +with the goal of promoting coding best practices and improving research +software quality within the NLP community. + +
+
+
+
+
+ + ♻ ☆ Event and Entity Extraction from Generated Video Captions + + +
+ Annotation of multimedia data by humans is time-consuming and costly, while +reliable automatic generation of semantic metadata is a major challenge. We +propose a framework to extract semantic metadata from automatically generated +video captions. As metadata, we consider entities, the entities' properties, +relations between entities, and the video category. We employ two +state-of-the-art dense video captioning models with masked transformer (MT) and +parallel decoding (PVDC) to generate captions for videos of the ActivityNet +Captions dataset. Our experiments show that it is possible to extract entities, +their properties, relations between entities, and the video category from the +generated captions. We observe that the quality of the extracted information is +mainly influenced by the quality of the event localization in the video as well +as the performance of the event caption generation. + +
+
+ comment: Paper accepted at CD-MAKE 2023 +
+
+
+
+
+ + ♻ ☆ Probing Quantifier Comprehension in Large Language Models: Another + Example of Inverse Scaling + + +
+ With their increasing size, large language models (LLMs) are becoming
+increasingly good at language understanding tasks. But even with high
+performance on specific downstream tasks, LLMs fail at simple linguistic tests
+for negation or quantifier understanding. Previous work on quantifier
+understanding in LLMs shows inverse scaling in understanding few-type
+quantifiers. In this paper, we question the claims of previous work and show
+that they are a result of inappropriate testing methodology. We also present
+alternate methods to measure quantifier comprehension in LLMs and show that
+LLMs are able to better understand the difference between the meaning of
+few-type and most-type quantifiers as their size increases, although they are
+not particularly good at it. We also observe inverse scaling for most-type
+quantifier understanding, which is contrary to human psycholinguistic
+experiments and previous work, where the model's understanding of most-type
+quantifiers gets worse as the model size increases. We perform this evaluation
+on models ranging from 125M to 175B parameters, which suggests that LLMs do not
+do as well as expected with quantifiers. We also discuss the possible reasons
+for this and the relevance of quantifier understanding in evaluating language
+understanding in LLMs.
+
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 129 + +
+
+
+ + ☆ CoDeF: Content Deformation Fields for Temporally Consistent Video + Processing + + +
+ We present the content deformation field CoDeF as a new type of video
+representation, which consists of a canonical content field aggregating the
+static contents in the entire video and a temporal deformation field recording
+the transformations from the canonical image (i.e., rendered from the canonical
+content field) to each individual frame along the time axis. Given a target
+video, these two fields are jointly optimized to reconstruct it through a
+carefully tailored rendering pipeline. We advisedly introduce some
+regularizations into the optimization process, urging the canonical content
+field to inherit semantics (e.g., the object shape) from the video. With such a
+design, CoDeF naturally supports lifting image algorithms for video processing,
+in the sense that one can apply an image algorithm to the canonical image and
+effortlessly propagate the outcomes to the entire video with the aid of the
+temporal deformation field. We experimentally show that CoDeF is able to lift
+image-to-image translation to video-to-video translation and lift keypoint
+detection to keypoint tracking without any training. More importantly, thanks
+to our lifting strategy that deploys the algorithms on only one image, we
+achieve superior cross-frame consistency in processed videos compared to
+existing video-to-video translation approaches, and even manage to track
+non-rigid objects like water and smog. Project page can be found at
+https://qiuyu96.github.io/CoDeF/.
+
+
+ comment: Project Webpage: https://qiuyu96.github.io/CoDeF/, Code: + https://github.com/qiuyu96/CoDeF +
+
+
+
+
+ + ☆ Solving Challenging Math Word Problems Using GPT-4 Code Interpreter with + Code-based Self-Verification + + +
+ Recent progress in large language models (LLMs) like GPT-4 and PaLM-2 has
+brought significant advancements in addressing math reasoning problems. In
+particular, OpenAI's latest version of GPT-4, known as GPT-4 Code Interpreter,
+shows remarkable performance on challenging math datasets. In this paper, we
+explore the effect of code on enhancing LLMs' reasoning capability by
+introducing different constraints on the Code Usage Frequency of GPT-4 Code
+Interpreter. We found that its success can be largely attributed to its
+powerful skills in generating and executing code, evaluating the output of code
+execution, and rectifying its solution when receiving unreasonable outputs.
+Based on this insight, we propose a novel and effective prompting method,
+explicit code-based self-verification (CSV), to further boost the mathematical
+reasoning potential of GPT-4 Code Interpreter. This method employs a zero-shot
+prompt on GPT-4 Code Interpreter to encourage it to use code to self-verify its
+answers. In instances where the verification state registers as "False", the
+model shall automatically amend its solution, analogous to our approach of
+rectifying errors during a mathematics examination. Furthermore, we recognize
+that the states of the verification result indicate the confidence of a
+solution, which can improve the effectiveness of majority voting. With GPT-4
+Code Interpreter and CSV, we achieve an impressive zero-shot accuracy on the
+MATH dataset (53.9% -> 84.3%).
+
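+ A minimal sketch of a verification-aware majority vote in the spirit of the
+last point above: each sampled solution carries the state returned by its
+code-based self-verification, and verified solutions contribute more to the
+vote. The state names and weights are illustrative assumptions, not the
+paper's tuned values:
+
+from collections import Counter
+
+def verification_weighted_vote(samples):
+    """`samples` is a list of (answer, verification_state) pairs, with the
+    state being 'True', 'False', or 'Uncertain'."""
+    weights = {"True": 1.0, "Uncertain": 0.5, "False": 0.1}
+    scores = Counter()
+    for answer, state in samples:
+        scores[answer] += weights.get(state, 0.5)
+    return scores.most_common(1)[0][0]
+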
+
+ comment: Solving Challenging Math Word Problems Using GPT-4 Code Interpreter + with Code-based Self-Verification +
+
+
+
+
+ + ☆ Helping Hands: An Object-Aware Ego-Centric Video Recognition Model ICCV2023 + + +
+ We introduce an object-aware decoder for improving the performance of +spatio-temporal representations on ego-centric videos. The key idea is to +enhance object-awareness during training by tasking the model to predict hand +positions, object positions, and the semantic label of the objects using paired +captions when available. At inference time the model only requires RGB frames +as inputs, and is able to track and ground objects (although it has not been +trained explicitly for this). We demonstrate the performance of the +object-aware representations learnt by our model, by: (i) evaluating it for +strong transfer, i.e. through zero-shot testing, on a number of downstream +video-text retrieval and classification benchmarks; and (ii) by using the +representations learned as input for long-term video understanding tasks (e.g. +Episodic Memory in Ego4D). In all cases the performance improves over the state +of the art -- even compared to networks trained with far larger batch sizes. We +also show that by using noisy image-level detection as pseudo-labels in +training, the model learns to provide better bounding boxes using video +consistency, as well as grounding the words in the associated text +descriptions. Overall, we show that the model can act as a drop-in replacement +for an ego-centric video model to improve performance through visual-text +grounding. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ☆ Relightable and Animatable Neural Avatar from Sparse-View Video + + +
+ This paper tackles the challenge of creating relightable and animatable +neural avatars from sparse-view (or even monocular) videos of dynamic humans +under unknown illumination. Compared to studio environments, this setting is +more practical and accessible but poses an extremely challenging ill-posed +problem. Previous neural human reconstruction methods are able to reconstruct +animatable avatars from sparse views using deformed Signed Distance Fields +(SDF) but cannot recover material parameters for relighting. While +differentiable inverse rendering-based methods have succeeded in material +recovery of static objects, it is not straightforward to extend them to dynamic +humans as it is computationally intensive to compute pixel-surface intersection +and light visibility on deformed SDFs for inverse rendering. To solve this +challenge, we propose a Hierarchical Distance Query (HDQ) algorithm to +approximate the world space distances under arbitrary human poses. +Specifically, we estimate coarse distances based on a parametric human model +and compute fine distances by exploiting the local deformation invariance of +SDF. Based on the HDQ algorithm, we leverage sphere tracing to efficiently +estimate the surface intersection and light visibility. This allows us to +develop the first system to recover animatable and relightable neural avatars +from sparse view (or monocular) inputs. Experiments demonstrate that our +approach is able to produce superior results compared to state-of-the-art +methods. Our code will be released for reproducibility. + +
+
+ comment: Project page: https://zju3dv.github.io/relightable_avatar +
+
+
+
+
+ + ☆ A Foundation LAnguage-Image model of the Retina (FLAIR): Encoding expert + knowledge in text supervision + + +
+ Foundation vision-language models are currently transforming computer vision, +and are on the rise in medical imaging fueled by their very promising +generalization capabilities. However, the initial attempts to transfer this new +paradigm to medical imaging have shown less impressive performances than those +observed in other domains, due to the significant domain shift and the complex, +expert domain knowledge inherent to medical-imaging tasks. Motivated by the +need for domain-expert foundation models, we present FLAIR, a pre-trained +vision-language model for universal retinal fundus image understanding. To this +end, we compiled 37 open-access, mostly categorical fundus imaging datasets +from various sources, with up to 97 different target conditions and 284,660 +images. We integrate the expert's domain knowledge in the form of descriptive +textual prompts, during both pre-training and zero-shot inference, enhancing +the less-informative categorical supervision of the data. Such a textual +expert's knowledge, which we compiled from the relevant clinical literature and +community standards, describes the fine-grained features of the pathologies as +well as the hierarchies and dependencies between them. We report comprehensive +evaluations, which illustrate the benefit of integrating expert knowledge and +the strong generalization capabilities of FLAIR under difficult scenarios with +domain shifts or unseen categories. When adapted with a lightweight linear +probe, FLAIR outperforms fully-trained, dataset-focused models, more so in the +few-shot regimes. Interestingly, FLAIR outperforms by a large margin more +generalist, larger-scale image-language models, which emphasizes the potential +of embedding experts' domain knowledge and the limitations of generalist models +in medical imaging. + +
+
+ comment: The pre-trained model is available at: + https://github.com/jusiro/FLAIR +
+
+
+
+
+ + ☆ Memory-and-Anticipation Transformer for Online Action Understanding ICCV 2023 + + +
+ Most existing forecasting systems are memory-based methods, which attempt to +mimic human forecasting ability by employing various memory mechanisms and have +progressed in temporal modeling for memory dependency. Nevertheless, an obvious +weakness of this paradigm is that it can only model limited historical +dependence and can not transcend the past. In this paper, we rethink the +temporal dependence of event evolution and propose a novel +memory-anticipation-based paradigm to model an entire temporal structure, +including the past, present, and future. Based on this idea, we present +Memory-and-Anticipation Transformer (MAT), a memory-anticipation-based +approach, to address the online action detection and anticipation tasks. In +addition, owing to the inherent superiority of MAT, it can process online +action detection and anticipation tasks in a unified manner. The proposed MAT +model is tested on four challenging benchmarks TVSeries, THUMOS'14, HDD, and +EPIC-Kitchens-100, for online action detection and anticipation tasks, and it +significantly outperforms all existing methods. Code is available at +https://github.com/Echo0125/Memory-and-Anticipation-Transformer. + +
+
+ comment: ICCV 2023 Camera Ready +
+
+
+
+
+ + ☆ Link-Context Learning for Multimodal LLMs + + +
+ The ability to learn from context with novel concepts and deliver appropriate
+responses is essential in human conversations. Despite current Multimodal Large
+Language Models (MLLMs) and Large Language Models (LLMs) being trained on
+mega-scale datasets, recognizing unseen images or understanding novel concepts
+in a training-free manner remains a challenge. In-Context Learning (ICL)
+explores training-free few-shot learning, where models are encouraged to
+"learn to learn" from limited tasks and generalize to unseen tasks. In this
+work, we propose link-context learning (LCL), which emphasizes "reasoning from
+cause and effect" to augment the learning capabilities of MLLMs. LCL goes
+beyond traditional ICL by explicitly strengthening the causal relationship
+between the support set and the query set. By providing demonstrations with
+causal links, LCL guides the model to discern not only the analogy but also the
+underlying causal associations between data points, which empowers MLLMs to
+recognize unseen images and understand novel concepts more effectively. To
+facilitate the evaluation of this novel approach, we introduce the ISEKAI
+dataset, consisting exclusively of unseen generated image-label pairs designed
+for link-context learning. Extensive experiments show that our LCL-MLLM
+exhibits strong link-context learning capabilities on novel concepts compared
+to vanilla MLLMs. Code and data will be released at
+https://github.com/isekai-portal/Link-Context-Learning.
+
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ The Challenge of Fetal Cardiac MRI Reconstruction Using Deep Learning + + +
+ Dynamic free-breathing fetal cardiac MRI is one of the most challenging +modalities, which requires high temporal and spatial resolution to depict rapid +changes in a small fetal heart. The ability of deep learning methods to recover +undersampled data could help to optimise the kt-SENSE acquisition strategy and +improve non-gated kt-SENSE reconstruction quality. In this work, we explore +supervised deep learning networks for reconstruction of kt-SENSE style acquired +data using an extensive in vivo dataset. Having access to fully-sampled +low-resolution multi-coil fetal cardiac MRI, we study the performance of the +networks to recover fully-sampled data from undersampled data. We consider +model architectures together with training strategies taking into account their +application in the real clinical setup used to collect the dataset to enable +networks to recover prospectively undersampled data. We explore a set of +modifications to form a baseline performance evaluation for dynamic fetal +cardiac MRI on real data. We systematically evaluate the models on +coil-combined data to reveal the effect of the suggested changes to the +architecture in the context of fetal heart properties. We show that the +best-performers recover a detailed depiction of the maternal anatomy on a large +scale, but the dynamic properties of the fetal heart are under-represented. +Training directly on multi-coil data improves the performance of the models, +allows their prospective application to undersampled data and makes them +outperform CTFNet introduced for adult cardiac cine MRI. However, these models +deliver similar qualitative performances recovering the maternal body very well +but underestimating the dynamic properties of fetal heart. This dynamic feature +of fast change of fetal heart that is highly localised suggests both more +targeted training and evaluation methods might be needed for fetal heart +application. + +
+
+
+
+
+ + ☆ SEDA: Self-Ensembling ViT with Defensive Distillation and Adversarial + Training for robust Chest X-rays Classification MICCAI + + +
+ Deep Learning methods have recently seen increased adoption in medical
+imaging applications. However, recent Deep Learning solutions have been shown
+to carry significant vulnerabilities, which can hinder future adoption. In
+particular, the vulnerability of Vision Transformers (ViT) to adversarial,
+privacy, and confidentiality attacks raises serious concerns about their
+reliability in medical settings. This work aims to enhance the robustness of
+self-ensembling ViTs for the tuberculosis chest X-ray classification task. We
+propose Self-Ensembling ViT with defensive Distillation and Adversarial
+training (SEDA). SEDA utilizes efficient CNN blocks to learn spatial features
+with various levels of abstraction from feature representations extracted from
+intermediate ViT blocks, which are largely unaffected by adversarial
+perturbations. Furthermore, SEDA leverages adversarial training in combination
+with defensive distillation for improved robustness against adversaries.
+Training on adversarial examples leads to better model generalizability and
+improves its ability to handle perturbations. Distillation using soft
+probabilities introduces uncertainty and variation into the output
+probabilities, making adversarial and privacy attacks more difficult. Extensive
+experiments with the proposed architecture and training paradigm on a publicly
+available tuberculosis X-ray dataset show the SOTA efficacy of SEDA compared to
+SEViT in terms of computational efficiency, with a 70x lighter framework and
+enhanced robustness of +9%.
+
+
+ comment: Accepted at DART (Domain Adaptation and Representation Transfer) + Workshop, MICCAI, 2023. Code: https://github.com/Razaimam45/SEDA +
+
+
+
+
+ + ♻ ☆ Emotion Embeddings — Learning Stable and Homogeneous Abstractions from Heterogeneous Affective Datasets + + +
+ Human emotion is expressed in many communication modalities and media formats +and so their computational study is equally diversified into natural language +processing, audio signal analysis, computer vision, etc. Similarly, the large +variety of representation formats used in previous research to describe +emotions (polarity scales, basic emotion categories, dimensional approaches, +appraisal theory, etc.) have led to an ever proliferating diversity of +datasets, predictive models, and software tools for emotion analysis. Because +of these two distinct types of heterogeneity, at the expressional and +representational level, there is a dire need to unify previous work on +increasingly diverging data and label types. This article presents such a +unifying computational model. We propose a training procedure that learns a +shared latent representation for emotions, so-called emotion embeddings, +independent of different natural languages, communication modalities, media or +representation label formats, and even disparate model architectures. +Experiments on a wide range of heterogeneous affective datasets indicate that +this approach yields the desired interoperability for the sake of reusability, +interpretability and flexibility, without penalizing prediction quality. Code +and data are archived under https://doi.org/10.5281/zenodo.7405327 . + +
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ☆ ObjectSDF++: Improved Object-Compositional Neural Implicit Surfaces ICCV 2023 + + +
+ In recent years, neural implicit surface reconstruction has emerged as a +popular paradigm for multi-view 3D reconstruction. Unlike traditional +multi-view stereo approaches, the neural implicit surface-based methods +leverage neural networks to represent 3D scenes as signed distance functions +(SDFs). However, they tend to disregard the reconstruction of individual +objects within the scene, which limits their performance and practical +applications. To address this issue, previous work ObjectSDF introduced a nice +framework of object-composition neural implicit surfaces, which utilizes 2D +instance masks to supervise individual object SDFs. In this paper, we propose a +new framework called ObjectSDF++ to overcome the limitations of ObjectSDF. +First, in contrast to ObjectSDF whose performance is primarily restricted by +its converted semantic field, the core component of our model is an +occlusion-aware object opacity rendering formulation that directly +volume-renders object opacity to be supervised with instance masks. Second, we +design a novel regularization term for object distinction, which can +effectively mitigate the issue that ObjectSDF may result in unexpected +reconstruction in invisible regions due to the lack of constraint to prevent +collisions. Our extensive experiments demonstrate that our novel framework not +only produces superior object reconstruction results but also significantly +improves the quality of scene reconstruction. Code and more resources can be +found in \url{https://qianyiwu.github.io/objectsdf++} + +
+
+ comment: ICCV 2023. Project Page: https://qianyiwu.github.io/objectsdf++ Code: + https://github.com/QianyiWu/objectsdf_plus +
+
+
+
+
+ + ☆ StyleDiffusion: Controllable Disentangled Style Transfer via Diffusion + Models ICCV 2023 + + +
+ Content and style (C-S) disentanglement is a fundamental problem and critical +challenge of style transfer. Existing approaches based on explicit definitions +(e.g., Gram matrix) or implicit learning (e.g., GANs) are neither interpretable +nor easy to control, resulting in entangled representations and less satisfying +results. In this paper, we propose a new C-S disentangled framework for style +transfer without using previous assumptions. The key insight is to explicitly +extract the content information and implicitly learn the complementary style +information, yielding interpretable and controllable C-S disentanglement and +style transfer. A simple yet effective CLIP-based style disentanglement loss +coordinated with a style reconstruction prior is introduced to disentangle C-S +in the CLIP image space. By further leveraging the powerful style removal and +generative ability of diffusion models, our framework achieves superior results +than state of the art and flexible C-S disentanglement and trade-off control. +Our work provides new insights into the C-S disentanglement in style transfer +and demonstrates the potential of diffusion models for learning +well-disentangled C-S characteristics. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ CCD-3DR: Consistent Conditioning in Diffusion for Single-Image 3D + Reconstruction + + +
+ In this paper, we present a novel shape reconstruction method leveraging +diffusion model to generate 3D sparse point cloud for the object captured in a +single RGB image. Recent methods typically leverage global embedding or local +projection-based features as the condition to guide the diffusion model. +However, such strategies fail to consistently align the denoised point cloud +with the given image, leading to unstable conditioning and inferior +performance. In this paper, we present CCD-3DR, which exploits a novel centered +diffusion probabilistic model for consistent local feature conditioning. We +constrain the noise and sampled point cloud from the diffusion model into a +subspace where the point cloud center remains unchanged during the forward +diffusion process and reverse process. The stable point cloud center further +serves as an anchor to align each point with its corresponding local +projection-based features. Extensive experiments on synthetic benchmark +ShapeNet-R2N2 demonstrate that CCD-3DR outperforms all competitors by a large +margin, with over 40% improvement. We also provide results on real-world +dataset Pix3D to thoroughly demonstrate the potential of CCD-3DR in real-world +applications. Codes will be released soon + +
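+ A minimal sketch of the centering constraint described above: both the clean
+point cloud and the injected noise are projected onto the zero-mean subspace,
+so the cloud's center cannot drift during the forward (and, symmetrically, the
+reverse) diffusion process. The DDPM-style parameterization below is an
+illustrative assumption, not the exact CCD-3DR formulation:
+
+import torch
+
+def center(points):
+    """Project a point cloud of shape (N, 3) onto the zero-mean subspace."""
+    return points - points.mean(dim=0, keepdim=True)
+
+def centered_forward_diffusion(x0, alpha_bar_t):
+    """One forward step x_t = sqrt(a_bar) * x0 + sqrt(1 - a_bar) * eps using a
+    mean-free sample and mean-free noise; alpha_bar_t is a float in (0, 1)."""
+    noise = center(torch.randn_like(x0))
+    xt = (alpha_bar_t ** 0.5) * center(x0) + ((1 - alpha_bar_t) ** 0.5) * noise
+    return xt, noise
+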
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Learning Better Keypoints for Multi-Object 6DoF Pose Estimation + + +
+ We investigate the impact of pre-defined keypoints for pose estimation, and
+find that accuracy and efficiency can be improved by training a graph network
+to select a set of disperse keypoints with similarly distributed votes. These
+votes, learned by a regression network to accumulate evidence for the keypoint
+locations, can be regressed more accurately compared to previous heuristic
+keypoint algorithms. The proposed KeyGNet, supervised by a combined loss
+measuring both Wasserstein distance and dispersion, learns the color and
+geometry features of the target objects to estimate optimal keypoint locations.
+Experiments demonstrate that the keypoints selected by KeyGNet improve the
+accuracy for all evaluation metrics of all seven datasets tested, for three
+keypoint voting methods. The challenging Occlusion LINEMOD dataset notably
+improved ADD(S) by +16.4% on PVN3D, and all core BOP datasets showed an AR
+improvement for all objects, of between +1% and +21.5%. There was also a
+notable increase in performance when transitioning from single object to
+multiple object training using KeyGNet keypoints, essentially eliminating the
+SISO-MIMO gap for Occlusion LINEMOD.
+
+
+
+
+
+ + ☆ ImbSAM: A Closer Look at Sharpness-Aware Minimization in + Class-Imbalanced Recognition ICCV + + +
+ Class imbalance is a common challenge in real-world recognition tasks, where +the majority of classes have few samples, also known as tail classes. We +address this challenge with the perspective of generalization and empirically +find that the promising Sharpness-Aware Minimization (SAM) fails to address +generalization issues under the class-imbalanced setting. Through investigating +this specific type of task, we identify that its generalization bottleneck +primarily lies in the severe overfitting for tail classes with limited training +data. To overcome this bottleneck, we leverage class priors to restrict the +generalization scope of the class-agnostic SAM and propose a class-aware +smoothness optimization algorithm named Imbalanced-SAM (ImbSAM). With the +guidance of class priors, our ImbSAM specifically improves generalization +targeting tail classes. We also verify the efficacy of ImbSAM on two +prototypical applications of class-imbalanced recognition: long-tailed +classification and semi-supervised anomaly detection, where our ImbSAM +demonstrates remarkable performance improvements for tail classes and anomaly. +Our code implementation is available at +https://github.com/cool-xuan/Imbalanced_SAM. + +
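+ A minimal sketch of one training step in the spirit of the class-aware idea
+above: the head-class loss is minimized as usual, while the tail-class loss
+additionally receives the SAM worst-case weight perturbation. The helper name,
+the rho value, and the masking scheme are illustrative assumptions, not the
+released ImbSAM implementation:
+
+import torch
+
+def imbsam_step(model, loss_fn, x, y, tail_mask, optimizer, rho=0.05):
+    """Assumes the batch contains at least one head and one tail sample;
+    tail_mask is a boolean tensor marking tail-class samples."""
+    params = [p for p in model.parameters() if p.requires_grad]
+
+    # 1) Ascent: perturb weights along the tail-class loss gradient.
+    tail_loss = loss_fn(model(x[tail_mask]), y[tail_mask])
+    grads = torch.autograd.grad(tail_loss, params, allow_unused=True)
+    grads = [torch.zeros_like(p) if g is None else g for p, g in zip(params, grads)]
+    grad_norm = torch.sqrt(sum((g ** 2).sum() for g in grads)) + 1e-12
+    scale = (rho / grad_norm).item()
+    with torch.no_grad():
+        for p, g in zip(params, grads):
+            p.add_(g, alpha=scale)
+
+    # 2) Descent: head loss plus perturbed tail loss, evaluated at the perturbed
+    #    weights; the gradient is then applied to the restored weights.
+    optimizer.zero_grad()
+    loss = loss_fn(model(x[~tail_mask]), y[~tail_mask]) + loss_fn(model(x[tail_mask]), y[tail_mask])
+    loss.backward()
+    with torch.no_grad():
+        for p, g in zip(params, grads):
+            p.sub_(g, alpha=scale)
+    optimizer.step()
+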
+
+ comment: Accepted by International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ Grasp Transfer based on Self-Aligning Implicit Representations of Local + Surfaces + + +
+ Objects we interact with and manipulate often share similar parts, such as +handles, that allow us to transfer our actions flexibly due to their shared +functionality. This work addresses the problem of transferring a grasp +experience or a demonstration to a novel object that shares shape similarities +with objects the robot has previously encountered. Existing approaches for +solving this problem are typically restricted to a specific object category or +a parametric shape. Our approach, however, can transfer grasps associated with +implicit models of local surfaces shared across object categories. +Specifically, we employ a single expert grasp demonstration to learn an +implicit local surface representation model from a small dataset of object +meshes. At inference time, this model is used to transfer grasps to novel +objects by identifying the most geometrically similar surfaces to the one on +which the expert grasp is demonstrated. Our model is trained entirely in +simulation and is evaluated on simulated and real-world objects that are not +seen during training. Evaluations indicate that grasp transfer to unseen object +categories using this approach can be successfully performed both in simulation +and real-world experiments. The simulation results also show that the proposed +approach leads to better spatial precision and grasp accuracy compared to a +baseline approach. + +
+
+ comment: Accepted by IEEE RAL. 8 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ Neuromorphic Seatbelt State Detection for In-Cabin Monitoring with Event + Cameras + + +
+ Neuromorphic vision sensors, or event cameras, differ from conventional +cameras in that they do not capture images at a specified rate. Instead, they +asynchronously log local brightness changes at each pixel. As a result, event +cameras only record changes in a given scene, and do so with very high temporal +resolution, high dynamic range, and low power requirements. Recent research has +demonstrated how these characteristics make event cameras extremely practical +sensors in driver monitoring systems (DMS), enabling the tracking of high-speed +eye motion and blinks. This research provides a proof of concept to expand +event-based DMS techniques to include seatbelt state detection. Using an event +simulator, a dataset of 108,691 synthetic neuromorphic frames of car occupants +was generated from a near-infrared (NIR) dataset, and split into training, +validation, and test sets for a seatbelt state detection algorithm based on a +recurrent convolutional neural network (CNN). In addition, a smaller set of +real event data was collected and reserved for testing. In a binary +classification task, the fastened/unfastened frames were identified with an F1 +score of 0.989 and 0.944 on the simulated and real test sets respectively. When +the problem extended to also classify the action of fastening/unfastening the +seatbelt, respective F1 scores of 0.964 and 0.846 were achieved. + +
+
+ comment: 4 pages, 3 figures, IMVIP 2023 +
+
+
+
+
+ + ☆ Handwritten Stenography Recognition and the LION Dataset + + +
+ Purpose: In this paper, we establish a baseline for handwritten stenography +recognition, using the novel LION dataset, and investigate the impact of +including selected aspects of stenographic theory into the recognition process. +We make the LION dataset publicly available with the aim of encouraging future +research in handwritten stenography recognition. + Methods: A state-of-the-art text recognition model is trained to establish a +baseline. Stenographic domain knowledge is integrated by applying four +different encoding methods that transform the target sequence into +representations, which approximate selected aspects of the writing system. +Results are further improved by integrating a pre-training scheme, based on +synthetic data. + Results: The baseline model achieves an average test character error rate +(CER) of 29.81% and a word error rate (WER) of 55.14%. Test error rates are +reduced significantly by combining stenography-specific target sequence +encodings with pre-training and fine-tuning, yielding CERs in the range of +24.5% - 26% and WERs of 44.8% - 48.2%. + Conclusion: The obtained results demonstrate the challenging nature of +stenography recognition. Integrating stenography-specific knowledge, in +conjunction with pre-training and fine-tuning on synthetic data, yields +considerable improvements. Together with our precursor study on the subject, +this is the first work to apply modern handwritten text recognition to +stenography. The dataset and our code are publicly available via Zenodo. + +
+
+
+
+
+ + ☆ Learning to Identify Critical States for Reinforcement Learning from + Videos ICCV23 + + +
+ Recent work on deep reinforcement learning (DRL) has pointed out that +algorithmic information about good policies can be extracted from offline data +which lack explicit information about executed actions. For example, videos of +humans or robots may convey a lot of implicit information about rewarding +action sequences, but a DRL machine that wants to profit from watching such +videos must first learn by itself to identify and recognize relevant +states/actions/rewards. Without relying on ground-truth annotations, our new +method called Deep State Identifier learns to predict returns from episodes +encoded as videos. Then it uses a kind of mask-based sensitivity analysis to +extract/identify important critical states. Extensive experiments showcase our +method's potential for understanding and improving agent behavior. The source +code and the generated datasets are available at +https://github.com/AI-Initiative-KAUST/VideoRLCS. + +
+
+ comment: This paper was accepted to ICCV23 +
+
+
+
+
+ + ☆ DiffV2S: Diffusion-based Video-to-Speech Synthesis with Vision-guided + Speaker Embedding ICCV 2023 + + +
+ Recent research has demonstrated impressive results in video-to-speech
+synthesis, which involves reconstructing speech solely from visual input.
+However, previous works have struggled to accurately synthesize speech due to a
+lack of sufficient guidance for the model to infer the correct content with the
+appropriate sound. To resolve this issue, they have adopted an extra speaker
+embedding as speaking-style guidance derived from reference audio.
+Nevertheless, it is not always possible to obtain the audio information from
+the corresponding video input, especially at inference time. In this paper, we
+present a novel vision-guided speaker embedding extractor using a
+self-supervised pre-trained model and a prompt tuning technique. In doing so,
+rich speaker embedding information can be produced solely from the input visual
+information, and extra audio information is not necessary at inference time.
+Using the extracted vision-guided speaker embedding representations, we further
+develop a diffusion-based video-to-speech synthesis model, called DiffV2S,
+conditioned on those speaker embeddings and the visual representation extracted
+from the input video. The proposed DiffV2S not only maintains phoneme details
+contained in the input video frames, but also creates a highly intelligible
+mel-spectrogram in which the speaker identities of the multiple speakers are
+all preserved. Our experimental results show that DiffV2S achieves
+state-of-the-art performance compared to previous video-to-speech synthesis
+techniques.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Future Video Prediction from a Single Frame for Video Anomaly Detection + + +
+ Video anomaly detection (VAD) is an important but challenging task in
+computer vision. The main challenge arises from the rarity of training samples
+available to model all anomaly cases. Hence, semi-supervised anomaly detection
+methods have received more attention, since they focus on modeling normal
+patterns and detect anomalies by measuring the deviations from those patterns.
+Despite the impressive advances of these methods in modeling normal motion and
+appearance, long-term motion modeling has not been effectively explored so far.
+Inspired by the abilities of the future frame prediction proxy task, we
+introduce the task of future video prediction from a single frame as a novel
+proxy task for video anomaly detection. This proxy task alleviates the
+challenges of previous methods in learning longer motion patterns. Moreover, we
+replace the initial and future raw frames with their corresponding semantic
+segmentation maps, which not only makes the method aware of object class but
+also makes the prediction task less complex for the model. Extensive
+experiments on the benchmark datasets (ShanghaiTech, UCSD-Ped1, and UCSD-Ped2)
+show the effectiveness of the method and the superiority of its performance
+compared to SOTA prediction-based VAD methods.
+
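+ As a hedged illustration of how such a proxy task can yield an anomaly score
+(the paper's architecture and training are more involved), the deviation between
+predicted and observed future segmentation maps can be aggregated per frame:
+
+import torch
+import torch.nn.functional as F
+
+def anomaly_score(pred_future, real_future):
+    # pred_future, real_future: (T, C, H, W) class probabilities for future frames.
+    per_frame = F.mse_loss(pred_future, real_future, reduction="none")
+    return per_frame.mean(dim=(1, 2, 3))                 # one anomaly score per frame
+
+scores = anomaly_score(torch.rand(4, 19, 64, 64), torch.rand(4, 19, 64, 64))
+print(scores.shape)                                      # torch.Size([4])
+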
+
+
+
+
+ + ☆ Learning Image Deraining Transformer Network with Dynamic Dual + Self-Attention + + +
+ Recently, Transformer-based architectures have been introduced into the
+single image deraining task due to their advantage in modeling non-local
+information. However, existing approaches tend to integrate global features
+with a dense self-attention strategy, which uses all of the token similarities
+between the queries and keys. In practice, this strategy ignores the most
+relevant information and induces a blurry effect from the irrelevant
+representations during feature aggregation. To this end, this paper proposes an
+effective image deraining Transformer with dynamic dual self-attention (DDSA),
+which combines both dense and sparse attention strategies to better facilitate
+clear image reconstruction. Specifically, we only select the most useful
+similarity values based on a top-k approximate calculation to achieve sparse
+attention. In addition, we also develop a novel spatial-enhanced feed-forward
+network (SEFN) to further obtain a more accurate representation for achieving
+high-quality derained results. Extensive experiments on benchmark datasets
+demonstrate the effectiveness of our proposed method.
+
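+ A minimal sketch of top-k sparse attention of the kind described above (this is
+not the authors' DDSA module; the shapes and the value of k are assumptions):
+
+import torch
+
+def topk_sparse_attention(q, k, v, topk=8):
+    # q, k, v: (B, N, D). Keep only the top-k similarities per query and
+    # softmax over them, so irrelevant tokens contribute nothing.
+    scores = q @ k.transpose(-2, -1) / (q.shape[-1] ** 0.5)      # (B, N, N)
+    vals, idx = scores.topk(topk, dim=-1)
+    sparse = torch.full_like(scores, float("-inf")).scatter_(-1, idx, vals)
+    return sparse.softmax(dim=-1) @ v
+
+q = k = v = torch.randn(2, 64, 32)
+print(topk_sparse_attention(q, k, v).shape)                      # torch.Size([2, 64, 32])
+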
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ An Interpretable Machine Learning Model with Deep Learning-based Imaging + Biomarkers for Diagnosis of Alzheimer's Disease + + +
+ Machine learning methods have shown large potential for the automatic early +diagnosis of Alzheimer's Disease (AD). However, some machine learning methods +based on imaging data have poor interpretability because it is usually unclear +how they make their decisions. Explainable Boosting Machines (EBMs) are +interpretable machine learning models based on the statistical framework of +generalized additive modeling, but have so far only been used for tabular data. +Therefore, we propose a framework that combines the strength of EBM with +high-dimensional imaging data using deep learning-based feature extraction. The +proposed framework is interpretable because it provides the importance of each +feature. We validated the proposed framework on the Alzheimer's Disease +Neuroimaging Initiative (ADNI) dataset, achieving accuracy of 0.883 and +area-under-the-curve (AUC) of 0.970 on AD and control classification. +Furthermore, we validated the proposed framework on an external testing set, +achieving accuracy of 0.778 and AUC of 0.887 on AD and subjective cognitive +decline (SCD) classification. The proposed framework significantly outperformed +an EBM model using volume biomarkers instead of deep learning-based features, +as well as an end-to-end convolutional neural network (CNN) with optimized +architecture. + +
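+ A small sketch of the two-stage idea, assuming the open-source interpret package
+for the EBM; the deep-learning feature extraction is replaced by random
+placeholders and all dimensions are illustrative:
+
+import numpy as np
+from interpret.glassbox import ExplainableBoostingClassifier   # assumed dependency
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(200, 16))        # stand-in for 16 CNN-derived imaging biomarkers
+y = rng.integers(0, 2, size=200)      # stand-in AD vs. control labels
+
+ebm = ExplainableBoostingClassifier()
+ebm.fit(X, y)
+print(ebm.predict_proba(X[:3]))       # per-subject class probabilities
+# ebm.explain_global() exposes per-feature importances for interpretation.
+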
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Dual-path TokenLearner for Remote Photoplethysmography-based + Physiological Measurement with Facial Videos + + +
+ Remote photoplethysmography (rPPG) based physiological measurement is an
+emerging yet crucial vision task, whose challenge lies in exploring accurate
+rPPG prediction from facial videos accompanied by noise from illumination
+variations, facial occlusions, head movements, etc., in a non-contact manner.
+Existing mainstream CNN-based models make efforts to detect physiological
+signals by capturing subtle color changes in facial regions of interest (ROI)
+caused by heartbeats. However, such models are constrained by the limited local
+spatial or temporal receptive fields in the neural units. Unlike them, a native
+Transformer-based framework called Dual-path TokenLearner (Dual-TL) is proposed
+in this paper, which utilizes the concept of learnable tokens to integrate both
+spatial and temporal informative contexts from the global perspective of the
+video. Specifically, the proposed Dual-TL uses a Spatial TokenLearner (S-TL) to
+explore associations in different facial ROIs, which keeps the rPPG prediction
+away from noisy ROI disturbances. Complementarily, a Temporal TokenLearner
+(T-TL) is designed to infer the quasi-periodic pattern of heartbeats, which
+eliminates temporal disturbances such as head movements. The two TokenLearners,
+S-TL and T-TL, are executed in a dual-path mode. This enables the model to
+reduce noise disturbances for final rPPG signal prediction. Extensive
+experiments on four physiological measurement benchmark datasets are conducted.
+The Dual-TL achieves state-of-the-art performance in both intra- and
+cross-dataset testing, demonstrating its immense potential as a basic backbone
+for rPPG measurement. The source code is available at
+\href{https://github.com/VUT-HFUT/Dual-TL}{https://github.com/VUT-HFUT/Dual-TL}
+
+
+
+
+
+ + ☆ Multi-scale Promoted Self-adjusting Correlation Learning for Facial + Action Unit Detection + + +
+ Facial Action Unit (AU) detection is a crucial task in affective computing
+and social robotics as it helps to identify emotions expressed through facial
+expressions. Anatomically, there are innumerable correlations between AUs,
+which contain rich information and are vital for AU detection. Previous methods
+used fixed AU correlations based on expert experience or statistical rules on
+specific benchmarks, but it is challenging to comprehensively reflect complex
+correlations between AUs via hand-crafted settings. There are alternative
+methods that employ a fully connected graph to learn these dependencies
+exhaustively. However, these approaches can result in a computational explosion
+and a high dependency on large datasets. To address these challenges, this
+paper proposes a novel self-adjusting AU-correlation learning (SACL) method
+with less computation for AU detection. This method adaptively learns and
+updates AU correlation graphs by efficiently leveraging the characteristics of
+different levels of AU motion and emotion representation information extracted
+in different stages of the network. Moreover, this paper explores the role of
+multi-scale learning in correlation information extraction, and designs a
+simple yet effective multi-scale feature learning (MSFL) method to promote
+better performance in AU detection. By integrating AU correlation information
+with multi-scale features, the proposed method obtains a more robust feature
+representation for the final AU detection. Extensive experiments show that the
+proposed method outperforms the state-of-the-art methods on widely used AU
+detection benchmark datasets, with only 28.7\% and 12.0\% of the parameters and
+FLOPs of the best method, respectively. The code for this method is available
+at \url{https://github.com/linuxsino/Self-adjusting-AU}.
+
+
+ comment: 13 pages, 7 figures
+
+
+
+
+
+ + ☆ Whale Detection Enhancement through Synthetic Satellite Images + + +
+ With a number of marine populations in rapid decline, collecting and
+analyzing data about marine populations has become increasingly important to
+develop effective conservation policies for a wide range of marine animals,
+including whales. Modern computer vision algorithms allow us to detect whales
+in images in a wide range of domains, further speeding up and enhancing the
+monitoring process. However, these algorithms heavily rely on large training
+datasets, which are challenging and time-consuming to collect, particularly in
+marine or aquatic environments. Recent advances in AI, however, have made it
+possible to synthetically create datasets for training machine learning
+algorithms, thus enabling new solutions that were not possible before. In this
+work, we present a solution, the SeaDroneSim2 benchmark suite, which addresses
+this challenge by generating aerial and satellite synthetic image datasets to
+improve the detection of whales and reduce the effort required for training
+data collection. We show that we can achieve a 15% performance boost on whale
+detection compared to using the real data alone for training, by augmenting the
+synthetic training data with just 10% real data. We open source both the code
+of the simulation platform SeaDroneSim2 and the dataset generated through it.
+
+
+
+
+
+ + ☆ CASPNet++: Joint Multi-Agent Motion Prediction + + +
+ The prediction of road users' future motion is a critical task in supporting +advanced driver-assistance systems (ADAS). It plays an even more crucial role +for autonomous driving (AD) in enabling the planning and execution of safe +driving maneuvers. Based on our previous work, Context-Aware Scene Prediction +Network (CASPNet), an improved system, CASPNet++, is proposed. In this work, we +focus on further enhancing the interaction modeling and scene understanding to +support the joint prediction of all road users in a scene using spatiotemporal +grids to model future occupancy. Moreover, an instance-based output head is +introduced to provide multi-modal trajectories for agents of interest. In +extensive quantitative and qualitative analysis, we demonstrate the scalability +of CASPNet++ in utilizing and fusing diverse environmental input sources such +as HD maps, Radar detection, and Lidar segmentation. Tested on the +urban-focused prediction dataset nuScenes, CASPNet++ reaches state-of-the-art +performance. The model has been deployed in a testing vehicle, running in +real-time with moderate computational resources. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Dancing Avatar: Pose and Text-Guided Human Motion Videos Synthesis with + Image Diffusion Model + + +
+ The rising demand for creating lifelike avatars in the digital realm has led +to an increased need for generating high-quality human videos guided by textual +descriptions and poses. We propose Dancing Avatar, designed to fabricate human +motion videos driven by poses and textual cues. Our approach employs a +pretrained T2I diffusion model to generate each video frame in an +autoregressive fashion. The crux of innovation lies in our adept utilization of +the T2I diffusion model for producing video frames successively while +preserving contextual relevance. We surmount the hurdles posed by maintaining +human character and clothing consistency across varying poses, along with +upholding the background's continuity amidst diverse human movements. To ensure +consistent human appearances across the entire video, we devise an intra-frame +alignment module. This module assimilates text-guided synthesized human +character knowledge into the pretrained T2I diffusion model, synergizing +insights from ChatGPT. For preserving background continuity, we put forth a +background alignment pipeline, amalgamating insights from segment anything and +image inpainting techniques. Furthermore, we propose an inter-frame alignment +module that draws inspiration from an auto-regressive pipeline to augment +temporal consistency between adjacent frames, where the preceding frame guides +the synthesis process of the current frame. Comparisons with state-of-the-art +methods demonstrate that Dancing Avatar exhibits the capacity to generate human +videos with markedly superior quality, both in terms of human and background +fidelity, as well as temporal coherence compared to existing state-of-the-art +approaches. + +
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ Exploiting Sparsity in Automotive Radar Object Detection Networks + + +
+ Having precise perception of the environment is crucial for ensuring the +secure and reliable functioning of autonomous driving systems. Radar object +detection networks are one fundamental part of such systems. CNN-based object +detectors showed good performance in this context, but they require large +compute resources. This paper investigates sparse convolutional object +detection networks, which combine powerful grid-based detection with low +compute resources. We investigate radar specific challenges and propose sparse +kernel point pillars (SKPP) and dual voxel point convolutions (DVPC) as +remedies for the grid rendering and sparse backbone architectures. We evaluate +our SKPP-DPVCN architecture on nuScenes, which outperforms the baseline by +5.89% and the previous state of the art by 4.19% in Car AP4.0. Moreover, +SKPP-DPVCN reduces the average scale error (ASE) by 21.41% over the baseline. + +
+
+
+
+
+ + ☆ ChartDETR: A Multi-shape Detection Network for Visual Chart Recognition + + +
+ Visual chart recognition systems are gaining increasing attention due to the +growing demand for automatically identifying table headers and values from +chart images. Current methods rely on keypoint detection to estimate data +element shapes in charts but suffer from grouping errors in post-processing. To +address this issue, we propose ChartDETR, a transformer-based multi-shape +detector that localizes keypoints at the corners of regular shapes to +reconstruct multiple data elements in a single chart image. Our method predicts +all data element shapes at once by introducing query groups in set prediction, +eliminating the need for further postprocessing. This property allows ChartDETR +to serve as a unified framework capable of representing various chart types +without altering the network architecture, effectively detecting data elements +of diverse shapes. We evaluated ChartDETR on three datasets, achieving +competitive results across all chart types without any additional enhancements. +For example, ChartDETR achieved an F1 score of 0.98 on Adobe Synthetic, +significantly outperforming the previous best model with a 0.71 F1 score. +Additionally, we obtained a new state-of-the-art result of 0.97 on +ExcelChart400k. The code will be made publicly available. + +
+
+
+
+
+ + ☆ Identity-Consistent Aggregation for Video Object Detection ICCV2023 + + +
+ In Video Object Detection (VID), a common practice is to leverage the rich
+temporal contexts from the video to enhance the object representations in each
+frame. Existing methods treat the temporal contexts obtained from different
+objects indiscriminately and ignore their different identities, whereas,
+intuitively, aggregating local views of the same object across different frames
+may facilitate a better understanding of that object. Thus, in this paper, we
+aim to enable the model to focus on the identity-consistent temporal contexts
+of each object to obtain more comprehensive object representations and handle
+rapid object appearance variations such as occlusion, motion blur, etc.
+However, realizing this goal on top of existing VID models faces low-efficiency
+problems due to their redundant region proposals and nonparallel frame-wise
+prediction manner. To address this, we propose ClipVID, a VID model equipped
+with Identity-Consistent Aggregation (ICA) layers specifically designed for
+mining fine-grained and identity-consistent temporal contexts. It effectively
+reduces the redundancies through the set prediction strategy, making the ICA
+layers very efficient and further allowing us to design an architecture that
+makes parallel clip-wise predictions for the whole video clip. Extensive
+experimental results demonstrate the superiority of our method: a
+state-of-the-art (SOTA) performance (84.7% mAP) on the ImageNet VID dataset
+while running at a speed about 7x faster (39.3 fps) than previous SOTAs.
+
+
+ comment: to be appeared at ICCV2023 +
+
+
+
+
+ + ☆ Dynamic Low-Rank Instance Adaptation for Universal Neural Image + Compression ACM MM 2023 + + +
+ The latest advancements in neural image compression show great potential in +surpassing the rate-distortion performance of conventional standard codecs. +Nevertheless, there exists an indelible domain gap between the datasets +utilized for training (i.e., natural images) and those utilized for inference +(e.g., artistic images). Our proposal involves a low-rank adaptation approach +aimed at addressing the rate-distortion drop observed in out-of-domain +datasets. Specifically, we perform low-rank matrix decomposition to update +certain adaptation parameters of the client's decoder. These updated +parameters, along with image latents, are encoded into a bitstream and +transmitted to the decoder in practical scenarios. Due to the low-rank +constraint imposed on the adaptation parameters, the resulting bit rate +overhead is small. Furthermore, the bit rate allocation of low-rank adaptation +is \emph{non-trivial}, considering the diverse inputs require varying +adaptation bitstreams. We thus introduce a dynamic gating network on top of the +low-rank adaptation method, in order to decide which decoder layer should +employ adaptation. The dynamic adaptation network is optimized end-to-end using +rate-distortion loss. Our proposed method exhibits universality across diverse +image datasets. Extensive results demonstrate that this paradigm significantly +mitigates the domain gap, surpassing non-adaptive methods with an average +BD-rate improvement of approximately $19\%$ across out-of-domain images. +Furthermore, it outperforms the most advanced instance adaptive methods by +roughly $5\%$ BD-rate. Ablation studies confirm our method's ability to +universally enhance various image compression architectures. + +
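+ A minimal sketch of rank-constrained adaptation of a single decoder layer (not
+the paper's gating network or entropy coding; layer size and rank are
+assumptions):
+
+import torch
+import torch.nn as nn
+
+class LowRankAdapter(nn.Module):
+    # Adds a rank-r update A @ B on top of a frozen linear layer; only A and B
+    # (a handful of parameters) would need to be signalled in the bitstream.
+    def __init__(self, base: nn.Linear, rank=4):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad_(False)
+        self.A = nn.Parameter(torch.zeros(base.out_features, rank))
+        self.B = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+
+    def forward(self, x):
+        return self.base(x) + x @ (self.A @ self.B).t()
+
+layer = LowRankAdapter(nn.Linear(192, 192), rank=4)
+print(layer(torch.randn(1, 192)).shape)                  # torch.Size([1, 192])
+print(layer.A.numel() + layer.B.numel())                 # 1536 adaptation parameters
+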
+
+ comment: Accepted by ACM MM 2023, 13 pages, 12 figures +
+
+
+
+
+ + ☆ UniTR: A Unified and Efficient Multi-Modal Transformer for + Bird's-Eye-View Representation ICCV2023 + + +
+ Jointly processing information from multiple sensors is crucial to achieving +accurate and robust perception for reliable autonomous driving systems. +However, current 3D perception research follows a modality-specific paradigm, +leading to additional computation overheads and inefficient collaboration +between different sensor data. In this paper, we present an efficient +multi-modal backbone for outdoor 3D perception named UniTR, which processes a +variety of modalities with unified modeling and shared parameters. Unlike +previous works, UniTR introduces a modality-agnostic transformer encoder to +handle these view-discrepant sensor data for parallel modal-wise representation +learning and automatic cross-modal interaction without additional fusion steps. +More importantly, to make full use of these complementary sensor types, we +present a novel multi-modal integration strategy by both considering +semantic-abundant 2D perspective and geometry-aware 3D sparse neighborhood +relations. UniTR is also a fundamentally task-agnostic backbone that naturally +supports different 3D perception tasks. It sets a new state-of-the-art +performance on the nuScenes benchmark, achieving +1.1 NDS higher for 3D object +detection and +12.0 higher mIoU for BEV map segmentation with lower inference +latency. Code will be available at https://github.com/Haiyang-W/UniTR . + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Context-Aware Pseudo-Label Refinement for Source-Free Domain Adaptive + Fundus Image Segmentation MICCAI 2023 + + +
+ In the domain adaptation problem, source data may be unavailable to the +target client side due to privacy or intellectual property issues. Source-free +unsupervised domain adaptation (SF-UDA) aims at adapting a model trained on the +source side to align the target distribution with only the source model and +unlabeled target data. The source model usually produces noisy and +context-inconsistent pseudo-labels on the target domain, i.e., neighbouring +regions that have a similar visual appearance are annotated with different +pseudo-labels. This observation motivates us to refine pseudo-labels with +context relations. Another observation is that features of the same class tend +to form a cluster despite the domain gap, which implies context relations can +be readily calculated from feature distances. To this end, we propose a +context-aware pseudo-label refinement method for SF-UDA. Specifically, a +context-similarity learning module is developed to learn context relations. +Next, pseudo-label revision is designed utilizing the learned context +relations. Further, we propose calibrating the revised pseudo-labels to +compensate for wrong revision caused by inaccurate context relations. +Additionally, we adopt a pixel-level and class-level denoising scheme to select +reliable pseudo-labels for domain adaptation. Experiments on cross-domain +fundus images indicate that our approach yields the state-of-the-art results. +Code is available at https://github.com/xmed-lab/CPR. + +
+
+ comment: Accepted by MICCAI 2023, 11 pages +
+
+
+
+
+ + ☆ Real-time Automatic M-mode Echocardiography Measurement with Panel + Attention from Local-to-Global Pixels + + +
+ Motion mode (M-mode) recording is an essential part of echocardiography for
+measuring cardiac dimension and function. However, current diagnosis cannot be
+automated, as there are three fundamental obstacles: firstly, there is no open
+dataset available to build the automation needed for ensuring consistent
+results and for bridging M-mode echocardiography with real-time instance
+segmentation (RIS); secondly, the examination involves time-consuming manual
+labelling of M-mode echocardiograms; thirdly, as objects in echocardiograms
+occupy a significant portion of the pixels, the limited receptive field of
+existing backbones (e.g., ResNet) composed of multiple convolution layers is
+insufficient to cover the period of a valve movement. Existing non-local
+attention (NL) methods either cannot run in real time because of their high
+computation overhead or lose information when a simplified version of the
+non-local block is used. Therefore, we propose RAMEM, a real-time automatic
+M-mode echocardiography measurement scheme, which contributes three aspects to
+address these problems: 1) we provide MEIS, a dataset of M-mode echocardiograms
+for instance segmentation, to enable consistent results and support the
+development of an automatic scheme; 2) we propose panel attention, a
+local-to-global efficient attention mechanism based on pixel-unshuffling,
+embedded with an updated UPANets V2 in a RIS scheme for big-object detection
+with a global receptive field; 3) we develop and implement AMEM, an efficient
+algorithm for automatic M-mode echocardiography measurement that enables fast
+and accurate automatic labelling during diagnosis. The experimental results
+show that RAMEM surpasses existing RIS backbones (with non-local attention) on
+PASCAL 2012 SBD and human performance in real-time MEIS testing. The code and
+the MEIS dataset are available at https://github.com/hanktseng131415go/RAME.
+
+
+
+
+
+ + ☆ Exploring Transfer Learning in Medical Image Segmentation using + Vision-Language Models + + +
+ Medical image segmentation is crucial in various clinical applications within
+the medical domain. While state-of-the-art segmentation models have proven
+effective, integrating textual guidance to enhance visual features for this
+task remains an area with limited progress. Existing segmentation models that
+utilize textual guidance are primarily trained on open-domain images, raising
+concerns about their direct applicability in the medical domain without manual
+intervention or fine-tuning.
+ To address these challenges, we propose using multimodal vision-language
+models for capturing semantic information from image descriptions and images,
+enabling the segmentation of diverse medical images. This study comprehensively
+evaluates existing vision-language models across multiple datasets to assess
+their transferability from the open domain to the medical field. Furthermore,
+we introduce variations of image descriptions for previously unseen images in
+the dataset, revealing notable variations in model performance based on the
+generated prompts.
+ Our findings highlight the distribution shift between open-domain images and
+the medical domain and show that segmentation models trained on open-domain
+images are not directly transferable to the medical field, but their
+performance can be improved by fine-tuning them on medical datasets. We report
+the zero-shot and fine-tuned segmentation performance of 4 Vision Language
+Models (VLMs) on 11 medical datasets using 9 types of prompts derived from 14
+attributes.
+
+
+ comment: 25 pages, 9 figures +
+
+
+
+
+ + ☆ Enhancing Network Initialization for Medical AI Models Using + Large-Scale, Unlabeled Natural Images + + +
+ Pre-training datasets, like ImageNet, have become the gold standard in +medical image analysis. However, the emergence of self-supervised learning +(SSL), which leverages unlabeled data to learn robust features, presents an +opportunity to bypass the intensive labeling process. In this study, we +explored if SSL for pre-training on non-medical images can be applied to chest +radiographs and how it compares to supervised pre-training on non-medical +images and on medical images. We utilized a vision transformer and initialized +its weights based on (i) SSL pre-training on natural images (DINOv2), (ii) SL +pre-training on natural images (ImageNet dataset), and (iii) SL pre-training on +chest radiographs from the MIMIC-CXR database. We tested our approach on over +800,000 chest radiographs from six large global datasets, diagnosing more than +20 different imaging findings. Our SSL pre-training on curated images not only +outperformed ImageNet-based pre-training (P<0.001 for all datasets) but, in +certain cases, also exceeded SL on the MIMIC-CXR dataset. Our findings suggest +that selecting the right pre-training strategy, especially with SSL, can be +pivotal for improving artificial intelligence (AI)'s diagnostic accuracy in +medical imaging. By demonstrating the promise of SSL in chest radiograph +analysis, we underline a transformative shift towards more efficient and +accurate AI models in medical imaging. + +
+
+
+
+
+ + ☆ DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using + Pre-trained Diffusion Models ICCV2023 + + +
+ Given a classifier, the inherent property of semantic Out-of-Distribution +(OOD) samples is that their contents differ from all legal classes in terms of +semantics, namely semantic mismatch. There is a recent work that directly +applies it to OOD detection, which employs a conditional Generative Adversarial +Network (cGAN) to enlarge semantic mismatch in the image space. While achieving +remarkable OOD detection performance on small datasets, it is not applicable to +ImageNet-scale datasets due to the difficulty in training cGANs with both input +images and labels as conditions. As diffusion models are much easier to train +and amenable to various conditions compared to cGANs, in this work, we propose +to directly use pre-trained diffusion models for semantic mismatch-guided OOD +detection, named DiffGuard. Specifically, given an OOD input image and the +predicted label from the classifier, we try to enlarge the semantic difference +between the reconstructed OOD image under these conditions and the original +input image. We also present several test-time techniques to further strengthen +such differences. Experimental results show that DiffGuard is effective on both +Cifar-10 and hard cases of the large-scale ImageNet, and it can be easily +combined with existing OOD detection techniques to achieve state-of-the-art OOD +detection results. + +
+
+ comment: Accepted by ICCV2023, with supplementary materials +
+
+
+
+
+ + ☆ Boosting Multi-modal Model Performance with Adaptive Gradient Modulation ICCV2023 + + +
+ While the field of multi-modal learning keeps growing fast, the deficiency of +the standard joint training paradigm has become clear through recent studies. +They attribute the sub-optimal performance of the jointly trained model to the +modality competition phenomenon. Existing works attempt to improve the jointly +trained model by modulating the training process. Despite their effectiveness, +those methods can only apply to late fusion models. More importantly, the +mechanism of the modality competition remains unexplored. In this paper, we +first propose an adaptive gradient modulation method that can boost the +performance of multi-modal models with various fusion strategies. Extensive +experiments show that our method surpasses all existing modulation methods. +Furthermore, to have a quantitative understanding of the modality competition +and the mechanism behind the effectiveness of our modulation method, we +introduce a novel metric to measure the competition strength. This metric is +built on the mono-modal concept, a function that is designed to represent the +competition-less state of a modality. Through systematic investigation, our +results confirm the intuition that the modulation encourages the model to rely +on the more informative modality. In addition, we find that the jointly trained +model typically has a preferred modality on which the competition is weaker +than other modalities. However, this preferred modality need not dominate +others. Our code will be available at +https://github.com/lihong2303/AGM_ICCV2023. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ A Review of Adversarial Attacks in Computer Vision + + +
+ Deep neural networks have been widely used in various downstream tasks,
+especially in safety-critical scenarios such as autonomous driving, but deep
+networks are often threatened by adversarial samples. Such adversarial attacks
+can be invisible to human eyes, yet they can lead to DNN misclassification;
+they often exhibit transferability between deep learning and machine learning
+models and are achievable in the real world. Adversarial attacks can be divided
+into white-box attacks, for which the attacker knows the parameters and
+gradients of the model, and black-box attacks, for which the attacker can only
+obtain the input and output of the model. In terms of the attacker's purpose,
+attacks can be divided into targeted attacks, where the attacker wants the
+model to misclassify the original sample into a specified class (which is more
+practical), and non-targeted attacks, which only need to make the model
+misclassify the sample. The black-box setting is the scenario we will encounter
+in practice.
+
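+ A minimal white-box example that makes the targeted/non-targeted distinction
+concrete (single-step FGSM; the model, epsilon, and labels are placeholders):
+
+import torch
+import torch.nn.functional as F
+
+def fgsm(model, x, label, eps=8 / 255, targeted=False):
+    # Untargeted: ascend the loss of the true label.
+    # Targeted: descend the loss of a chosen target label.
+    x = x.clone().detach().requires_grad_(True)
+    loss = F.cross_entropy(model(x), label)
+    loss.backward()
+    step = -eps if targeted else eps
+    return (x + step * x.grad.sign()).clamp(0, 1).detach()
+
+# Usage with any image classifier `model`:
+#   x_adv = fgsm(model, images, true_labels)                   # non-targeted
+#   x_adv = fgsm(model, images, target_labels, targeted=True)  # targeted
+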
+
+
+
+
+ + ☆ Inversion-by-Inversion: Exemplar-based Sketch-to-Photo Synthesis via + Stochastic Differential Equations without Training + + +
+ Exemplar-based sketch-to-photo synthesis allows users to generate
+photo-realistic images based on sketches. Recently, diffusion-based methods
+have achieved impressive performance on image generation tasks, enabling
+highly-flexible control through text-driven generation or energy functions.
+However, generating photo-realistic images with color and texture from sketch
+images remains challenging for diffusion models. Sketches typically consist of
+only a few strokes, with most regions left blank, making it difficult for
+diffusion-based methods to produce photo-realistic images. In this work, we
+propose a two-stage method named ``Inversion-by-Inversion" for exemplar-based
+sketch-to-photo synthesis. This approach includes shape-enhancing inversion and
+full-control inversion. During the shape-enhancing inversion process, an
+uncolored photo is generated with the guidance of a shape-energy function. This
+step is essential to ensure control over the shape of the generated photo. In
+the full-control inversion process, we propose an appearance-energy function to
+control the color and texture of the final generated photo. Importantly, our
+Inversion-by-Inversion pipeline is training-free and can accept different types
+of exemplars for color and texture control. We conducted extensive experiments
+to evaluate our proposed method, and the results demonstrate its effectiveness.
+
+
+ comment: 15 pages, preprint version +
+
+
+
+
+ + ☆ Gradient-Based Post-Training Quantization: Challenging the Status Quo + + +
+ Quantization has become a crucial step for the efficient deployment of deep
+neural networks, where floating point operations are converted to simpler fixed
+point operations. In its most naive form, it simply consists of a combination
+of scaling and rounding transformations, leading to either a limited
+compression rate or a significant accuracy drop. Recently, gradient-based
+post-training quantization (GPTQ) methods appear to constitute a suitable
+trade-off between such simple methods and more powerful, yet expensive,
+Quantization-Aware Training (QAT) approaches, particularly when attempting to
+quantize LLMs, where scalability of the quantization process is of paramount
+importance. GPTQ essentially consists of learning the rounding operation using
+a small calibration set. In this work, we challenge common choices in GPTQ
+methods. In particular, we show that the process is, to a certain extent,
+robust to a number of variables (weight selection, feature augmentation, choice
+of calibration set). More importantly, we derive a number of best practices for
+designing more efficient and scalable GPTQ methods, regarding the problem
+formulation (loss, degrees of freedom, use of non-uniform quantization schemes)
+or optimization process (choice of variable and optimizer). Lastly, we propose
+a novel importance-based mixed-precision technique. These guidelines lead to
+significant performance improvements on all the tested state-of-the-art GPTQ
+methods and networks (e.g. +6.819 points on ViT for 4-bit quantization), paving
+the way for the design of scalable, yet effective quantization methods.
+
+
+
+
+
+ + ☆ Geometry of the Visual Cortex with Applications to Image Inpainting and + Enhancement + + +
+ Equipping the rototranslation group $SE(2)$ with a sub-Riemannian structure
+inspired by the visual cortex V1, we propose algorithms for image inpainting
+and enhancement based on hypoelliptic diffusion. We innovate on previous
+implementations of the methods by Citti, Sarti and Boscain et al. by proposing
+an alternative that prevents fading and is capable of producing sharper results
+in a procedure that we call WaxOn-WaxOff. We also exploit the sub-Riemannian
+structure to define a completely new unsharp filter using $SE(2)$, analogous to
+the classical unsharp filter for 2D image processing, with applications to
+image enhancement. We demonstrate our method on blood vessel enhancement in
+retinal scans.
+
+
+ comment: Associated python package available at + https://github.com/ballerin/v1diffusion +
+
+
+
+
+ + ☆ EQ-Net: Elastic Quantization Neural Networks + + +
+ Current model quantization methods have shown their promising capability in
+reducing storage space and computation complexity. However, due to the
+diversity of quantization forms supported by different hardware, one limitation
+of existing solutions is that they usually require repeated optimization for
+different scenarios. How to construct a model with flexible quantization forms
+has been less studied. In this paper, we explore a one-shot network
+quantization regime, named Elastic Quantization Neural Networks (EQ-Net), which
+aims to train a robust weight-sharing quantization supernet. First of all, we
+propose an elastic quantization space (including elastic bit-width,
+granularity, and symmetry) to adapt to various mainstream quantization forms.
+Secondly, we propose the Weight Distribution Regularization Loss (WDR-Loss) and
+Group Progressive Guidance Loss (GPG-Loss) to bridge the inconsistency between
+the weight and output logit distributions across the elastic quantization
+space. Lastly, we incorporate genetic algorithms and the proposed Conditional
+Quantization-Aware Accuracy Predictor (CQAP) as an estimator to quickly search
+for mixed-precision quantized neural networks in the supernet. Extensive
+experiments demonstrate that our EQ-Net is close to or even better than its
+static counterparts as well as state-of-the-art robust bit-width methods. Code
+is available at
+\href{https://github.com/xuke225/EQ-Net.git}{https://github.com/xuke225/EQ-Net}.
+
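+ A toy illustration of the elastic bit-width axis only (uniform symmetric fake
+quantization; this is not EQ-Net's supernet, and the bit-widths are examples):
+
+import torch
+
+def fake_quantize(w, bits=4):
+    # Symmetric uniform fake-quantization with a selectable bit-width.
+    qmax = 2 ** (bits - 1) - 1
+    scale = w.abs().max() / qmax
+    q = torch.clamp(torch.round(w / scale), -qmax - 1, qmax)
+    return q * scale
+
+w = torch.randn(16, 16)
+for b in (2, 4, 8):     # one shared weight tensor evaluated at several bit-widths
+    print(b, (fake_quantize(w, bits=b) - w).abs().mean().item())
+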
+
+
+
+
+ + ☆ Prompt Switch: Efficient CLIP Adaptation for Text-Video Retrieval ICCV2023 + + +
+ In text-video retrieval, recent works have benefited from the powerful +learning capabilities of pre-trained text-image foundation models (e.g., CLIP) +by adapting them to the video domain. A critical problem for them is how to +effectively capture the rich semantics inside the video using the image encoder +of CLIP. To tackle this, state-of-the-art methods adopt complex cross-modal +modeling techniques to fuse the text information into video frame +representations, which, however, incurs severe efficiency issues in large-scale +retrieval systems as the video representations must be recomputed online for +every text query. In this paper, we discard this problematic cross-modal fusion +process and aim to learn semantically-enhanced representations purely from the +video, so that the video representations can be computed offline and reused for +different texts. Concretely, we first introduce a spatial-temporal "Prompt +Cube" into the CLIP image encoder and iteratively switch it within the encoder +layers to efficiently incorporate the global video semantics into frame +representations. We then propose to apply an auxiliary video captioning +objective to train the frame representations, which facilitates the learning of +detailed video semantics by providing fine-grained guidance in the semantic +space. With a naive temporal fusion strategy (i.e., mean-pooling) on the +enhanced frame representations, we obtain state-of-the-art performances on +three benchmark datasets, i.e., MSR-VTT, MSVD, and LSMDC. + +
+
+ comment: to be appeared in ICCV2023 +
+
+
+
+
+ + ☆ Backpropagation Path Search On Adversarial Transferability ICCV2023 + + +
+ Deep neural networks are vulnerable to adversarial examples, dictating the +imperativeness to test the model's robustness before deployment. Transfer-based +attackers craft adversarial examples against surrogate models and transfer them +to victim models deployed in the black-box situation. To enhance the +adversarial transferability, structure-based attackers adjust the +backpropagation path to avoid the attack from overfitting the surrogate model. +However, existing structure-based attackers fail to explore the convolution +module in CNNs and modify the backpropagation graph heuristically, leading to +limited effectiveness. In this paper, we propose backPropagation pAth Search +(PAS), solving the aforementioned two problems. We first propose SkipConv to +adjust the backpropagation path of convolution by structural +reparameterization. To overcome the drawback of heuristically designed +backpropagation paths, we further construct a DAG-based search space, utilize +one-step approximation for path evaluation and employ Bayesian Optimization to +search for the optimal path. We conduct comprehensive experiments in a wide +range of transfer settings, showing that PAS improves the attack success rate +by a huge margin for both normally trained and defense models. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Self-Prompting Large Vision Models for Few-Shot Medical Image + Segmentation MICCAI + + +
+ Recent advancements in large foundation models have shown promising potential +in the medical industry due to their flexible prompting capability. One such +model, the Segment Anything Model (SAM), a prompt-driven segmentation model, +has shown remarkable performance improvements, surpassing state-of-the-art +approaches in medical image segmentation. However, existing methods primarily +rely on tuning strategies that require extensive data or prior prompts tailored +to the specific task, making it particularly challenging when only a limited +number of data samples are available. In this paper, we propose a novel +perspective on self-prompting in medical vision applications. Specifically, we +harness the embedding space of SAM to prompt itself through a simple yet +effective linear pixel-wise classifier. By preserving the encoding capabilities +of the large model, the contextual information from its decoder, and leveraging +its interactive promptability, we achieve competitive results on multiple +datasets (i.e. improvement of more than 15% compared to fine-tuning the mask +decoder using a few images). + +
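+ A hedged sketch of the linear self-prompting step: a frozen encoder (e.g. SAM's
+image encoder) is assumed to produce an embedding grid, and a 1x1 convolution
+acts as the linear pixel-wise classifier; the tensor below is a placeholder for
+those embeddings.
+
+import torch
+import torch.nn as nn
+
+emb = torch.randn(1, 256, 64, 64)                    # frozen embeddings (B, C, h, w)
+pixel_classifier = nn.Conv2d(256, 1, kernel_size=1)  # linear per-pixel classifier
+
+coarse_logits = pixel_classifier(emb)                # (1, 1, 64, 64)
+coarse_mask = coarse_logits.sigmoid() > 0.5          # could seed point/box prompts
+print(coarse_mask.shape)
+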
+
+ comment: 8.5 pages + 2 pages of supplementary materials + 2 pages of + references, 3 figures, submitted to 5th MICCAI Workshop on Domain Adaptation + and Representation Transfer (DART) +
+
+
+
+
+ + ☆ Self-supervised Hypergraphs for Learning Multiple World Interpretations + + +
+ We present a method for learning multiple scene representations given a small +labeled set, by exploiting the relationships between such representations in +the form of a multi-task hypergraph. We also show how we can use the hypergraph +to improve a powerful pretrained VisTransformer model without any additional +labeled data. In our hypergraph, each node is an interpretation layer (e.g., +depth or segmentation) of the scene. Within each hyperedge, one or several +input nodes predict the layer at the output node. Thus, each node could be an +input node in some hyperedges and an output node in others. In this way, +multiple paths can reach the same node, to form ensembles from which we obtain +robust pseudolabels, which allow self-supervised learning in the hypergraph. We +test different ensemble models and different types of hyperedges and show +superior performance to other multi-task graph models in the field. We also +introduce Dronescapes, a large video dataset captured with UAVs in different +complex real-world scenes, with multiple representations, suitable for +multi-task learning. + +
+
+
+
+
+ + ☆ GAMER-MRIL identifies Disability-Related Brain Changes in Multiple + Sclerosis + + +
+ Objective: Identifying disability-related brain changes is important for +multiple sclerosis (MS) patients. Currently, there is no clear understanding +about which pathological features drive disability in single MS patients. In +this work, we propose a novel comprehensive approach, GAMER-MRIL, leveraging +whole-brain quantitative MRI (qMRI), convolutional neural network (CNN), and an +interpretability method from classifying MS patients with severe disability to +investigating relevant pathological brain changes. Methods: +One-hundred-sixty-six MS patients underwent 3T MRI acquisitions. qMRI +informative of microstructural brain properties was reconstructed, including +quantitative T1 (qT1), myelin water fraction (MWF), and neurite density index +(NDI). To fully utilize the qMRI, GAMER-MRIL extended a gated-attention-based +CNN (GAMER-MRI), which was developed to select patch-based qMRI important for a +given task/question, to the whole-brain image. To find out disability-related +brain regions, GAMER-MRIL modified a structure-aware interpretability method, +Layer-wise Relevance Propagation (LRP), to incorporate qMRI. Results: The test +performance was AUC=0.885. qT1 was the most sensitive measure related to +disability, followed by NDI. The proposed LRP approach obtained more +specifically relevant regions than other interpretability methods, including +the saliency map, the integrated gradients, and the original LRP. The relevant +regions included the corticospinal tract, where average qT1 and NDI +significantly correlated with patients' disability scores ($\rho$=-0.37 and +0.44). Conclusion: These results demonstrated that GAMER-MRIL can classify +patients with severe disability using qMRI and subsequently identify brain +regions potentially important to the integrity of the mobile function. +Significance: GAMER-MRIL holds promise for developing biomarkers and increasing +clinicians' trust in NN. + +
+
+
+
+
+ + ☆ SGDiff: A Style Guided Diffusion Model for Fashion Synthesis ACM MM'23 + + +
+ This paper reports on the development of \textbf{a novel style guided +diffusion model (SGDiff)} which overcomes certain weaknesses inherent in +existing models for image synthesis. The proposed SGDiff combines image +modality with a pretrained text-to-image diffusion model to facilitate creative +fashion image synthesis. It addresses the limitations of text-to-image +diffusion models by incorporating supplementary style guidance, substantially +reducing training costs, and overcoming the difficulties of controlling +synthesized styles with text-only inputs. This paper also introduces a new +dataset -- SG-Fashion, specifically designed for fashion image synthesis +applications, offering high-resolution images and an extensive range of garment +categories. By means of comprehensive ablation study, we examine the +application of classifier-free guidance to a variety of conditions and validate +the effectiveness of the proposed model for generating fashion images of the +desired categories, product attributes, and styles. The contributions of this +paper include a novel classifier-free guidance method for multi-modal feature +fusion, a comprehensive dataset for fashion image synthesis application, a +thorough investigation on conditioned text-to-image synthesis, and valuable +insights for future research in the text-to-image synthesis domain. The code +and dataset are available at: \url{https://github.com/taited/SGDiff}. + +
+
+ comment: Accepted by ACM MM'23 +
+
+
+
+
+ + ☆ AKVSR: Audio Knowledge Empowered Visual Speech Recognition by + Compressing Audio Knowledge of a Pretrained Model + + +
+ Visual Speech Recognition (VSR) is the task of predicting spoken words from +silent lip movements. VSR is regarded as a challenging task because of the +insufficient information on lip movements. In this paper, we propose an Audio +Knowledge empowered Visual Speech Recognition framework (AKVSR) to complement +the insufficient speech information of visual modality by using audio modality. +Different from the previous methods, the proposed AKVSR 1) utilizes rich audio +knowledge encoded by a large-scale pretrained audio model, 2) saves the +linguistic information of audio knowledge in compact audio memory by discarding +the non-linguistic information from the audio through quantization, and 3) +includes Audio Bridging Module which can find the best-matched audio features +from the compact audio memory, which makes our training possible without audio +inputs, once after the compact audio memory is composed. We validate the +effectiveness of the proposed method through extensive experiments, and achieve +new state-of-the-art performances on the widely-used datasets, LRS2 and LRS3. + +
+
+
+
+
+ + ☆ Graph-Segmenter: Graph Transformer with Boundary-aware Attention for + Semantic Segmentation + + +
+ The transformer-based semantic segmentation approaches, which divide the +image into different regions by sliding windows and model the relation inside +each window, have achieved outstanding success. However, since the relation +modeling between windows was not the primary emphasis of previous work, it was +not fully utilized. To address this issue, we propose a Graph-Segmenter, +including a Graph Transformer and a Boundary-aware Attention module, which is +an effective network for simultaneously modeling the more profound relation +between windows in a global view and various pixels inside each window as a +local one, and for substantial low-cost boundary adjustment. Specifically, we +treat every window and pixel inside the window as nodes to construct graphs for +both views and devise the Graph Transformer. The introduced boundary-aware +attention module optimizes the edge information of the target objects by +modeling the relationship between the pixel on the object's edge. Extensive +experiments on three widely used semantic segmentation datasets (Cityscapes, +ADE-20k and PASCAL Context) demonstrate that our proposed network, a Graph +Transformer with Boundary-aware Attention, can achieve state-of-the-art +segmentation performance. + +
+
+
+
+
+ + ☆ ADD: An Automatic Desensitization Fisheye Dataset for Autonomous Driving + + +
+ Autonomous driving systems require many images for analyzing the surrounding
+environment. However, there is little protection for private information in
+these captured images, such as pedestrian faces or vehicle license plates,
+which has become a significant issue. In this paper, in response to the call
+for data security laws and regulations, and based on the advantage of the large
+Field of View (FoV) of the fisheye camera, we build the first Autopilot
+Desensitization Dataset, called ADD, and formulate the first
+deep-learning-based image desensitization framework, to promote the study of
+image desensitization in autonomous driving scenarios. The compiled dataset
+consists of 650K images, including different face and vehicle license plate
+information captured by the surround-view fisheye camera. It covers various
+autonomous driving scenarios, including diverse facial characteristics and
+license plate colors. Then, we propose an efficient multitask desensitization
+network called DesCenterNet as a benchmark on the ADD dataset, which can
+perform face and vehicle license plate detection and desensitization tasks.
+Based on ADD, we further provide an evaluation criterion for desensitization
+performance, and extensive comparison experiments have verified the
+effectiveness and superiority of our method on image desensitization.
+
+
+
+
+
+ + ☆ AutoLTS: Automating Cycling Stress Assessment via Contrastive Learning + and Spatial Post-processing + + +
+ Cycling stress assessment, which quantifies cyclists' perceived stress
+imposed by the built environment and motor traffic, increasingly informs
+cycling infrastructure planning and cycling route recommendation. However,
+currently calculating cycling stress is slow and data-intensive, which hinders
+its broader application. In this paper, we propose a deep learning framework to
+support accurate, fast, and large-scale cycling stress assessments for urban
+road networks based on street-view images. Our framework features i) a
+contrastive learning approach that leverages the ordinal relationship among
+cycling stress labels, and ii) a post-processing technique that enforces
+spatial smoothness into our predictions. On a dataset of 39,153 road segments
+collected in Toronto, Canada, our results demonstrate the effectiveness of our
+deep learning framework and the value of using image data for cycling stress
+assessment in the absence of high-quality road geometry and motor traffic data.
+
+
+
+
+
+ + ☆ Story Visualization by Online Text Augmentation with Context Memory ICCV 2023 + + +
+ Story visualization (SV) is a challenging text-to-image generation task due
+to the difficulty of not only rendering visual details from the text
+descriptions but also encoding a long-term context across multiple sentences.
+While prior efforts mostly focus on generating a semantically relevant image
+for each sentence, encoding a context spread across the given paragraph to
+generate contextually convincing images (e.g., with a correct character or with
+a proper background of the scene) remains a challenge. To this end, we propose
+a novel memory architecture for Bi-directional Transformers with an online text
+augmentation that generates multiple pseudo-descriptions as supplementary
+supervision during training, for better generalization to language variation at
+inference. In extensive experiments on the two popular SV benchmarks, i.e., the
+Pororo-SV and Flintstones-SV, the proposed method significantly outperforms the
+state of the art in various evaluation metrics including FID, character F1,
+frame accuracy, BLEU-2/3, and R-precision, with similar or less computational
+complexity.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Synthetic data generation method for hybrid image-tabular data using two + generative adversarial networks + + +
+ The generation of synthetic medical records using generative adversarial
+networks (GANs) has become increasingly important for addressing privacy
+concerns and promoting data sharing in the medical field. In this paper, we
+propose a novel method for generating synthetic hybrid medical records
+consisting of chest X-ray images (CXRs) and structured tabular data (including
+anthropometric data and laboratory tests) using an auto-encoding GAN (αGAN) and
+a conditional tabular GAN (CTGAN). Our approach involves training an αGAN model
+on a large public database (pDB) to reduce the dimensionality of CXRs. We then
+applied the trained encoder of the GAN model to the images in the original
+database (oDB) to obtain the latent vectors. These latent vectors were combined
+with the tabular data in oDB, and these joint data were used to train the CTGAN
+model. We successfully generated diverse synthetic records of hybrid CXR and
+tabular data, maintaining correspondence between them. We evaluated this
+synthetic database (sDB) through visual assessment, the distribution of
+inter-record distances, and classification tasks. Our evaluation results showed
+that the sDB captured the features of the oDB while maintaining the
+correspondence between the images and tabular data. Although our approach
+relies on the availability of a large-scale pDB containing a substantial number
+of images with the same modality and imaging region as those in the oDB, this
+method has the potential for the public release of synthetic datasets without
+compromising the secondary use of data.
+
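+ A sketch of the joint-data step, assuming the open-source ctgan package and
+using random placeholders for the image latents and tabular records (the real
+pipeline trains the image GAN first and encodes oDB images into these latents):
+
+import numpy as np
+import pandas as pd
+from ctgan import CTGAN                      # assumed dependency
+
+n, latent_dim = 500, 8
+latents = pd.DataFrame(np.random.randn(n, latent_dim),
+                       columns=[f"z{i}" for i in range(latent_dim)])
+tabular = pd.DataFrame({"age": np.random.randint(20, 90, n),
+                        "sex": np.random.choice(["M", "F"], n),
+                        "wbc": np.random.normal(7.0, 2.0, n)})
+joint = pd.concat([latents, tabular], axis=1)
+
+model = CTGAN(epochs=5)
+model.fit(joint, discrete_columns=["sex"])
+synthetic = model.sample(10)                 # synthetic latent + tabular rows
+# The synthetic latent columns would then be decoded back to images with the
+# image GAN's generator to obtain paired CXR/tabular records.
+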
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Ske2Grid: Skeleton-to-Grid Representation Learning for Action + Recognition ICML 2023 + + +
+ This paper presents Ske2Grid, a new representation learning framework for +improved skeleton-based action recognition. In Ske2Grid, we define a regular +convolution operation upon a novel grid representation of human skeleton, which +is a compact image-like grid patch constructed and learned through three novel +designs. Specifically, we propose a graph-node index transform (GIT) to +construct a regular grid patch through assigning the nodes in the skeleton +graph one by one to the desired grid cells. To ensure that GIT is a bijection +and enrich the expressiveness of the grid representation, an up-sampling +transform (UPT) is learned to interpolate the skeleton graph nodes for filling +the grid patch to the full. To resolve the problem when the one-step UPT is +aggressive and further exploit the representation capability of the grid patch +with increasing spatial size, a progressive learning strategy (PLS) is proposed +which decouples the UPT into multiple steps and aligns them to multiple paired +GITs through a compact cascaded design learned progressively. We construct +networks upon prevailing graph convolution networks and conduct experiments on +six mainstream skeleton-based action recognition datasets. Experiments show +that our Ske2Grid significantly outperforms existing GCN-based solutions under +different benchmark settings, without bells and whistles. Code and models are +available at https://github.com/OSVAI/Ske2Grid + +
+
+ comment: The paper of Ske2Grid is published at ICML 2023. Code and models are + available at https://github.com/OSVAI/Ske2Grid +
+
+
+
+
+ + ☆ Action Class Relation Detection and Classification Across Multiple Video + Datasets + + +
+ The Meta Video Dataset (MetaVD) provides annotated relations between action
+classes in major datasets for human action recognition in videos. Although
+these annotated relations enable dataset augmentation, this benefit only
+applies to datasets covered by MetaVD. For an external dataset to enjoy the
+same benefit, the relations between its action classes and those in MetaVD need
+to be determined. To address this issue, we consider two new machine learning
+tasks: action class relation detection and classification. We propose a unified
+model to predict relations between action classes, using language and visual
+information associated with the classes. Experimental results show that (i)
+recent pre-trained neural network models for texts and videos contribute to
+high predictive performance, (ii) relation prediction based on action label
+texts is more accurate than that based on videos, and (iii) a blending approach
+that combines predictions from both modalities can further improve predictive
+performance in some cases.
+
+
+ comment: Accepted to Pattern Recognition Letters. 12 pages, 4 figures +
+
+
+
+
+ + ☆ SST: A Simplified Swin Transformer-based Model for Taxi Destination + Prediction based on Existing Trajectory SC + + +
+ Accurately predicting the destination of taxi trajectories can have various +benefits for intelligent location-based services. One potential method to +accomplish this prediction is by converting the taxi trajectory into a +two-dimensional grid and using computer vision techniques. While the Swin +Transformer is an innovative computer vision architecture with demonstrated +success in vision downstream tasks, it is not commonly used to solve real-world +trajectory problems. In this paper, we propose a simplified Swin Transformer +(SST) structure that does not use the shifted window idea in the traditional +Swin Transformer, as trajectory data is consecutive in nature. Our +comprehensive experiments, based on real trajectory data, demonstrate that SST +can achieve higher accuracy compared to state-of-the-art methods. + +
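+
+The preprocessing idea described above, converting a trajectory into a
+two-dimensional grid that a vision backbone can consume, can be sketched as
+follows; the grid resolution, bounding box, and visiting-order encoding are
+assumptions for illustration rather than the SST paper's exact recipe.
+
+```python
+import numpy as np
+
+def trajectory_to_grid(points: np.ndarray, bbox, grid_size: int = 224) -> np.ndarray:
+    """points: (T, 2) array of (lat, lon) inside bbox; bbox: (lat_min, lat_max, lon_min, lon_max)."""
+    lat_min, lat_max, lon_min, lon_max = bbox
+    grid = np.zeros((grid_size, grid_size), dtype=np.float32)
+    rows = ((points[:, 0] - lat_min) / (lat_max - lat_min) * (grid_size - 1)).astype(int)
+    cols = ((points[:, 1] - lon_min) / (lon_max - lon_min) * (grid_size - 1)).astype(int)
+    for t, (r, c) in enumerate(zip(rows, cols)):
+        grid[r, c] = (t + 1) / len(points)   # encode visiting order so temporal information survives
+    return grid
+
+# toy usage: a three-point trajectory rasterized into an image-like input
+traj = np.array([[30.65, 104.06], [30.66, 104.07], [30.67, 104.09]])
+img = trajectory_to_grid(traj, bbox=(30.6, 30.8, 104.0, 104.2))
+```
+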
+
+ comment: Accepted by IEEE ITSC +
+
+
+
+
+ + ☆ Multi-view 3D Face Reconstruction Based on Flame + + +
+ At present, 3D face reconstruction has broad application prospects in various
+fields, but research on it is still at an early stage. In this paper, we aim to
+achieve better 3D face reconstruction quality by combining a multi-view
+training framework with the parametric face model Flame, and propose a
+multi-view training and testing model, MFNet (Multi-view Flame Network). We
+build a self-supervised training framework, implement constraints such as a
+multi-view optical flow loss and a face landmark loss, and finally obtain a
+complete MFNet. We propose innovative implementations of the multi-view optical
+flow loss and the co-visible mask. We test our model on the AFLW and FaceScape
+datasets and also take pictures of our own faces to reconstruct 3D faces while
+simulating actual scenarios as closely as possible, achieving good results. Our
+work mainly addresses the problem of combining parametric face models with
+multi-view 3D face reconstruction and explores a Flame-based multi-view
+training and testing framework, contributing to the field of 3D face
+reconstruction.
+
+
+
+
+
+ + ☆ 3DHacker: Spectrum-based Decision Boundary Generation for Hard-label 3D + Point Cloud Attack ICCV 2023 + + +
+ With the maturity of depth sensors, the vulnerability of 3D point cloud models
+has received increasing attention in various applications such as autonomous
+driving and robot navigation. Previous 3D adversarial attackers either follow
+the white-box setting to iteratively update the coordinate perturbations based
+on gradients, or utilize the output model logits to estimate noisy gradients in
+the black-box setting. However, these attack methods are hard to deploy in
+real-world scenarios since realistic 3D applications will not share any model
+details with users. Therefore, we explore a more challenging yet practical 3D
+attack setting, \textit{i.e.}, attacking point clouds with black-box hard
+labels, in which the attacker can only access the prediction label of the
+input. To tackle this setting, we propose a novel 3D attack method, termed
+\textbf{3D} \textbf{H}ard-label att\textbf{acker} (\textbf{3DHacker}), based on
+a decision boundary algorithm developed to generate adversarial samples solely
+with the knowledge of class labels. Specifically, to construct the class-aware
+model decision boundary, 3DHacker first randomly fuses two point clouds of
+different classes in the spectral domain to craft their intermediate sample
+with high imperceptibility, then projects it onto the decision boundary via
+binary search. To restrict the final perturbation size, 3DHacker further
+introduces an iterative optimization strategy to move the intermediate sample
+along the decision boundary, generating adversarial point clouds with minimal
+perturbations. Extensive evaluations show that, even in the challenging
+hard-label setting, 3DHacker still competitively outperforms existing 3D
+attacks in terms of both attack performance and adversarial sample quality.
+
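+
+The binary-search projection onto the decision boundary mentioned above is
+straightforward to sketch for a hard-label oracle; the spectral-domain fusion
+and the boundary-walking optimization of 3DHacker are not reproduced here, and
+`model_predict` is an assumed black-box that returns only a class label.
+
+```python
+import numpy as np
+
+def boundary_binary_search(benign, adversarial, true_label, model_predict, steps=20):
+    """benign, adversarial: (N, 3) point clouds; model_predict returns only a class label.
+    Returns a point cloud just on the adversarial side of the decision boundary."""
+    lo, hi = 0.0, 1.0                      # interpolation weight toward the adversarial sample
+    for _ in range(steps):
+        mid = (lo + hi) / 2.0
+        candidate = (1.0 - mid) * benign + mid * adversarial
+        if model_predict(candidate) != true_label:
+            hi = mid                       # still misclassified: move closer to the benign input
+        else:
+            lo = mid                       # crossed back: back off toward the adversarial sample
+    return (1.0 - hi) * benign + hi * adversarial
+
+# toy usage with a dummy oracle that labels clouds by their mean z-coordinate
+oracle = lambda pc: int(pc[:, 2].mean() > 0.0)
+pc_benign = np.random.default_rng(0).normal(size=(64, 3)) - 1.0
+pc_adv = pc_benign + np.array([0.0, 0.0, 3.0])
+boundary_pc = boundary_binary_search(pc_benign, pc_adv, true_label=0, model_predict=oracle)
+```
+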
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Multimodal Dataset Distillation for Image-Text Retrieval + + +
+ Dataset distillation methods offer the promise of reducing a large-scale
+dataset down to a significantly smaller set of (potentially synthetic) training
+examples, which preserve sufficient information for training a new model from
+scratch. So far, dataset distillation methods have been developed for image
+classification. However, with the rise in capabilities of vision-language
+models, and especially given the scale of datasets necessary to train these
+models, the time is ripe to expand dataset distillation methods beyond image
+classification. In this work, we take the first steps towards this goal by
+expanding on the idea of trajectory matching to create a distillation method
+for vision-language datasets. The key challenge is that vision-language
+datasets do not have a set of discrete classes. To overcome this, our proposed
+multimodal dataset distillation method jointly distills the images and their
+corresponding language descriptions in a contrastive formulation. Since there
+are no existing baselines, we compare our approach to three coreset selection
+methods (strategic subsampling of the training dataset), which we adapt to the
+vision-language setting. We demonstrate significant improvements on the
+challenging Flickr30K and COCO retrieval benchmarks: the best coreset selection
+method, which selects 1000 image-text pairs for training, achieves only 5.6%
+image-to-text retrieval accuracy (recall@1); in contrast, our dataset
+distillation approach almost doubles that with just 100 (an order of magnitude
+fewer) training pairs.
+
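+
+The contrastive formulation mentioned above can be illustrated with a
+CLIP-style symmetric InfoNCE objective over distilled image-text pairs; the
+encoders, the trajectory-matching outer loop, and the temperature value are
+omitted or assumed for this sketch.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def contrastive_loss(img_emb: torch.Tensor, txt_emb: torch.Tensor, temperature: float = 0.07):
+    """img_emb, txt_emb: (B, D) embeddings of matching image-text pairs (row i matches row i)."""
+    img_emb = F.normalize(img_emb, dim=-1)
+    txt_emb = F.normalize(txt_emb, dim=-1)
+    logits = img_emb @ txt_emb.t() / temperature        # (B, B) similarity matrix
+    targets = torch.arange(img_emb.size(0))             # matching pairs lie on the diagonal
+    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))
+
+# toy usage on 100 distilled pairs with 512-dimensional embeddings
+loss = contrastive_loss(torch.randn(100, 512), torch.randn(100, 512))
+```
+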
+
+ comment: 28 pages, 11 figures +
+
+
+
+
+ + ☆ Visual and Textual Prior Guided Mask Assemble for Few-Shot Segmentation + and Beyond + + +
+ Few-shot segmentation (FSS) aims to segment novel classes with a few annotated
+images. Due to CLIP's advantages in aligning visual and textual information,
+the integration of CLIP can enhance the generalization ability of an FSS model.
+However, even with the CLIP model, existing CLIP-based FSS methods are still
+subject to biased predictions towards base classes, which are caused by
+class-specific feature-level interactions. To solve this issue, we propose a
+visual and textual Prior Guided Mask Assemble Network (PGMA-Net). It employs a
+class-agnostic mask assembly process to alleviate the bias, and formulates
+diverse tasks in a unified manner by assembling the prior through affinity.
+Specifically, the class-relevant textual and visual features are first
+transformed into a class-agnostic prior in the form of a probability map. Then,
+a Prior-Guided Mask Assemble Module (PGMAM) including multiple General Assemble
+Units (GAUs) is introduced. It considers diverse and plug-and-play
+interactions, such as visual-textual, inter- and intra-image, training-free,
+and high-order ones. Lastly, to ensure the class-agnostic ability, a
+Hierarchical Decoder with Channel-Drop Mechanism (HDCDM) is proposed to
+flexibly exploit the assembled masks and low-level features, without relying on
+any class-specific information. It achieves new state-of-the-art results in the
+FSS task, with mIoU of $77.6$ on $\text{PASCAL-}5^i$ and $59.4$ on
+$\text{COCO-}20^i$ in the 1-shot scenario. Beyond this, we show that without
+extra re-training, the proposed PGMA-Net can solve bbox-level and cross-domain
+FSS, co-segmentation, and zero-shot segmentation (ZSS) tasks, leading to an
+any-shot segmentation framework.
+
+
+
+
+
+ + ☆ AttMOT: Improving Multiple-Object Tracking by Introducing Auxiliary + Pedestrian Attributes + + +
+ Multi-object tracking (MOT) is a fundamental problem in computer vision with +numerous applications, such as intelligent surveillance and automated driving. +Despite the significant progress made in MOT, pedestrian attributes, such as +gender, hairstyle, body shape, and clothing features, which contain rich and +high-level information, have been less explored. To address this gap, we +propose a simple, effective, and generic method to predict pedestrian +attributes to support general Re-ID embedding. We first introduce AttMOT, a +large, highly enriched synthetic dataset for pedestrian tracking, containing +over 80k frames and 6 million pedestrian IDs with different time, weather +conditions, and scenarios. To the best of our knowledge, AttMOT is the first +MOT dataset with semantic attributes. Subsequently, we explore different +approaches to fuse Re-ID embedding and pedestrian attributes, including +attention mechanisms, which we hope will stimulate the development of +attribute-assisted MOT. The proposed method AAM demonstrates its effectiveness +and generality on several representative pedestrian multi-object tracking +benchmarks, including MOT17 and MOT20, through experiments on the AttMOT +dataset. When applied to state-of-the-art trackers, AAM achieves consistent +improvements in MOTA, HOTA, AssA, IDs, and IDF1 scores. For instance, on MOT17, +the proposed method yields a +1.1 MOTA, +1.7 HOTA, and +1.8 IDF1 improvement +when used with FairMOT. To encourage further research on attribute-assisted +MOT, we will release the AttMOT dataset. + +
+
+
+
+
+ + ☆ Improved Region Proposal Network for Enhanced Few-Shot Object Detection + + +
+ Despite the significant success of deep learning in object detection tasks,
+the standard training of deep neural networks requires access to a substantial
+quantity of annotated images across all classes. Data annotation is an arduous
+and time-consuming endeavor, particularly when dealing with infrequent objects.
+Few-shot object detection (FSOD) methods have emerged as a solution to the
+limitations of classic object detection approaches based on deep learning. FSOD
+methods demonstrate remarkable performance by achieving robust object detection
+using a significantly smaller amount of training data. A challenge for FSOD is
+that instances from novel classes that do not belong to the fixed set of
+training classes appear in the background, and the base model may pick them up
+as potential objects. These objects behave similarly to label noise because
+they are classified as one of the training dataset classes, leading to FSOD
+performance degradation. We develop a semi-supervised algorithm to detect and
+then utilize these unlabeled novel objects as positive samples during the FSOD
+training stage to improve FSOD performance. Specifically, we develop a
+hierarchical ternary classification region proposal network (HTRPN) to localize
+the potential unlabeled novel objects and assign them new objectness labels to
+distinguish these objects from the base training dataset classes. Our improved
+hierarchical sampling strategy for the region proposal network (RPN) also
+boosts the perception ability of the object detection model for large objects.
+We test our approach on the COCO and PASCAL VOC benchmarks that are commonly
+used in the FSOD literature. Our experimental results indicate that our method
+is effective and outperforms the existing state-of-the-art (SOTA) FSOD methods.
+Our implementation is provided as a supplement to support reproducibility of
+the results.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.10422 +
+
+
+
+
+ + ☆ Confidence Contours: Uncertainty-Aware Annotation for Medical Semantic + Segmentation + + +
+ Medical image segmentation modeling is a high-stakes task where understanding +of uncertainty is crucial for addressing visual ambiguity. Prior work has +developed segmentation models utilizing probabilistic or generative mechanisms +to infer uncertainty from labels where annotators draw a singular boundary. +However, as these annotations cannot represent an individual annotator's +uncertainty, models trained on them produce uncertainty maps that are difficult +to interpret. We propose a novel segmentation representation, Confidence +Contours, which uses high- and low-confidence ``contours'' to capture +uncertainty directly, and develop a novel annotation system for collecting +contours. We conduct an evaluation on the Lung Image Dataset Consortium (LIDC) +and a synthetic dataset. From an annotation study with 30 participants, results +show that Confidence Contours provide high representative capacity without +considerably higher annotator effort. We also find that general-purpose +segmentation models can learn Confidence Contours at the same performance level +as standard singular annotations. Finally, from interviews with 5 medical +experts, we find that Confidence Contour maps are more interpretable than +Bayesian maps due to representation of structural uncertainty. + +
+
+ comment: 10 pages content, 12 pages total. Accepted to HCOMP '23 +
+
+
+
+
+ + ☆ Boosting Semi-Supervised Learning by bridging high and low-confidence + predictions ICCV + + +
+ Pseudo-labeling is a crucial technique in semi-supervised learning (SSL), +where artificial labels are generated for unlabeled data by a trained model, +allowing for the simultaneous training of labeled and unlabeled data in a +supervised setting. However, several studies have identified three main issues +with pseudo-labeling-based approaches. Firstly, these methods heavily rely on +predictions from the trained model, which may not always be accurate, leading +to a confirmation bias problem. Secondly, the trained model may be overfitted +to easy-to-learn examples, ignoring hard-to-learn ones, resulting in the +\textit{"Matthew effect"} where the already strong become stronger and the weak +weaker. Thirdly, most of the low-confidence predictions of unlabeled data are +discarded due to the use of a high threshold, leading to an underutilization of +unlabeled data during training. To address these issues, we propose a new +method called ReFixMatch, which aims to utilize all of the unlabeled data +during training, thus improving the generalizability of the model and +performance on SSL benchmarks. Notably, ReFixMatch achieves 41.05\% top-1 +accuracy with 100k labeled examples on ImageNet, outperforming the baseline +FixMatch and current state-of-the-art methods. + +
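+
+For reference, the confidence-thresholded pseudo-labeling step that FixMatch
+uses, and that ReFixMatch aims to improve on by also exploiting low-confidence
+predictions, can be sketched as follows (the threshold and shapes are
+assumptions for illustration).
+
+```python
+import torch
+import torch.nn.functional as F
+
+def pseudo_label_loss(logits_weak: torch.Tensor, logits_strong: torch.Tensor, threshold: float = 0.95):
+    """logits_weak/logits_strong: (B, K) predictions on weak and strong augmentations
+    of the same unlabeled batch; only confident pseudo-labels contribute to the loss."""
+    probs = torch.softmax(logits_weak.detach(), dim=-1)
+    conf, pseudo = probs.max(dim=-1)
+    mask = (conf >= threshold).float()                 # low-confidence samples are discarded here
+    loss = F.cross_entropy(logits_strong, pseudo, reduction="none")
+    return (loss * mask).mean()
+
+# toy usage on a batch of 8 unlabeled samples and 10 classes
+loss = pseudo_label_loss(torch.randn(8, 10), torch.randn(8, 10))
+```
+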
+
+ comment: Accepted to ICCVW2023 (Workshop on representation learning with very + limited images: the potential of self-, synthetic- and formula-supervision) +
+
+
+
+
+ + ☆ Benchmarking Scalable Epistemic Uncertainty Quantification in Organ + Segmentation MICCAI 2023 + + +
+ Deep learning based methods for automatic organ segmentation have shown +promise in aiding diagnosis and treatment planning. However, quantifying and +understanding the uncertainty associated with model predictions is crucial in +critical clinical applications. While many techniques have been proposed for +epistemic or model-based uncertainty estimation, it is unclear which method is +preferred in the medical image analysis setting. This paper presents a +comprehensive benchmarking study that evaluates epistemic uncertainty +quantification methods in organ segmentation in terms of accuracy, uncertainty +calibration, and scalability. We provide a comprehensive discussion of the +strengths, weaknesses, and out-of-distribution detection capabilities of each +method as well as recommendations for future improvements. These findings +contribute to the development of reliable and robust models that yield accurate +segmentations while effectively quantifying epistemic uncertainty. + +
+
+ comment: Accepted to the UNSURE Workshop held in conjunction with MICCAI 2023 +
+
+
+
+
+ + ☆ ICAFusion: Iterative Cross-Attention Guided Feature Fusion for + Multispectral Object Detection + + +
+ Effective feature fusion of multispectral images plays a crucial role in
+multispectral object detection. Previous studies have demonstrated the
+effectiveness of feature fusion using convolutional neural networks, but these
+methods are sensitive to image misalignment due to the inherent deficiency of
+local-range feature interaction, resulting in performance degradation. To
+address this issue, a novel feature fusion framework of dual cross-attention
+transformers is proposed to model global feature interaction and capture
+complementary information across modalities simultaneously. This framework
+enhances the discriminability of object features through the query-guided
+cross-attention mechanism, leading to improved performance. However, stacking
+multiple transformer blocks for feature enhancement incurs a large number of
+parameters and high spatial complexity. To handle this, inspired by the human
+process of reviewing knowledge, an iterative interaction mechanism is proposed
+to share parameters among block-wise multimodal transformers, reducing model
+complexity and computation cost. The proposed method is general and can be
+effectively integrated into different detection frameworks and used with
+different backbones. Experimental results on the KAIST, FLIR, and VEDAI
+datasets show that the proposed method achieves superior performance and faster
+inference, making it suitable for various practical scenarios. Code will be
+available at https://github.com/chanchanchan97/ICAFusion.
+
+
+ comment: submitted to Pattern Recognition Journal, minor revision +
+
+
+
+
+ + ☆ Deep Learning Framework for Spleen Volume Estimation from 2D + Cross-sectional Views + + +
+ Abnormal spleen enlargement (splenomegaly) is regarded as a clinical +indicator for a range of conditions, including liver disease, cancer and blood +diseases. While spleen length measured from ultrasound images is a commonly +used surrogate for spleen size, spleen volume remains the gold standard metric +for assessing splenomegaly and the severity of related clinical conditions. +Computed tomography is the main imaging modality for measuring spleen volume, +but it is less accessible in areas where there is a high prevalence of +splenomegaly (e.g., the Global South). Our objective was to enable automated +spleen volume measurement from 2D cross-sectional segmentations, which can be +obtained from ultrasound imaging. In this study, we describe a variational +autoencoder-based framework to measure spleen volume from single- or dual-view +2D spleen segmentations. We propose and evaluate three volume estimation +methods within this framework. We also demonstrate how 95\% confidence +intervals of volume estimates can be produced to make our method more +clinically useful. Our best model achieved mean relative volume accuracies of +86.62\% and 92.58\% for single- and dual-view segmentations, respectively, +surpassing the performance of the clinical standard approach of linear +regression using manual measurements and a comparative deep learning-based +2D-3D reconstruction-based approach. The proposed spleen volume estimation +framework can be integrated into standard clinical workflows which currently +use 2D ultrasound images to measure spleen length. To the best of our +knowledge, this is the first work to achieve direct 3D spleen volume estimation +from 2D spleen segmentations. + +
+
+ comment: 22 pages, 7 figures +
+
+
+
+
+ + ☆ Shortcut-V2V: Compression Framework for Video-to-Video Translation based + on Temporal Redundancy Reduction + + +
+ Video-to-video translation aims to generate video frames of a target domain
+from an input video. Despite its usefulness, the existing networks require
+enormous computations, necessitating their model compression for wide use.
+While there exist compression methods that improve computational efficiency in
+various image/video tasks, a generally-applicable compression method for
+video-to-video translation has not been studied much. In response, we present
+Shortcut-V2V, a general-purpose compression framework for video-to-video
+translation. Shortcut-V2V avoids full inference for every neighboring video
+frame by approximating the intermediate features of a current frame from those
+of the previous frame. Moreover, in our framework, a newly-proposed block
+called AdaBD adaptively blends and deforms features of neighboring frames,
+which makes more accurate predictions of the intermediate features possible. We
+conduct quantitative and qualitative evaluations using well-known
+video-to-video translation models on various tasks to demonstrate the general
+applicability of our framework. The results show that Shortcut-V2V achieves
+performance comparable to the original video-to-video translation model while
+saving 3.2-5.7x computational cost and 7.8-44x memory at test time.
+
+
+ comment: to be updated +
+
+
+
+
+ + ☆ $A^2$Nav: Action-Aware Zero-Shot Robot Navigation by Exploiting + Vision-and-Language Ability of Foundation Models + + +
+ We study the task of zero-shot vision-and-language navigation (ZS-VLN), a +practical yet challenging problem in which an agent learns to navigate +following a path described by language instructions without requiring any +path-instruction annotation data. Normally, the instructions have complex +grammatical structures and often contain various action descriptions (e.g., +"proceed beyond", "depart from"). How to correctly understand and execute these +action demands is a critical problem, and the absence of annotated data makes +it even more challenging. Note that a well-educated human being can easily +understand path instructions without the need for any special training. In this +paper, we propose an action-aware zero-shot VLN method ($A^2$Nav) by exploiting +the vision-and-language ability of foundation models. Specifically, the +proposed method consists of an instruction parser and an action-aware +navigation policy. The instruction parser utilizes the advanced reasoning +ability of large language models (e.g., GPT-3) to decompose complex navigation +instructions into a sequence of action-specific object navigation sub-tasks. +Each sub-task requires the agent to localize the object and navigate to a +specific goal position according to the associated action demand. To accomplish +these sub-tasks, an action-aware navigation policy is learned from freely +collected action-specific datasets that reveal distinct characteristics of each +action demand. We use the learned navigation policy for executing sub-tasks +sequentially to follow the navigation instruction. Extensive experiments show +$A^2$Nav achieves promising ZS-VLN performance and even surpasses the +supervised learning methods on R2R-Habitat and RxR-Habitat datasets. + +
+
+
+
+
+ + ☆ YODA: You Only Diffuse Areas. An Area-Masked Diffusion Approach For + Image Super-Resolution + + +
+ This work introduces "You Only Diffuse Areas" (YODA), a novel method for +partial diffusion in Single-Image Super-Resolution (SISR). The core idea is to +utilize diffusion selectively on spatial regions based on attention maps +derived from the low-resolution image and the current time step in the +diffusion process. This time-dependent targeting enables a more effective +conversion to high-resolution outputs by focusing on areas that benefit the +most from the iterative refinement process, i.e., detail-rich objects. We +empirically validate YODA by extending leading diffusion-based SISR methods SR3 +and SRDiff. Our experiments demonstrate new state-of-the-art performance gains +in face and general SR across PSNR, SSIM, and LPIPS metrics. A notable finding +is YODA's stabilization effect on training by reducing color shifts, especially +when induced by small batch sizes, potentially contributing to +resource-constrained scenarios. The proposed spatial and temporal adaptive +diffusion mechanism opens promising research directions, including developing +enhanced attention map extraction techniques and optimizing inference latency +based on sparser diffusion. + +
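+
+The area-masked idea described above can be pictured as blending, at every
+reverse diffusion step, the denoiser output only in attention-selected regions
+while keeping the remaining pixels unchanged; the mask schedule below is an
+assumed illustration, not the attention-map extraction used by YODA.
+
+```python
+import torch
+
+def time_dependent_mask(attn: torch.Tensor, t: int, num_steps: int) -> torch.Tensor:
+    """attn: (B, 1, H, W) attention map; keeps a shrinking fraction of pixels as t grows
+    (an assumed schedule used only for this illustration)."""
+    keep_ratio = 1.0 - t / num_steps
+    thresh = torch.quantile(attn.flatten(1), 1.0 - keep_ratio, dim=1).view(-1, 1, 1, 1)
+    return (attn >= thresh).float()
+
+def masked_diffusion_step(x_t: torch.Tensor, denoised: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+    """Only masked regions take the (expensive) denoiser output; the rest is carried over."""
+    return mask * denoised + (1.0 - mask) * x_t
+
+# toy usage for one step of a 50-step schedule
+attn = torch.rand(2, 1, 32, 32)
+x_next = masked_diffusion_step(torch.randn(2, 3, 32, 32), torch.randn(2, 3, 32, 32),
+                               time_dependent_mask(attn, t=10, num_steps=50))
+```
+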
+
+ comment: Brian B. Moser and Stanislav Frolov contributed equally +
+
+
+
+
+ + ☆ Boosting Cross-Quality Face Verification using Blind Face Restoration + + +
+ In recent years, various Blind Face Restoration (BFR) techniques have been
+developed. These techniques transform low-quality faces suffering from multiple
+degradations into more realistic and natural face images with high perceptual
+quality. However, for the task of face verification it is crucial not only to
+enhance the perceptual quality of the low-quality images but also to improve
+the biometric-utility face quality metrics. Furthermore, preserving the
+valuable identity information is of great importance. In this paper, we
+investigate the impact of applying three state-of-the-art blind face
+restoration techniques, namely GFP-GAN, GPEN, and SGPN, on the performance of a
+face verification system in a very challenging environment characterized by
+very low quality images. Extensive experimental results on the recently
+proposed cross-quality LFW database using three state-of-the-art deep face
+recognition models demonstrate the effectiveness of GFP-GAN in significantly
+boosting face verification accuracy.
+
+
+ comment: paper accepted at BIOSIG 2023 conference +
+
+
+
+
+ + ☆ Leveraging Symmetries in Pick and Place + + +
+ Robotic pick and place tasks are symmetric under translations and rotations +of both the object to be picked and the desired place pose. For example, if the +pick object is rotated or translated, then the optimal pick action should also +rotate or translate. The same is true for the place pose; if the desired place +pose changes, then the place action should also transform accordingly. A +recently proposed pick and place framework known as Transporter Net captures +some of these symmetries, but not all. This paper analytically studies the +symmetries present in planar robotic pick and place and proposes a method of +incorporating equivariant neural models into Transporter Net in a way that +captures all symmetries. The new model, which we call Equivariant Transporter +Net, is equivariant to both pick and place symmetries and can immediately +generalize pick and place knowledge to different pick and place poses. We +evaluate the new model empirically and show that it is much more sample +efficient than the non-symmetric version, resulting in a system that can +imitate demonstrated pick and place behavior using very few human +demonstrations on a variety of imitation learning tasks. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2202.09400 +
+
+
+
+
+ + ♻ ☆ Tirtha -- An Automated Platform to Crowdsource Images and Create 3D + Models of Heritage Sites + + +
+ Digital preservation of Cultural Heritage (CH) sites is crucial to protect +them against damage from natural disasters or human activities. Creating 3D +models of CH sites has become a popular method of digital preservation thanks +to advancements in computer vision and photogrammetry. However, the process is +time-consuming, expensive, and typically requires specialized equipment and +expertise, posing challenges in resource-limited developing countries. +Additionally, the lack of an open repository for 3D models hinders research and +public engagement with their heritage. To address these issues, we propose +Tirtha, a web platform for crowdsourcing images of CH sites and creating their +3D models. Tirtha utilizes state-of-the-art Structure from Motion (SfM) and +Multi-View Stereo (MVS) techniques. It is modular, extensible and +cost-effective, allowing for the incorporation of new techniques as +photogrammetry advances. Tirtha is accessible through a web interface at +https://tirtha.niser.ac.in and can be deployed on-premise or in a cloud +environment. In our case studies, we demonstrate the pipeline's effectiveness +by creating 3D models of temples in Odisha, India, using crowdsourced images. +These models are available for viewing, interaction, and download on the Tirtha +website. Our work aims to provide a dataset of crowdsourced images and 3D +reconstructions for research in computer vision, heritage conservation, and +related domains. Overall, Tirtha is a step towards democratizing digital +preservation, primarily in resource-limited developing countries. + +
+
+ comment: Accepted at The 28th International ACM Conference on 3D Web + Technology (Web3D 2023) +
+
+
+
+
+ + ♻ ☆ Whose Emotion Matters? Speaking Activity Localisation without Prior + Knowledge + + +
+ The task of emotion recognition in conversations (ERC) benefits from the
+availability of multiple modalities, as provided, for example, in the
+video-based Multimodal EmotionLines Dataset (MELD). However, only a few
+research approaches use both acoustic and visual information from the MELD
+videos. There are two reasons for this: First, label-to-video alignments in
+MELD are noisy, making those videos an unreliable source of emotional speech
+data. Second, conversations can involve several people in the same scene, which
+requires the localisation of the utterance source. In this paper, we introduce
+MELD with Fixed Audiovisual Information via Realignment (MELD-FAIR). By using
+recent active speaker detection and automatic speech recognition models, we are
+able to realign the videos of MELD and capture the facial expressions from
+speakers in 96.92% of the utterances provided in MELD. Experiments with a
+self-supervised voice recognition model indicate that the realigned MELD-FAIR
+videos more closely match the transcribed utterances given in the MELD dataset.
+Finally, we devise a model for emotion recognition in conversations trained on
+the realigned MELD-FAIR videos, which outperforms state-of-the-art models for
+ERC based on vision alone. This indicates that localising the source of
+speaking activities is indeed effective for extracting facial expressions from
+the uttering speakers and that faces provide more informative visual cues than
+the visual features state-of-the-art models have been using so far. The
+MELD-FAIR realignment data, and the code for the realignment procedure and the
+emotion recognition model, are available at
+https://github.com/knowledgetechnologyuhh/MELD-FAIR.
+
+
+ comment: 17 pages, 8 figures, 7 tables, Published in Neurocomputing +
+
+
+
+
+ + ♻ ☆ SEMI-CenterNet: A Machine Learning Facilitated Approach for + Semiconductor Defect Inspection + + +
+ Continual shrinking of pattern dimensions in the semiconductor domain is +making it increasingly difficult to inspect defects due to factors such as the +presence of stochastic noise and the dynamic behavior of defect patterns and +types. Conventional rule-based methods and non-parametric supervised machine +learning algorithms like KNN mostly fail at the requirements of semiconductor +defect inspection at these advanced nodes. Deep Learning (DL)-based methods +have gained popularity in the semiconductor defect inspection domain because +they have been proven robust towards these challenging scenarios. In this +research work, we have presented an automated DL-based approach for efficient +localization and classification of defects in SEM images. We have proposed +SEMI-CenterNet (SEMI-CN), a customized CN architecture trained on SEM images of +semiconductor wafer defects. The use of the proposed CN approach allows +improved computational efficiency compared to previously studied DL models. +SEMI-CN gets trained to output the center, class, size, and offset of a defect +instance. This is different from the approach of most object detection models +that use anchors for bounding box prediction. Previous methods predict +redundant bounding boxes, most of which are discarded in postprocessing. CN +mitigates this by only predicting boxes for likely defect center points. We +train SEMI-CN on two datasets and benchmark two ResNet backbones for the +framework. Initially, ResNet models pretrained on the COCO dataset undergo +training using two datasets separately. Primarily, SEMI-CN shows significant +improvement in inference time against previous research works. Finally, +transfer learning (using weights of custom SEM dataset) is applied from ADI +dataset to AEI dataset and vice-versa, which reduces the required training time +for both backbones to reach the best mAP against conventional training method. + +
+
+
+
+
+ + ♻ ☆ DIG In: Evaluating Disparities in Image Generations with Indicators for + Geographic Diversity + + +
+ The unprecedented photorealistic results achieved by recent text-to-image +generative systems and their increasing use as plug-and-play content creation +solutions make it crucial to understand their potential biases. In this work, +we introduce three indicators to evaluate the realism, diversity and +prompt-generation consistency of text-to-image generative systems when prompted +to generate objects from across the world. Our indicators complement +qualitative analysis of the broader impact of such systems by enabling +automatic and efficient benchmarking of geographic disparities, an important +step towards building responsible visual content creation systems. We use our +proposed indicators to analyze potential geographic biases in state-of-the-art +visual content creation systems and find that: (1) models have less realism and +diversity of generations when prompting for Africa and West Asia than Europe, +(2) prompting with geographic information comes at a cost to prompt-consistency +and diversity of generated images, and (3) models exhibit more region-level +disparities for some objects than others. Perhaps most interestingly, our +indicators suggest that progress in image generation quality has come at the +cost of real-world geographic representation. Our comprehensive evaluation +constitutes a crucial step towards ensuring a positive experience of visual +content creation for everyone. + +
+
+
+
+
+ + ♻ ☆ A Framework For Refining Text Classification and Object Recognition from + Academic Articles + + +
+ With the widespread use of the internet, it has become increasingly crucial +to extract specific information from vast amounts of academic articles +efficiently. Data mining techniques are generally employed to solve this issue. +However, data mining for academic articles is challenging since it requires +automatically extracting specific patterns in complex and unstructured layout +documents. Current data mining methods for academic articles employ +rule-based(RB) or machine learning(ML) approaches. However, using rule-based +methods incurs a high coding cost for complex typesetting articles. On the +other hand, simply using machine learning methods requires annotation work for +complex content types within the paper, which can be costly. Furthermore, only +using machine learning can lead to cases where patterns easily recognized by +rule-based methods are mistakenly extracted. To overcome these issues, from the +perspective of analyzing the standard layout and typesetting used in the +specified publication, we emphasize implementing specific methods for specific +characteristics in academic articles. We have developed a novel Text Block +Refinement Framework (TBRF), a machine learning and rule-based scheme hybrid. +We used the well-known ACL proceeding articles as experimental data for the +validation experiment. The experiment shows that our approach achieved over 95% +classification accuracy and 90% detection accuracy for tables and figures. + +
+
+ comment: This paper has been accepted at 'The International Symposium on + Innovations in Intelligent Systems and Applications 2023 (INISTA 2023)' +
+
+
+
+
+ + ♻ ☆ Source-free Domain Adaptive Human Pose Estimation ICCV 2023 + + +
+ Human Pose Estimation (HPE) is widely used in various fields, including +motion analysis, healthcare, and virtual reality. However, the great expenses +of labeled real-world datasets present a significant challenge for HPE. To +overcome this, one approach is to train HPE models on synthetic datasets and +then perform domain adaptation (DA) on real-world data. Unfortunately, existing +DA methods for HPE neglect data privacy and security by using both source and +target data in the adaptation process. To this end, we propose a new task, +named source-free domain adaptive HPE, which aims to address the challenges of +cross-domain learning of HPE without access to source data during the +adaptation process. We further propose a novel framework that consists of three +models: source model, intermediate model, and target model, which explores the +task from both source-protect and target-relevant perspectives. The +source-protect module preserves source information more effectively while +resisting noise, and the target-relevant module reduces the sparsity of spatial +representations by building a novel spatial probability space, and +pose-specific contrastive learning and information maximization are proposed on +the basis of this space. Comprehensive experiments on several domain adaptive +HPE benchmarks show that the proposed method outperforms existing approaches by +a considerable margin. The codes are available at +https://github.com/davidpengucf/SFDAHPE. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Feature Embedding by Template Matching as a ResNet Block BMVC 2022 + + +
+ Convolution blocks serve as local feature extractors and are the key to +success of the neural networks. To make local semantic feature embedding rather +explicit, we reformulate convolution blocks as feature selection according to +the best matching kernel. In this manner, we show that typical ResNet blocks +indeed perform local feature embedding via template matching once batch +normalization (BN) followed by a rectified linear unit (ReLU) is interpreted as +arg-max optimizer. Following this perspective, we tailor a residual block that +explicitly forces semantically meaningful local feature embedding through using +label information. Specifically, we assign a feature vector to each local +region according to the classes that the corresponding region matches. We +evaluate our method on three popular benchmark datasets with several +architectures for image classification and consistently show that our approach +substantially improves the performance of the baseline architectures. + +
+
+ comment: Accepted at the British Machine Vision Conference 2022 (BMVC 2022) +
+
+
+
+
+ + ♻ ☆ Quantum Image Denoising: A Framework via Boltzmann Machines, QUBO, and + Quantum Annealing + + +
+ We investigate a framework for binary image denoising via restricted
+Boltzmann machines (RBMs) that introduces a denoising objective in quadratic
+unconstrained binary optimization (QUBO) form and is well-suited for quantum
+annealing. The denoising objective is attained by balancing the distribution
+learned by a trained RBM with a penalty term for deviations from the noisy
+image. We derive the statistically optimal choice of the penalty parameter
+assuming the target distribution has been well-approximated, and further
+suggest an empirically supported modification to make the method robust to that
+idealistic assumption. We also show under additional assumptions that the
+denoised images attained by our method are, in expectation, strictly closer to
+the noise-free images than the noisy images are. While we frame the model as an
+image denoising model, it can be applied to any binary data. As the QUBO
+formulation is well-suited for implementation on quantum annealers, we test the
+model on a D-Wave Advantage machine, and also test on data too large for
+current quantum annealers by approximating QUBO solutions through classical
+heuristics.
+
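+
+A hedged sketch of how such a denoising objective can be written in QUBO form
+over the concatenated visible/hidden binary variables of an RBM: the RBM energy
+-a^T v - b^T h - v^T W h is combined with a penalty lam * ||v - y||^2 that
+discourages deviations from the noisy image y. Variable names are illustrative,
+and the paper's statistically optimal penalty choice is not derived here.
+
+```python
+import numpy as np
+from itertools import product
+
+def denoising_qubo(W, a, b, noisy, lam):
+    """W: (nv, nh) RBM weights; a: (nv,) visible biases; b: (nh,) hidden biases;
+    noisy: (nv,) observed binary image; lam: penalty weight on deviations from it."""
+    nv, nh = W.shape
+    Q = np.zeros((nv + nh, nv + nh))
+    # linear terms live on the diagonal because z_i^2 == z_i for binary variables
+    Q[np.arange(nv), np.arange(nv)] = -a + lam * (1.0 - 2.0 * noisy)
+    Q[np.arange(nv, nv + nh), np.arange(nv, nv + nh)] = -b
+    Q[:nv, nv:] = -W                       # bilinear visible-hidden couplings of the RBM energy
+    return Q                               # minimize z^T Q z over z in {0, 1}^(nv + nh)
+
+# brute-force minimization on a tiny instance; an annealer or QUBO heuristic replaces this in practice
+rng = np.random.default_rng(0)
+Q = denoising_qubo(rng.normal(size=(3, 2)), rng.normal(size=3), rng.normal(size=2),
+                   np.array([1.0, 0.0, 1.0]), lam=0.5)
+best_energy, best_z = min((np.array(z) @ Q @ np.array(z), z) for z in product([0.0, 1.0], repeat=5))
+```
+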
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Towards Nonlinear-Motion-Aware and Occlusion-Robust Rolling Shutter + Correction ICCV 2023 + + +
+ This paper addresses the problem of rolling shutter correction in complex +nonlinear and dynamic scenes with extreme occlusion. Existing methods suffer +from two main drawbacks. Firstly, they face challenges in estimating the +accurate correction field due to the uniform velocity assumption, leading to +significant image correction errors under complex motion. Secondly, the drastic +occlusion in dynamic scenes prevents current solutions from achieving better +image quality because of the inherent difficulties in aligning and aggregating +multiple frames. To tackle these challenges, we model the curvilinear +trajectory of pixels analytically and propose a geometry-based Quadratic +Rolling Shutter (QRS) motion solver, which precisely estimates the high-order +correction field of individual pixels. Besides, to reconstruct high-quality +occlusion frames in dynamic scenes, we present a 3D video architecture that +effectively Aligns and Aggregates multi-frame context, namely, RSA2-Net. We +evaluate our method across a broad range of cameras and video sequences, +demonstrating its significant superiority. Specifically, our method surpasses +the state-of-the-art by +4.98, +0.77, and +4.33 of PSNR on Carla-RS, Fastec-RS, +and BS-RSC datasets, respectively. Code is available at +https://github.com/DelinQu/qrsc. + +
+
+ comment: accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Investigating and Improving Latent Density Segmentation Models for + Aleatoric Uncertainty Quantification in Medical Imaging + + +
+ Data uncertainties, such as sensor noise or occlusions, can introduce
+irreducible ambiguities in images, which result in varying, yet plausible,
+semantic hypotheses. In Machine Learning, this ambiguity is commonly referred
+to as aleatoric uncertainty. Latent density models can be utilized to address
+this problem in image segmentation. The most popular approach is the
+Probabilistic U-Net (PU-Net), which uses latent Normal densities to optimize
+the conditional data log-likelihood Evidence Lower Bound. In this work, we
+demonstrate that the PU-Net latent space is severely inhomogeneous. As a
+result, the effectiveness of gradient descent is inhibited and the model
+becomes extremely sensitive to the localization of the latent space samples,
+resulting in defective predictions. To address this, we present the Sinkhorn
+PU-Net (SPU-Net), which uses the Sinkhorn Divergence to promote homogeneity
+across all latent dimensions, effectively improving gradient-descent updates
+and model robustness. Our results show that, when applied to public datasets of
+various clinical segmentation problems, SPU-Net achieves up to 11% performance
+gains compared with preceding latent variable models for probabilistic
+segmentation on the Hungarian-Matched metric. The results indicate that by
+encouraging a homogeneous latent space, one can significantly improve latent
+density modeling for medical image segmentation.
+
+
+ comment: 12 pages incl. references, 11 figures +
+
+
+
+
+ + ♻ ☆ DAC: Detector-Agnostic Spatial Covariances for Deep Local Features + + +
+ Current deep visual local feature detectors do not model the spatial +uncertainty of detected features, producing suboptimal results in downstream +applications. In this work, we propose two post-hoc covariance estimates that +can be plugged into any pretrained deep feature detector: a simple, isotropic +covariance estimate that uses the predicted score at a given pixel location, +and a full covariance estimate via the local structure tensor of the learned +score maps. Both methods are easy to implement and can be applied to any deep +feature detector. We show that these covariances are directly related to errors +in feature matching, leading to improvements in downstream tasks, including +solving the perspective-n-point problem and motion-only bundle adjustment. Code +is available at https://github.com/javrtg/DAC + +
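+
+The two post-hoc estimates described above can be sketched directly from a
+detector score map: an isotropic covariance scaled by the score at the
+keypoint, and a full 2x2 covariance from (the inverse of) the local structure
+tensor. Window size, scaling, and border handling are simplifying assumptions
+for this illustration.
+
+```python
+import numpy as np
+
+def covariances_from_score_map(score: np.ndarray, kp, win: int = 3, eps: float = 1e-6):
+    """score: (H, W) detector score map; kp: (row, col) keypoint assumed at least `win`
+    pixels away from the image border. Returns (isotropic, full) 2x2 covariances."""
+    r, c = kp
+    isotropic = (1.0 / (score[r, c] + eps)) * np.eye(2)       # higher score -> lower uncertainty
+    gy, gx = np.gradient(score)                                # score-map gradients (rows, cols)
+    ys, xs = slice(r - win, r + win + 1), slice(c - win, c + win + 1)
+    sxx = np.sum(gx[ys, xs] ** 2)
+    syy = np.sum(gy[ys, xs] ** 2)
+    sxy = np.sum(gx[ys, xs] * gy[ys, xs])
+    structure_tensor = np.array([[sxx, sxy], [sxy, syy]])
+    full = np.linalg.inv(structure_tensor + eps * np.eye(2))  # structure-tensor-based covariance
+    return isotropic, full
+
+# toy usage on a random score map
+iso_cov, full_cov = covariances_from_score_map(np.random.default_rng(0).random((64, 64)), kp=(20, 30))
+```
+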
+
+
+
+
+ + ♻ ☆ RIGID: Recurrent GAN Inversion and Editing of Real Face Videos ICCV2023 + + +
+ GAN inversion is indispensable for applying the powerful editability of GAN +to real images. However, existing methods invert video frames individually +often leading to undesired inconsistent results over time. In this paper, we +propose a unified recurrent framework, named \textbf{R}ecurrent v\textbf{I}deo +\textbf{G}AN \textbf{I}nversion and e\textbf{D}iting (RIGID), to explicitly and +simultaneously enforce temporally coherent GAN inversion and facial editing of +real videos. Our approach models the temporal relations between current and +previous frames from three aspects. To enable a faithful real video +reconstruction, we first maximize the inversion fidelity and consistency by +learning a temporal compensated latent code. Second, we observe incoherent +noises lie in the high-frequency domain that can be disentangled from the +latent space. Third, to remove the inconsistency after attribute manipulation, +we propose an \textit{in-between frame composition constraint} such that the +arbitrary frame must be a direct composite of its neighboring frames. Our +unified framework learns the inherent coherence between input frames in an +end-to-end manner, and therefore it is agnostic to a specific attribute and can +be applied to arbitrary editing of the same video without re-training. +Extensive experiments demonstrate that RIGID outperforms state-of-the-art +methods qualitatively and quantitatively in both inversion and editing tasks. +The deliverables can be found in \url{https://cnnlstm.github.io/RIGID} + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ♻ ☆ SuS-X: Training-Free Name-Only Transfer of Vision-Language Models ICCV2023 + + +
+ Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet +effective way to train large-scale vision-language models. CLIP demonstrates +impressive zero-shot classification and retrieval on diverse downstream tasks. +However, to leverage its full potential, fine-tuning still appears to be +necessary. Fine-tuning the entire CLIP model can be resource-intensive and +unstable. Moreover, recent methods that aim to circumvent this need for +fine-tuning still require access to images from the target distribution. In +this paper, we pursue a different approach and explore the regime of +training-free "name-only transfer" in which the only knowledge we possess about +the downstream task comprises the names of downstream target categories. We +propose a novel method, SuS-X, consisting of two key building blocks -- SuS and +TIP-X, that requires neither intensive fine-tuning nor costly labelled data. +SuS-X achieves state-of-the-art zero-shot classification results on 19 +benchmark datasets. We further show the utility of TIP-X in the training-free +few-shot setting, where we again achieve state-of-the-art results over strong +training-free baselines. Code is available at +https://github.com/vishaal27/SuS-X. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ Common Limitations of Image Processing Metrics: A Picture Story + + +
+ While the importance of automatic image analysis is continuously increasing,
+recent meta-research revealed major flaws with respect to algorithm validation.
+Performance metrics are particularly key for meaningful, objective, and
+transparent performance assessment and validation of the used automatic
+algorithms, but relatively little attention has been given to the practical
+pitfalls when using specific metrics for a given image analysis task. These are
+typically related to (1) the disregard of inherent metric properties, such as
+the behaviour in the presence of class imbalance or small target structures,
+(2) the disregard of inherent data set properties, such as the non-independence
+of the test cases, and (3) the disregard of the actual biomedical domain
+interest that the metrics should reflect. This dynamically updated living
+document aims to illustrate important limitations of performance metrics
+commonly applied in the field of image analysis. In this context, it focuses on
+biomedical image analysis problems that can be phrased as image-level
+classification, semantic segmentation, instance segmentation, or object
+detection tasks. The current version is based on a Delphi process on metrics
+conducted by an international consortium of image analysis experts from more
+than 60 institutions worldwide.
+
+
+ comment: Shared first authors: Annika Reinke and Minu D. Tizabi. This is a + dynamic paper on limitations of commonly used metrics. It discusses metrics + for image-level classification, semantic and instance segmentation, and + object detection. For missing use cases, comments or questions, please + contact a.reinke@dkfz.de. Substantial contributions to this document will be + acknowledged with a co-authorship +
+
+
+
+
+ + ♻ ☆ DDColor: Towards Photo-Realistic Image Colorization via Dual Decoders ICCV 2023 + + +
+ Image colorization is a challenging problem due to multi-modal uncertainty +and high ill-posedness. Directly training a deep neural network usually leads +to incorrect semantic colors and low color richness. While transformer-based +methods can deliver better results, they often rely on manually designed +priors, suffer from poor generalization ability, and introduce color bleeding +effects. To address these issues, we propose DDColor, an end-to-end method with +dual decoders for image colorization. Our approach includes a pixel decoder and +a query-based color decoder. The former restores the spatial resolution of the +image, while the latter utilizes rich visual features to refine color queries, +thus avoiding hand-crafted priors. Our two decoders work together to establish +correlations between color and multi-scale semantic representations via +cross-attention, significantly alleviating the color bleeding effect. +Additionally, a simple yet effective colorfulness loss is introduced to enhance +the color richness. Extensive experiments demonstrate that DDColor achieves +superior performance to existing state-of-the-art works both quantitatively and +qualitatively. The codes and models are publicly available at +https://github.com/piddnad/DDColor. + +
+
+ comment: ICCV 2023; Code: https://github.com/piddnad/DDColor +
+
+
+
+
+ + ♻ ☆ SALUDA: Surface-based Automotive Lidar Unsupervised Domain Adaptation + + +
+ Learning models on one labeled dataset that generalize well on another domain +is a difficult task, as several shifts might happen between the data domains. +This is notably the case for lidar data, for which models can exhibit large +performance discrepancies due for instance to different lidar patterns or +changes in acquisition conditions. This paper addresses the corresponding +Unsupervised Domain Adaptation (UDA) task for semantic segmentation. To +mitigate this problem, we introduce an unsupervised auxiliary task of learning +an implicit underlying surface representation simultaneously on source and +target data. As both domains share the same latent representation, the model is +forced to accommodate discrepancies between the two sources of data. This novel +strategy differs from classical minimization of statistical divergences or +lidar-specific domain adaptation techniques. Our experiments demonstrate that +our method achieves a better performance than the current state of the art, +both in real-to-real and synthetic-to-real scenarios. + +
+
+ comment: Project repository: github.com/valeoai/SALUDA +
+
+
+
+
+ + ♻ ☆ XMem++: Production-level Video Segmentation From Few Annotated Frames ICCV 2023 + + +
+ Despite advancements in user-guided video segmentation, extracting complex +objects consistently for highly complex scenes is still a labor-intensive task, +especially for production. It is not uncommon that a majority of frames need to +be annotated. We introduce a novel semi-supervised video object segmentation +(SSVOS) model, XMem++, that improves existing memory-based models, with a +permanent memory module. Most existing methods focus on single frame +annotations, while our approach can effectively handle multiple user-selected +frames with varying appearances of the same object or region. Our method can +extract highly consistent results while keeping the required number of frame +annotations low. We further introduce an iterative and attention-based frame +suggestion mechanism, which computes the next best frame for annotation. Our +method is real-time and does not require retraining after each user input. We +also introduce a new dataset, PUMaVOS, which covers new challenging use cases +not found in previous benchmarks. We demonstrate SOTA performance on +challenging (partial and multi-class) segmentation scenarios as well as long +videos, while ensuring significantly fewer frame annotations than any existing +method. Project page: https://max810.github.io/xmem2-project-page/ + +
+
+ comment: Accepted to ICCV 2023. 18 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Quality at the Tail + + +
+ Benchmarking and evaluating deep learning models and systems necessitate a +meticulous approach to ensure comprehensive assessment. In practical +applications, it is paramount to consider both the inference quality and the +inference time, particularly within critical contexts, where stringent +requirements demand the simultaneous satisfaction of both metrics. Neglecting +either aspect can result in severe and irreversible consequences, including +loss of human life and property damage. Unfortunately, many studies lack a +comprehensive consideration of these metrics, often conducted under ideal or +permissive conditions, thereby leading to incomplete or non-intuitive +evaluation methodologies. + This study reveals that deep learning inference quality exhibits +fluctuations, which further introduces complications and challenges to the +benchmarking and evaluation. To better characterize the phenomenon, the concept +of "tail quality" is introduced, which indicates the quality at the tail of +distributions. "Tail quality" can offer a more objective evaluation, overcoming +the limitations of conventional inference quality and inference time metrics in +capturing the quality fluctuation phenomenon. To capture the phenomenon, this +paper also proposes a pioneering evaluation framework for comprehensive +assessment and analysis of various factors affecting inference time and +quality. Leveraging this framework enables the anticipation of the potential +distribution of inference time and inference quality, thus capturing "tail +quality" before practically applying deep learning. The effectiveness of the +evaluation framework is validated through experiments conducted on deep +learning models for three different tasks across four systems. Furthermore, +employing this evaluation framework, the experiments conducted a preliminary +analysis of several factors influencing inference quality and inference time. + +
+
+ comment: 11 pages, 4 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ FUSQA: Fetal Ultrasound Segmentation Quality Assessment + + +
+ Deep learning models have been effective for various fetal ultrasound +segmentation tasks. However, generalization to new unseen data has raised +questions about their effectiveness for clinical adoption. Normally, a +transition to new unseen data requires time-consuming and costly quality +assurance processes to validate the segmentation performance post-transition. +Segmentation quality assessment efforts have focused on natural images, where +the problem has been typically formulated as a dice score regression task. In +this paper, we propose a simplified Fetal Ultrasound Segmentation Quality +Assessment (FUSQA) model to tackle the segmentation quality assessment when no +masks exist to compare with. We formulate the segmentation quality assessment +process as an automated classification task to distinguish between good and +poor-quality segmentation masks for more accurate gestational age estimation. +We validate the performance of our proposed approach on two datasets we collect +from two hospitals using different ultrasound machines. We compare different +architectures, with our best-performing architecture achieving over 90% +classification accuracy on distinguishing between good and poor-quality +segmentation masks from an unseen dataset. Additionally, there was only a +1.45-day difference between the gestational age reported by doctors and +estimated based on CRL measurements using well-segmented masks. On the other +hand, this difference increased and reached up to 7.73 days when we calculated +CRL from the poorly segmented masks. As a result, AI-based approaches can +potentially aid fetal ultrasound segmentation quality assessment and might +detect poor segmentation in real-time screening in the future. + +
+
+ comment: 13 pages, 3 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ A Time-aware tensor decomposition for tracking evolving patterns + + +
+ Time-evolving data sets can often be arranged as a higher-order tensor with +one of the modes being the time mode. While tensor factorizations have been +successfully used to capture the underlying patterns in such higher-order data +sets, the temporal aspect is often ignored, allowing for the reordering of time +points. In recent studies, temporal regularizers are incorporated in the time +mode to tackle this issue. Nevertheless, existing approaches still do not allow +underlying patterns to change in time (e.g., spatial changes in the brain, +contextual changes in topics). In this paper, we propose temporal PARAFAC2 +(tPARAFAC2): a PARAFAC2-based tensor factorization method with temporal +regularization to extract gradually evolving patterns from temporal data. +Through extensive experiments on synthetic data, we demonstrate that tPARAFAC2 +can capture the underlying evolving patterns accurately performing better than +PARAFAC2 and coupled matrix factorization with temporal smoothness +regularization. + +
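The exact tPARAFAC2 objective is in the paper; the sketch below only illustrates the kind of temporal smoothness penalty the abstract describes, i.e. discouraging abrupt changes between consecutive time-slice factors. The function and its arguments are illustrative assumptions, not the authors' implementation.

```python
import numpy as np

def temporal_smoothness_penalty(factors, strength=1.0):
    """strength * sum_t ||B_t - B_{t-1}||_F^2 over a list of factor matrices.

    factors: list of K factor matrices B_1..B_K (one per time slice), all of
    the same shape, as in a PARAFAC2-style model. A term like this would be
    added to the reconstruction loss to favor gradually evolving patterns.
    """
    penalty = 0.0
    for prev, curr in zip(factors[:-1], factors[1:]):
        diff = np.asarray(curr) - np.asarray(prev)
        penalty += np.sum(diff ** 2)
    return strength * penalty

# Example: slowly drifting 10x3 factors over 5 time points.
rng = np.random.default_rng(1)
B = [rng.standard_normal((10, 3))]
for _ in range(4):
    B.append(B[-1] + 0.01 * rng.standard_normal((10, 3)))
print(temporal_smoothness_penalty(B))
```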
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Echo from noise: synthetic ultrasound image generation using diffusion + models for real image segmentation + + +
+ We propose a novel pipeline for the generation of synthetic ultrasound images +via Denoising Diffusion Probabilistic Models (DDPMs) guided by cardiac semantic +label maps. We show that these synthetic images can serve as a viable +substitute for real data in the training of deep-learning models for ultrasound +image analysis tasks such as cardiac segmentation. To demonstrate the +effectiveness of this approach, we generated synthetic 2D echocardiograms and +trained a neural network for segmenting the left ventricle and left atrium. The +performance of the network trained on exclusively synthetic images was +evaluated on an unseen dataset of real images and yielded mean Dice scores of +88.6 $\pm 4.91$ , 91.9 $\pm 4.22$, 85.2 $\pm 4.83$ \% for left ventricular +endocardium, epicardium and left atrial segmentation respectively. This +represents a relative increase of $9.2$, $3.3$ and $13.9$ \% in Dice scores +compared to the previous state-of-the-art. The proposed pipeline has potential +for application to a wide range of other tasks across various medical imaging +modalities. + +
+
+
+
+
+ + ♻ ☆ Multiscale Attention via Wavelet Neural Operators for Vision + Transformers + + +
+ Transformers have achieved widespread success in computer vision. At their +heart is a Self-Attention (SA) mechanism, an inductive bias that associates +each token in the input with every other token through a weighted basis. The +standard SA mechanism has quadratic complexity in the sequence length, which +impedes its utility for the long sequences appearing in high-resolution vision. +Recently, inspired by operator learning for PDEs, Adaptive Fourier Neural +Operators (AFNO) were introduced for high-resolution attention based on global +convolution that is efficiently implemented via FFT. However, AFNO's global +filtering cannot well represent the small- and moderate-scale structures that +commonly appear in natural images. To exploit coarse-to-fine scale structures, +we introduce Multiscale Wavelet Attention (MWA), built on wavelet neural +operators, which incurs linear complexity in the sequence length. We replace +the attention in ViT with MWA, and our experiments with CIFAR and Tiny-ImageNet +classification demonstrate significant improvement over alternative +Fourier-based attentions such as AFNO and Global Filter Network (GFN).
+
+
+
+
+ + ♻ ☆ R2C-GAN: Restore-to-Classify GANs for Blind X-Ray Restoration and + COVID-19 Classification + + +
+ Restoration of poor-quality images with a blended set of artifacts plays a +vital role in reliable diagnosis. Existing studies have focused on specific +restoration problems such as image deblurring, denoising, and exposure +correction, where there is usually a strong assumption on the artifact type and +severity. As a pioneer study in blind X-ray restoration, we propose a joint +model for generic image restoration and classification: Restore-to-Classify +Generative Adversarial Networks (R2C-GANs). Such a jointly optimized model +keeps any disease intact after the restoration. Therefore, this naturally leads +to a higher diagnosis performance thanks to the improved X-ray image quality. +To accomplish this crucial objective, we define the restoration task as an +image-to-image translation problem from the domain of poor-quality (noisy, +blurry, or over-/under-exposed) images to the domain of high-quality images. +The proposed R2C-GAN model is able to learn forward and inverse transforms +between the two domains using unpaired training samples. Simultaneously, the +joint classification preserves the disease label during restoration. Moreover, +the R2C-GANs are equipped with operational layers/neurons, reducing the network +depth and further boosting both restoration and classification performance. +The proposed joint model is extensively evaluated over the QaTa-COV19 dataset +for Coronavirus Disease 2019 (COVID-19) classification. The proposed +restoration approach achieves over 90% F1-Score, which is significantly higher +than the performance of any deep model. Moreover, in the qualitative analysis, +the restoration performance of R2C-GANs is confirmed by a group of medical +doctors. We share the software implementation at +https://github.com/meteahishali/R2C-GAN. + +
+
+
+
+
+ + ♻ ☆ PromptStyler: Prompt-driven Style Generation for Source-free Domain + Generalization ICCV 2023 + + +
+ In a joint vision-language space, a text feature (e.g., from "a photo of a +dog") could effectively represent its relevant image features (e.g., from dog +photos). Also, a recent study has demonstrated the cross-modal transferability +phenomenon of this joint space. From these observations, we propose +PromptStyler which simulates various distribution shifts in the joint space by +synthesizing diverse styles via prompts without using any images to deal with +source-free domain generalization. The proposed method learns to generate a +variety of style features (from "a S* style of a") via learnable style word +vectors for pseudo-words S*. To ensure that learned styles do not distort +content information, we force style-content features (from "a S* style of a +[class]") to be located nearby their corresponding content features (from +"[class]") in the joint vision-language space. After learning style word +vectors, we train a linear classifier using synthesized style-content features. +PromptStyler achieves the state of the art on PACS, VLCS, OfficeHome and +DomainNet, even though it does not require any images for training. + +
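The two constraints the abstract mentions (diverse styles, undistorted content) can be pictured as a pair of simple losses over pre-computed text features. The sketch below is only an illustration of that structure on plain vectors; PromptStyler's actual objective, encoder, and learnable pseudo-word vectors differ and are described in the paper. All names here are assumptions.

```python
import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

def style_diversity_loss(style_feats):
    """Encourage features of different style prompts to be mutually dissimilar."""
    loss, n = 0.0, len(style_feats)
    for i in range(n):
        for j in range(i + 1, n):
            loss += abs(cosine(style_feats[i], style_feats[j]))
    return loss / max(n * (n - 1) / 2, 1)

def content_consistency_loss(style_content_feats, content_feats):
    """Keep each style-content feature ("a S* style of a [class]") close to
    its plain content feature ("[class]") in the joint space."""
    return float(np.mean([1.0 - cosine(sc, c)
                          for sc, c in zip(style_content_feats, content_feats)]))
```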
+
+ comment: Accepted to ICCV 2023, Project Page: https://promptstyler.github.io/ +
+
+
+
+
+ + ♻ ☆ Learn More for Food Recognition via Progressive Self-Distillation AAAI 2023 + + +
+ Food recognition has a wide range of applications, such as health-aware +recommendation and self-service restaurants. Most previous methods of food +recognition firstly locate informative regions in some weakly-supervised +manners and then aggregate their features. However, location errors of +informative regions limit the effectiveness of these methods to some extent. +Instead of locating multiple regions, we propose a Progressive +Self-Distillation (PSD) method, which progressively enhances the ability of +network to mine more details for food recognition. The training of PSD +simultaneously contains multiple self-distillations, in which a teacher network +and a student network share the same embedding network. Since the student +network receives a modified image from its teacher network by masking some +informative regions, the teacher network outputs stronger semantic +representations than the student network. Guided by such teacher network with +stronger semantics, the student network is encouraged to mine more useful +regions from the modified image by enhancing its own ability. The ability of +the teacher network is also enhanced with the shared embedding network. By +using progressive training, the teacher network incrementally improves its +ability to mine more discriminative regions. In inference phase, only the +teacher network is used without the help of the student network. Extensive +experiments on three datasets demonstrate the effectiveness of our proposed +method and state-of-the-art performance. + +
+
+ comment: Accepted by AAAI 2023 +
+
+
+
+
+ + ♻ ☆ ExposureDiffusion: Learning to Expose for Low-light Image Enhancement ICCV2023 + + +
+ Previous raw image-based low-light image enhancement methods predominantly +relied on feed-forward neural networks to learn deterministic mappings from +low-light to normally-exposed images. However, they failed to capture critical +distribution information, leading to visually undesirable results. This work +addresses the issue by seamlessly integrating a diffusion model with a +physics-based exposure model. Different from a vanilla diffusion model that has +to perform Gaussian denoising, with the injected physics-based exposure model, +our restoration process can directly start from a noisy image instead of pure +noise. As such, our method obtains significantly improved performance and +reduced inference time compared with vanilla diffusion models. To make full use +of the advantages of different intermediate steps, we further propose an +adaptive residual layer that effectively screens out side-effects in the +iterative refinement when the intermediate results are already well-exposed. +The proposed framework is compatible with real-paired datasets, real/synthetic +noise models, and different backbone networks. We evaluate the proposed method +on various public benchmarks, achieving promising results with consistent +improvements using different exposure models and backbones. In addition, the +proposed method achieves better generalization capacity for unseen amplifying +ratios and better performance than a larger feedforward neural model when few +parameters are adopted. + +
+
+ comment: accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ Implicit Temporal Modeling with Learnable Alignment for Video + Recognition ICCV 2023 + + +
+ Contrastive language-image pretraining (CLIP) has demonstrated remarkable +success in various image tasks. However, how to extend CLIP with effective +temporal modeling is still an open and crucial problem. Existing factorized or +joint spatial-temporal modeling trades off between efficiency and performance. +While modeling temporal information within a straight-through tube is widely +adopted in the literature, we find that simple frame alignment already captures +the essential information without temporal attention. To this end, in this +paper, we propose a novel Implicit Learnable Alignment (ILA) method, which +minimizes the temporal modeling effort while achieving incredibly high +performance. Specifically, for a frame pair, an interactive point is predicted +in each frame, serving as a region rich in mutual information. By enhancing the +features around the interactive point, two frames are implicitly aligned. The +aligned features are then pooled into a single token, which is leveraged in the +subsequent spatial self-attention. Our method allows eliminating the costly or +insufficient temporal self-attention in video. Extensive experiments on +benchmarks demonstrate the superiority and generality of our module. +Particularly, the proposed ILA achieves a top-1 accuracy of 88.7% on +Kinetics-400 with much fewer FLOPs compared with Swin-L and ViViT-H. Code is +released at https://github.com/Francis-Rings/ILA . + +
+
+ comment: ICCV 2023 oral. 14 pages, 7 figures. Code released at + https://github.com/Francis-Rings/ILA +
+
+
+
+
+ + ♻ ☆ Minimum Latency Deep Online Video Stabilization ICCV 2023 + + +
+ We present a novel camera path optimization framework for the task of online +video stabilization. Typically, a stabilization pipeline consists of three +steps: motion estimating, path smoothing, and novel view rendering. Most +previous methods concentrate on motion estimation, proposing various global or +local motion models. In contrast, path optimization receives relatively less +attention, especially in the important online setting, where no future frames +are available. In this work, we adopt recent off-the-shelf high-quality deep +motion models for motion estimation to recover the camera trajectory and focus +on the latter two steps. Our network takes a short 2D camera path in a sliding +window as input and outputs the stabilizing warp field of the last frame in the +window, which warps the coming frame to its stabilized position. A hybrid loss +is well-defined to constrain the spatial and temporal consistency. In addition, +we build a motion dataset that contains stable and unstable motion pairs for +the training. Extensive experiments demonstrate that our approach significantly +outperforms state-of-the-art online methods both qualitatively and +quantitatively and achieves comparable performance to offline methods. Our code +and dataset are available at https://github.com/liuzhen03/NNDVS + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ μSplit: efficient image decomposition for microscopy data ICCV 2023 + + +
+ We present {\mu}Split, a dedicated approach for trained image decomposition +in the context of fluorescence microscopy images. We find that best results +using regular deep architectures are achieved when large image patches are used +during training, making memory consumption the limiting factor to further +improving performance. We therefore introduce lateral contextualization (LC), a +memory efficient way to train powerful networks and show that LC leads to +consistent and significant improvements on the task at hand. We integrate LC +with U-Nets, Hierarchical AEs, and Hierarchical VAEs, for which we formulate a +modified ELBO loss. Additionally, LC enables training deeper hierarchical +models than otherwise possible and, interestingly, helps to reduce tiling +artefacts that are inherently impossible to avoid when using tiled VAE +predictions. We apply {\mu}Split to five decomposition tasks, one on a +synthetic dataset, four others derived from real microscopy data. LC achieves +SOTA results (average improvements to the best baseline of 2.36 dB PSNR), while +simultaneously requiring considerably less GPU memory. + +
+
+ comment: Published at ICCV 2023. 10 pages, 7 figures, 9 pages supplement, 8 + supplementary figures +
+
+
+
+
+ + ♻ ☆ Catastrophic overfitting can be induced with discriminative non-robust + features + + +
+ Adversarial training (AT) is the de facto method for building robust neural +networks, but it can be computationally expensive. To mitigate this, fast +single-step attacks can be used, but this may lead to catastrophic overfitting +(CO). This phenomenon appears when networks gain non-trivial robustness during +the first stages of AT, but then reach a breaking point where they become +vulnerable in just a few iterations. The mechanisms that lead to this failure +mode are still poorly understood. In this work, we study the onset of CO in +single-step AT methods through controlled modifications of typical datasets of +natural images. In particular, we show that CO can be induced at much smaller +$\epsilon$ values than it was observed before just by injecting images with +seemingly innocuous features. These features aid non-robust classification but +are not enough to achieve robustness on their own. Through extensive +experiments we analyze this novel phenomenon and discover that the presence of +these easy features induces a learning shortcut that leads to CO. Our findings +provide new insights into the mechanisms of CO and improve our understanding of +the dynamics of AT. The code to reproduce our experiments can be found at +https://github.com/gortizji/co_features. + +
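The paper's injection procedure is specific (see the linked repository); the sketch below only illustrates the general idea of adding a tiny, label-correlated pattern that is highly discriminative but not robust. The pattern construction, epsilon, and function name are assumptions for illustration.

```python
import numpy as np

def inject_discriminative_feature(images, labels, num_classes, epsilon=4 / 255, seed=0):
    """Add a small, class-dependent pattern to each image.

    The pattern is perfectly correlated with the label (so it aids
    non-robust classification) but has small magnitude (so it is not robust
    on its own), the flavor of 'seemingly innocuous' feature described in
    the abstract. Illustrative only, not the authors' construction.
    """
    rng = np.random.default_rng(seed)
    h, w, c = images.shape[1:]
    patterns = rng.choice([-1.0, 1.0], size=(num_classes, h, w, c))
    out = images + epsilon * patterns[labels]
    return np.clip(out, 0.0, 1.0)

# Example on random stand-in data (images in [0, 1]).
imgs = np.random.rand(8, 32, 32, 3)
labs = np.random.randint(0, 10, size=8)
poisoned = inject_discriminative_feature(imgs, labs, num_classes=10)
```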
+
+ comment: Published in Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ♻ ☆ GPGait: Generalized Pose-based Gait Recognition ICCV + + +
+ Recent works on pose-based gait recognition have demonstrated the potential +of using such simple information to achieve results comparable to +silhouette-based methods. However, the generalization ability of pose-based +methods on different datasets is undesirably inferior to that of +silhouette-based ones, which has received little attention but hinders the +application of these methods in real-world scenarios. To improve the +generalization ability of pose-based methods across datasets, we propose a +\textbf{G}eneralized \textbf{P}ose-based \textbf{Gait} recognition +(\textbf{GPGait}) framework. First, a Human-Oriented Transformation (HOT) and a +series of Human-Oriented Descriptors (HOD) are proposed to obtain a unified +pose representation with discriminative multi-features. Then, given the slight +variations in the unified representation after HOT and HOD, it becomes crucial +for the network to extract local-global relationships between the keypoints. To +this end, a Part-Aware Graph Convolutional Network (PAGCN) is proposed to +enable efficient graph partition and local-global spatial feature extraction. +Experiments on four public gait recognition datasets, CASIA-B, OUMVLP-Pose, +Gait3D and GREW, show that our model demonstrates better and more stable +cross-domain capabilities compared to existing skeleton-based methods, +achieving comparable recognition results to silhouette-based ones. Code is +available at https://github.com/BNU-IVC/FastPoseGait. + +
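The abstract does not spell out the Human-Oriented Transformation; as a generic stand-in, a keypoint normalization of the kind often used to unify pose representations is sketched below. The joint indices and scaling choice are placeholders, not the paper's actual HOT definition.

```python
import numpy as np

def human_oriented_transform(keypoints, center_idx=0, scale_pair=(5, 11)):
    """Toy normalization of 2D pose keypoints: translate so a reference joint
    sits at the origin and rescale by the distance between two joints
    (e.g., a shoulder and a hip).

    keypoints: (num_joints, 2) array of pixel coordinates.
    """
    kps = np.asarray(keypoints, dtype=float)
    centered = kps - kps[center_idx]
    scale = np.linalg.norm(centered[scale_pair[0]] - centered[scale_pair[1]]) + 1e-8
    return centered / scale
```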
+
+ comment: ICCV Camera Ready +
+
+
+
+
+ + ♻ ☆ SAMFlow: Eliminating Any Fragmentation in Optical Flow with Segment + Anything Model + + +
+ Optical Flow Estimation aims to find the 2D dense motion field between two +frames. Due to the limitation of model structures and training datasets, +existing methods often rely too much on local clues and ignore the integrity of +objects, resulting in fragmented motion estimation. Through theoretical +analysis, we find the pre-trained large vision models are helpful in optical +flow estimation, and we notice that the recently famous Segment Anything Model +(SAM) demonstrates a strong ability to segment complete objects, which is +suitable for solving the fragmentation problem. We thus propose a solution to +embed the frozen SAM image encoder into FlowFormer to enhance object +perception. To address the challenge of in-depth utilizing SAM in +non-segmentation tasks like optical flow estimation, we propose an Optical Flow +Task-Specific Adaption scheme, including a Context Fusion Module to fuse the +SAM encoder with the optical flow context encoder, and a Context Adaption +Module to adapt the SAM features for optical flow task with Learned +Task-Specific Embedding. Our proposed SAMFlow model reaches 0.86/2.10 +clean/final EPE and 3.55/12.32 EPE/F1-all on Sintel and KITTI-15 training set, +surpassing Flowformer by 8.5%/9.9% and 13.2%/16.3%. Furthermore, our model +achieves state-of-the-art performance on the Sintel and KITTI-15 benchmarks, +ranking #1 among all two-frame methods on Sintel clean pass. + +
+
+
+
+
+ + ♻ ☆ Learning Distinct and Representative Styles for Image Captioning NeurIPS 2022 + + +
+ Over the years, state-of-the-art (SoTA) image captioning methods have +achieved promising results on some evaluation metrics (e.g., CIDEr). However, +recent findings show that the captions generated by these methods tend to be +biased toward the "average" caption that only captures the most general mode +(a.k.a, language pattern) in the training corpus, i.e., the so-called mode +collapse problem. Affected by it, the generated captions are limited in +diversity and usually less informative than natural image descriptions made by +humans. In this paper, we seek to avoid this problem by proposing a Discrete +Mode Learning (DML) paradigm for image captioning. Our innovative idea is to +explore the rich modes in the training caption corpus to learn a set of "mode +embeddings", and further use them to control the mode of the generated captions +for existing image captioning models. Specifically, the proposed DML optimizes +a dual architecture that consists of an image-conditioned discrete variational +autoencoder (CdVAE) branch and a mode-conditioned image captioning (MIC) +branch. The CdVAE branch maps each image caption to one of the mode embeddings +stored in a learned codebook, and is trained with a pure non-autoregressive +generation objective to make the modes distinct and representative. The MIC +branch can be simply modified from an existing image captioning model, where +the mode embedding is added to the original word embeddings as the control +signal. In the experiments, we apply the proposed DML to two widely used image +captioning models, Transformer and AoANet. The results show that the learned +mode embedding successfully facilitates these models to generate high-quality +image captions with different modes, further leading to better performance for +both diversity and quality on the MSCOCO dataset. + +
+
+ comment: NeurIPS 2022 +
+
+
+
+
+ + ♻ ☆ LayoutDiffusion: Improving Graphic Layout Generation by Discrete + Diffusion Probabilistic Models ICCV2023 + + +
+ Creating graphic layouts is a fundamental step in graphic designs. In this +work, we present a novel generative model named LayoutDiffusion for automatic +layout generation. As layout is typically represented as a sequence of discrete +tokens, LayoutDiffusion models layout generation as a discrete denoising +diffusion process. It learns to reverse a mild forward process, in which +layouts become increasingly chaotic with the growth of forward steps and +layouts in the neighboring steps do not differ too much. Designing such a mild +forward process is however very challenging as layout has both categorical +attributes and ordinal attributes. To tackle the challenge, we summarize three +critical factors for achieving a mild forward process for the layout, i.e., +legality, coordinate proximity and type disruption. Based on the factors, we +propose a block-wise transition matrix coupled with a piece-wise linear noise +schedule. Experiments on RICO and PubLayNet datasets show that LayoutDiffusion +outperforms state-of-the-art approaches significantly. Moreover, it enables two +conditional layout generation tasks in a plug-and-play manner without +re-training and achieves better performance than existing methods. + +
+
+ comment: Accepted by ICCV2023, project page: https://layoutdiffusion.github.io +
+
+
+
+
+ + ♻ ☆ Cross-Ray Neural Radiance Fields for Novel-view Synthesis from + Unconstrained Image Collections ICCV 2023 + + +
+ Neural Radiance Fields (NeRF) is a revolutionary approach for rendering +scenes by sampling a single ray per pixel and it has demonstrated impressive +capabilities in novel-view synthesis from static scene images. However, in +practice, we usually need to recover NeRF from unconstrained image collections, +which poses two challenges: 1) the images often have dynamic changes in +appearance because of different capturing time and camera settings; 2) the +images may contain transient objects such as humans and cars, leading to +occlusion and ghosting artifacts. Conventional approaches seek to address these +challenges by locally utilizing a single ray to synthesize a color of a pixel. +In contrast, humans typically perceive appearance and objects by globally +utilizing information across multiple pixels. To mimic the perception process +of humans, in this paper, we propose Cross-Ray NeRF (CR-NeRF) that leverages +interactive information across multiple rays to synthesize occlusion-free novel +views with the same appearances as the images. Specifically, to model varying +appearances, we first propose to represent multiple rays with a novel cross-ray +feature and then recover the appearance by fusing global statistics, i.e., +feature covariance of the rays and the image appearance. Moreover, to avoid +occlusion introduced by transient objects, we propose a transient objects +handler and introduce a grid sampling strategy for masking out the transient +objects. We theoretically find that leveraging correlation across multiple rays +promotes capturing more global information. Moreover, extensive experimental +results on large real-world datasets verify the effectiveness of CR-NeRF. + +
+
+ comment: ICCV 2023 Oral +
+
+
+
+
+ + ♻ ☆ Spatial Self-Distillation for Object Detection with Inaccurate Bounding + Boxes ICCV 2023 + + +
+ Object detection via inaccurate bounding box supervision has attracted broad +interest due to the expense of high-quality annotation data or the occasional +inevitability of low annotation quality (e.g., tiny objects). Previous works +usually utilize multiple instance learning (MIL), which highly depends on +category information, to select and refine a low-quality box. Those methods +suffer from object drift, group prediction and part domination problems without +exploring spatial information. In this paper, we heuristically propose a +\textbf{Spatial Self-Distillation based Object Detector (SSD-Det)} to mine +spatial information to refine the inaccurate box in a self-distillation +fashion. SSD-Det utilizes a Spatial Position Self-Distillation \textbf{(SPSD)} +module to exploit spatial information and an interactive structure to combine +spatial information and category information, thus constructing a high-quality +proposal bag. To further improve the selection procedure, a Spatial Identity +Self-Distillation \textbf{(SISD)} module is introduced in SSD-Det to obtain +spatial confidence to help select the best proposals. Experiments on MS-COCO +and VOC datasets with noisy box annotation verify our method's effectiveness +and achieve state-of-the-art performance. The code is available at +https://github.com/ucas-vg/PointTinyBenchmark/tree/SSD-Det. + +
+
+ comment: accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Supervised Homography Learning with Realistic Dataset Generation ICCV 2023 + + +
+ In this paper, we propose an iterative framework, which consists of two +phases: a generation phase and a training phase, to generate realistic training +data and yield a supervised homography network. In the generation phase, given +an unlabeled image pair, we utilize the pre-estimated dominant plane masks and +homography of the pair, along with another sampled homography that serves as +ground truth to generate a new labeled training pair with realistic motion. In +the training phase, the generated data is used to train the supervised +homography network, in which the training data is refined via a content +consistency module and a quality assessment module. Once an iteration is +finished, the trained network is used in the next data generation phase to +update the pre-estimated homography. Through such an iterative strategy, the +quality of the dataset and the performance of the network can be gradually and +simultaneously improved. Experimental results show that our method achieves +state-of-the-art performance and existing supervised methods can be also +improved based on the generated dataset. Code and dataset are available at +https://github.com/JianghaiSCU/RealSH. + +
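The core of the generation phase, stripped of the plane masks and realistic-motion modeling, is that warping an image with a known homography yields a training pair with an exact label. A minimal sketch of that idea is below; the corner-perturbation sampler and function names are assumptions, not the released RealSH code.

```python
import cv2
import numpy as np

def sample_homography(h, w, max_shift=0.2, seed=None):
    """Sample a random homography by jittering the four image corners."""
    rng = np.random.default_rng(seed)
    src = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
    jitter = rng.uniform(-max_shift, max_shift, size=(4, 2)) * [w, h]
    dst = (src + jitter).astype(np.float32)
    return cv2.getPerspectiveTransform(src, dst)

def make_labeled_pair(image, seed=None):
    """Warp an image with a known homography so (image, warped, H) forms a
    supervised training sample with exact ground truth."""
    h, w = image.shape[:2]
    H = sample_homography(h, w, seed=seed)
    warped = cv2.warpPerspective(image, H, (w, h))
    return image, warped, H
```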
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ HaMuCo: Hand Pose Estimation via Multiview Collaborative Self-Supervised + Learning ICCV 2023 + + +
+ Recent advancements in 3D hand pose estimation have shown promising results, +but its effectiveness has primarily relied on the availability of large-scale +annotated datasets, the creation of which is a laborious and costly process. To +alleviate the label-hungry limitation, we propose a self-supervised learning +framework, HaMuCo, that learns a single-view hand pose estimator from +multi-view pseudo 2D labels. However, one of the main challenges of +self-supervised learning is the presence of noisy labels and the ``groupthink'' +effect from multiple views. To overcome these issues, we introduce a cross-view +interaction network that distills the single-view estimator by utilizing the +cross-view correlated features and enforcing multi-view consistency to achieve +collaborative learning. Both the single-view estimator and the cross-view +interaction network are trained jointly in an end-to-end manner. Extensive +experiments show that our method can achieve state-of-the-art performance on +multi-view self-supervised hand pose estimation. Furthermore, the proposed +cross-view interaction network can also be applied to hand pose estimation from +multi-view input and outperforms previous methods under the same settings. + +
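One simple way to picture the multi-view consistency the abstract enforces is to penalize how far each view's 3D prediction strays from the cross-view consensus, assuming the per-view predictions are already expressed in a shared coordinate frame. This is an illustrative stand-in, not HaMuCo's exact formulation.

```python
import numpy as np

def multiview_consistency_loss(view_preds):
    """Mean distance of each view's 3D hand joint prediction from the
    cross-view mean.

    view_preds: array of shape (num_views, num_joints, 3), assumed to be in
    a common (e.g., world) coordinate frame.
    """
    preds = np.asarray(view_preds, dtype=float)
    consensus = preds.mean(axis=0, keepdims=True)      # (1, J, 3)
    return float(np.linalg.norm(preds - consensus, axis=-1).mean())
```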
+
+ comment: Accepted to ICCV 2023. Won first place in the HANDS22 Challenge Task + 2. Project page: https://zxz267.github.io/HaMuCo +
+
+
+
+
+ + ♻ ☆ MiAMix: Enhancing Image Classification through a Multi-stage Augmented + Mixed Sample Data Augmentation Method + + +
+ Despite substantial progress in the field of deep learning, overfitting +persists as a critical challenge, and data augmentation has emerged as a +particularly promising approach due to its capacity to enhance model +generalization in various computer vision tasks. While various strategies have +been proposed, Mixed Sample Data Augmentation (MSDA) has shown great potential +for enhancing model performance and generalization. We introduce a novel mixup +method called MiAMix, which stands for Multi-stage Augmented Mixup. MiAMix +integrates image augmentation into the mixup framework, utilizes multiple +diversified mixing methods concurrently, and improves the mixing method by +randomly selecting mixing mask augmentation methods. While recent methods rely +on saliency information, MiAMix is also designed for computational efficiency, +reducing additional overhead and offering easy integration into existing +training pipelines. We comprehensively evaluate MiAMix on four image +benchmarks, pitting it against current state-of-the-art mixed sample data +augmentation techniques, and demonstrate that MiAMix improves performance +without heavy computational overhead. + +
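The key ingredient, randomly choosing among several mixing methods with a sampled mixing ratio, can be sketched in a few lines. The two methods below (global blend and rectangular mask) and all parameter choices are illustrative assumptions; the actual MiAMix pipeline has more stages and mask augmentations.

```python
import numpy as np

def random_box_mask(h, w, lam, rng):
    """CutMix-style rectangular mask covering roughly (1 - lam) of the image."""
    cut = np.sqrt(1.0 - lam)
    ch, cw = int(h * cut), int(w * cut)
    cy, cx = rng.integers(0, h), rng.integers(0, w)
    mask = np.ones((h, w, 1), dtype=float)
    mask[max(cy - ch // 2, 0):cy + ch // 2, max(cx - cw // 2, 0):cx + cw // 2] = 0.0
    return mask

def mixed_sample(img_a, img_b, label_a, label_b, alpha=1.0, seed=None):
    """Draw lam ~ Beta(alpha, alpha) and randomly pick either a global
    (mixup-style) or a masked (cutmix-style) blend; labels are mixed with
    the effective mixing ratio."""
    rng = np.random.default_rng(seed)
    h, w = img_a.shape[:2]
    lam = rng.beta(alpha, alpha)
    if rng.random() < 0.5:
        mixed = lam * img_a + (1 - lam) * img_b          # global blend
    else:
        mask = random_box_mask(h, w, lam, rng)           # masked blend
        mixed = mask * img_a + (1 - mask) * img_b
        lam = float(mask.mean())
    label = lam * label_a + (1 - lam) * label_b          # one-hot labels assumed
    return mixed, label
```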
+
+
+
+
+ + ♻ ☆ Multi-Scale Hybrid Vision Transformer for Learning Gastric Histology: + AI-Based Decision Support System for Gastric Cancer Treatment + + +
+ Gastric endoscopic screening is an effective way to decide appropriate +gastric cancer (GC) treatment at an early stage, reducing GC-associated +mortality rate. Although artificial intelligence (AI) has brought a great +promise to assist pathologist to screen digitalized whole slide images, +existing AI systems are limited in fine-grained cancer subclassifications and +have little usability in planning cancer treatment. We propose a practical AI +system that enables five subclassifications of GC pathology, which can be +directly matched to general GC treatment guidance. The AI system is designed to +efficiently differentiate multi-classes of GC through multi-scale +self-attention mechanism using 2-stage hybrid Vision Transformer (ViT) +networks, by mimicking the way how human pathologists understand histology. The +AI system demonstrates reliable diagnostic performance by achieving +class-average sensitivity of above 0.85 on a total of 1,212 slides from +multicentric cohort. Furthermore, AI-assisted pathologists show significantly +improved diagnostic sensitivity by 12% in addition to 18% reduced screening +time compared to human pathologists. Our results demonstrate that AI-assisted +gastric endoscopic screening has a great potential for providing presumptive +pathologic opinion and appropriate cancer treatment of gastric cancer in +practical clinical settings. + +
+
+
+
+
+ + ♻ ☆ DiffSketcher: Text Guided Vector Sketch Synthesis through Latent + Diffusion Models + + +
+ Even though trained mainly on images, we discover that pretrained diffusion +models show impressive power in guiding sketch synthesis. In this paper, we +present DiffSketcher, an innovative algorithm that creates vectorized free-hand +sketches using natural language input. DiffSketcher is developed based on a +pre-trained text-to-image diffusion model. It performs the task by directly +optimizing a set of Bezier curves with an extended version of the score +distillation sampling (SDS) loss, which allows us to use a raster-level +diffusion model as a prior for optimizing a parametric vectorized sketch +generator. Furthermore, we explore attention maps embedded in the diffusion +model for effective stroke initialization to speed up the generation process. +The generated sketches demonstrate multiple levels of abstraction while +maintaining recognizability, underlying structure, and essential visual details +of the subject drawn. Our experiments show that DiffSketcher achieves greater +quality than prior work. + +
+
+ comment: 14 pages, 8 figures. update: improved experiment analysis, fixed + typos, and fixed image errors +
+
+
+
+
+ + ♻ ☆ I2P-Rec: Recognizing Images on Large-scale Point Cloud Maps through + Bird's Eye View Projections IROS 2023 + + +
+ Place recognition is an important technique for autonomous cars to achieve +full autonomy since it can provide an initial guess to online localization +algorithms. Although current methods based on images or point clouds have +achieved satisfactory performance, localizing the images on a large-scale point +cloud map remains a fairly unexplored problem. This cross-modal matching task +is challenging due to the difficulty in extracting consistent descriptors from +images and point clouds. In this paper, we propose the I2P-Rec method to solve +the problem by transforming the cross-modal data into the same modality. +Specifically, we leverage the recent success of depth estimation networks to +recover point clouds from images. We then project the point clouds into Bird's +Eye View (BEV) images. Using the BEV image as an intermediate representation, +we extract global features with a Convolutional Neural Network followed by a +NetVLAD layer to perform matching. The experimental results evaluated on the +KITTI dataset show that, with only a small set of training data, I2P-Rec +achieves Top-1\% recall rates of over 80\% and 90\% when localizing monocular +and stereo images on point cloud maps, respectively. We further evaluate +I2P-Rec on a 1 km trajectory dataset collected by an autonomous logistics car +and show that I2P-Rec can generalize well to previously unseen environments. + +
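The BEV intermediate representation the abstract relies on is essentially a top-down rasterization of a point cloud. A minimal occupancy/density projection is sketched below; the grid ranges, resolution, and normalization are illustrative choices, not the paper's configuration.

```python
import numpy as np

def points_to_bev(points, x_range=(0.0, 50.0), y_range=(-25.0, 25.0), resolution=0.25):
    """Project a point cloud (N, 3) into a bird's-eye-view density image."""
    pts = np.asarray(points)
    keep = ((pts[:, 0] >= x_range[0]) & (pts[:, 0] < x_range[1]) &
            (pts[:, 1] >= y_range[0]) & (pts[:, 1] < y_range[1]))
    pts = pts[keep]
    h = int((x_range[1] - x_range[0]) / resolution)
    w = int((y_range[1] - y_range[0]) / resolution)
    xi = ((pts[:, 0] - x_range[0]) / resolution).astype(int)
    yi = ((pts[:, 1] - y_range[0]) / resolution).astype(int)
    bev = np.zeros((h, w), dtype=np.float32)
    np.add.at(bev, (xi, yi), 1.0)                 # point count per grid cell
    return np.minimum(bev / 8.0, 1.0)             # crude density normalization
```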
+
+ comment: Accepted by IROS 2023 +
+
+
+
+
+ + ♻ ☆ CTP-Net: Character Texture Perception Network for Document Image Forgery + Localization + + +
+ Due to the progression of information technology in recent years, document +images have been widely disseminated on social networks. With the help of +powerful image editing tools, document images are easily forged without leaving +visible manipulation traces, which leads to severe issues if significant +information is falsified for malicious use. Therefore, the research of document +image forensics is worth further exploring. In this paper, we propose a +Character Texture Perception Network (CTP-Net) to localize the forged regions +in document images. Specifically, considering the characters with semantics in +a document image are highly vulnerable, capturing the forgery traces is the key +to localize the forged regions. We design a Character Texture Stream (CTS) +based on optical character recognition to capture features of text areas that +are essential components of a document image. Meanwhile, texture features of +the whole document image are exploited by an Image Texture Stream (ITS). +Combining the features extracted from the CTS and the ITS, the CTP-Net can +reveal more subtle forgery traces from document images. Moreover, to overcome +the challenge caused by the lack of fake document images, we design a data +generation strategy that is utilized to construct a Fake Chinese Trademark +dataset (FCTM). Experimental results on different datasets demonstrate that the +proposed CTP-Net is able to localize multi-scale forged areas in document +images, and outperform the state-of-the-art forgery localization methods, even +though post-processing operations are applied. + +
+
+
+
+
+ + ♻ ☆ BEVPlace: Learning LiDAR-based Place Recognition using Bird's Eye View + Images ICCV 2023 + + +
+ Place recognition is a key module for long-term SLAM systems. Current +LiDAR-based place recognition methods usually use representations of point +clouds such as unordered points or range images. These methods achieve high +recall rates of retrieval, but their performance may degrade in the case of +view variation or scene changes. In this work, we explore the potential of a +different representation in place recognition, i.e. bird's eye view (BEV) +images. We observe that the structural contents of BEV images are less +influenced by rotations and translations of point clouds. We validate that, +without any delicate design, a simple VGGNet trained on BEV images achieves +comparable performance with the state-of-the-art place recognition methods in +scenes of slight viewpoint changes. For more robust place recognition, we +design a rotation-invariant network called BEVPlace. We use group convolution +to extract rotation-equivariant local features from the images and NetVLAD for +global feature aggregation. In addition, we observe that the distance between +BEV features is correlated with the geometry distance of point clouds. Based on +the observation, we develop a method to estimate the position of the query +cloud, extending the usage of place recognition. The experiments conducted on +large-scale public datasets show that our method 1) achieves state-of-the-art +performance in terms of recall rates, 2) is robust to view changes, 3) shows +strong generalization ability, and 4) can estimate the positions of query point +clouds. Source codes are publicly available at +https://github.com/zjuluolun/BEVPlace. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ MMF-Track: Multi-modal Multi-level Fusion for 3D Single Object Tracking + + +
+ 3D single object tracking plays a crucial role in computer vision. Mainstream +methods mainly rely on point clouds to achieve geometry matching between target +template and search area. However, textureless and incomplete point clouds make +it difficult for single-modal trackers to distinguish objects with similar +structures. To overcome the limitations of geometry matching, we propose a +Multi-modal Multi-level Fusion Tracker (MMF-Track), which exploits the image +texture and geometry characteristic of point clouds to track 3D target. +Specifically, we first propose a Space Alignment Module (SAM) to align RGB +images with point clouds in 3D space, which is the prerequisite for +constructing inter-modal associations. Then, in feature interaction level, we +design a Feature Interaction Module (FIM) based on dual-stream structure, which +enhances intra-modal features in parallel and constructs inter-modal semantic +associations. Meanwhile, in order to refine each modal feature, we introduce a +Coarse-to-Fine Interaction Module (CFIM) to realize the hierarchical feature +interaction at different scales. Finally, in similarity fusion level, we +propose a Similarity Fusion Module (SFM) to aggregate geometry and texture +clues from the target. Experiments show that our method achieves +state-of-the-art performance on KITTI (39% Success and 42% Precision gains +against previous multi-modal method) and is also competitive on NuScenes. + +
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Improved Visual Fine-tuning with Natural Language Supervision ICCV'23 + + +
+ Fine-tuning a visual pre-trained model can leverage the semantic information +from large-scale pre-training data and mitigate the over-fitting problem on +downstream vision tasks with limited training examples. While the problem of +catastrophic forgetting in pre-trained backbone has been extensively studied +for fine-tuning, its potential bias from the corresponding pre-training task +and data, attracts less attention. In this work, we investigate this problem by +demonstrating that the obtained classifier after fine-tuning will be close to +that induced by the pre-trained model. To reduce the bias in the classifier +effectively, we introduce a reference distribution obtained from a fixed text +classifier, which can help regularize the learned vision classifier. The +proposed method, Text Supervised fine-tuning (TeS), is evaluated with diverse +pre-trained vision models including ResNet and ViT, and text encoders including +BERT and CLIP, on 11 downstream tasks. The consistent improvement with a clear +margin over distinct scenarios confirms the effectiveness of our proposal. Code +is available at \url{https://github.com/idstcv/TeS}. + +
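The flavor of regularization described, pulling the fine-tuned vision classifier toward a reference distribution produced by a fixed text classifier, can be written as a simple divergence between the two predictive distributions. The sketch below is an assumption about the general shape of such a term, not the exact TeS loss.

```python
import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def text_reference_regularizer(vision_logits, text_logits):
    """Batch-averaged KL(text || vision): encourage the fine-tuned vision
    classifier's predictions to stay close to the reference distribution
    induced by a fixed text classifier."""
    p_text = softmax(np.asarray(text_logits))
    p_vis = softmax(np.asarray(vision_logits))
    kl = np.sum(p_text * (np.log(p_text + 1e-12) - np.log(p_vis + 1e-12)), axis=-1)
    return float(kl.mean())
```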
+
+ comment: accepted by ICCV'23 +
+
+
+
+
+ + ♻ ☆ FOLT: Fast Multiple Object Tracking from UAV-captured Videos Based on + Optical Flow + + +
+ Multiple object tracking (MOT) has been successfully investigated in computer +vision. However, MOT for videos captured by unmanned aerial vehicles (UAVs) is +still challenging due to small object size, blurred object appearance, and very +large and/or irregular motion in both the ground objects and the UAV platform. +In this paper, we propose FOLT to mitigate these problems and reach fast and +accurate MOT in the UAV view. Aiming at a speed-accuracy trade-off, FOLT adopts +a modern detector and a light-weight optical flow extractor to extract object +detection features and motion features at minimal cost. Given the extracted +flow, a flow-guided feature augmentation is designed to augment the object +detection features based on their optical flow, which improves the detection of +small objects. A flow-guided motion prediction is then proposed to predict each +object's position in the next frame, which improves the tracking performance on +objects with very large displacements between adjacent frames. Finally, the +tracker matches the detected objects and predicted objects using a spatial +matching scheme to generate tracks for every object. Experiments on the +Visdrone and UAVDT datasets show that our proposed model can successfully track +small objects with large and irregular motion and outperforms existing +state-of-the-art methods in UAV-MOT tasks. + +
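FOLT learns its motion prediction with a network; a deliberately simple, non-learned stand-in for the flow-guided idea is to shift a box by the mean optical flow inside it, as sketched below under that assumption.

```python
import numpy as np

def flow_guided_box_prediction(box, flow):
    """Shift a bounding box by the mean optical flow inside it.

    box:  (x1, y1, x2, y2) in pixel coordinates.
    flow: (H, W, 2) dense flow from frame t to t+1 (dx, dy per pixel).
    """
    x1, y1, x2, y2 = [int(round(v)) for v in box]
    region = flow[y1:y2, x1:x2]
    if region.size == 0:
        return box
    dx, dy = region.reshape(-1, 2).mean(axis=0)
    return (x1 + dx, y1 + dy, x2 + dx, y2 + dy)
```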
+
+ comment: Accepted by ACM Multi-Media 2023 +
+
+
+
+
+ + ♻ ☆ Improved YOLOv8 Detection Algorithm in Security Inspection Image + + +
+ Security inspection is the first line of defense to ensure the safety of +people's lives and property, and intelligent security inspection is an +inevitable trend in the future development of the security inspection industry. +Aiming at the problems of overlapping detection objects, false detection of +contraband, and missed detection in the process of X-ray image detection, an +improved X-ray contraband detection algorithm CSS-YOLO based on YOLOv8s is +proposed. + +
+
+ comment: 23 pages,23 figures +
+
+
+
+
+ + ♻ ☆ Anomaly Detection in Automated Fibre Placement: Learning with Data + Limitations + + +
+ Conventional defect detection systems in Automated Fibre Placement (AFP) +typically rely on end-to-end supervised learning, necessitating a substantial +number of labelled defective samples for effective training. However, the +scarcity of such labelled data poses a challenge. To overcome this limitation, +we present a comprehensive framework for defect detection and localization in +Automated Fibre Placement. Our approach combines unsupervised deep learning and +classical computer vision algorithms, eliminating the need for labelled data or +manufacturing defect samples. It efficiently detects various surface issues +while requiring fewer images of composite parts for training. Our framework +employs an innovative sample extraction method leveraging AFP's inherent +symmetry to expand the dataset. By inputting a depth map of the fibre layup +surface, we extract local samples aligned with each composite strip (tow). +These samples are processed through an autoencoder, trained on normal samples +for precise reconstructions, highlighting anomalies through reconstruction +errors. Aggregated values form an anomaly map for insightful visualization. The +framework employs blob detection on this map to locate manufacturing defects. +The experimental findings reveal that despite training the autoencoder with a +limited number of images, our proposed method exhibits satisfactory detection +accuracy and accurately identifies defect locations. Our framework demonstrates +comparable performance to existing methods, while also offering the advantage +of detecting all types of anomalies without relying on an extensive labelled +dataset of defects. + +
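The post-processing the abstract describes, aggregating per-sample reconstruction errors into an anomaly map and locating defects by blob detection, can be sketched with a reconstruction-error grid and connected-component labeling. The threshold, aggregation, and patch layout below are illustrative assumptions.

```python
import numpy as np
from scipy import ndimage

def anomaly_map_and_blobs(samples, reconstructions, patch_shape, threshold=0.05):
    """Build an anomaly map from autoencoder reconstruction errors and
    locate defect blobs.

    samples, reconstructions: (N, h, w) local patches and their
    reconstructions; patch_shape: (rows, cols) layout of the patches on the
    layup surface, so N == rows * cols.
    """
    errors = np.mean((np.asarray(samples) - np.asarray(reconstructions)) ** 2,
                     axis=(1, 2))                      # one error per patch
    anomaly_map = errors.reshape(patch_shape)
    binary = anomaly_map > threshold
    labeled, num_blobs = ndimage.label(binary)         # connected components
    blobs = ndimage.find_objects(labeled)              # bounding slices per defect
    return anomaly_map, blobs
```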
+
+
+
+
+ + ♻ ☆ InfiniCity: Infinite-Scale City Synthesis + + +
+ Toward infinite-scale 3D city synthesis, we propose a novel framework, +InfiniCity, which constructs and renders an unconstrainedly large and +3D-grounded environment from random noises. InfiniCity decomposes the seemingly +impractical task into three feasible modules, taking advantage of both 2D and +3D data. First, an infinite-pixel image synthesis module generates +arbitrary-scale 2D maps from the bird's-eye view. Next, an octree-based voxel +completion module lifts the generated 2D map to 3D octrees. Finally, a +voxel-based neural rendering module texturizes the voxels and renders 2D +images. InfiniCity can thus synthesize arbitrary-scale and traversable 3D city +environments, and allow flexible and interactive editing from users. We +quantitatively and qualitatively demonstrate the efficacy of the proposed +framework. Project page: https://hubert0527.github.io/infinicity/ + +
+
+
+
+
+ + ♻ ☆ Event and Entity Extraction from Generated Video Captions + + +
+ Annotation of multimedia data by humans is time-consuming and costly, while +reliable automatic generation of semantic metadata is a major challenge. We +propose a framework to extract semantic metadata from automatically generated +video captions. As metadata, we consider entities, the entities' properties, +relations between entities, and the video category. We employ two +state-of-the-art dense video captioning models with masked transformer (MT) and +parallel decoding (PVDC) to generate captions for videos of the ActivityNet +Captions dataset. Our experiments show that it is possible to extract entities, +their properties, relations between entities, and the video category from the +generated captions. We observe that the quality of the extracted information is +mainly influenced by the quality of the event localization in the video as well +as the performance of the event caption generation. + +
+
+ comment: Paper accepted at CD-MAKE 2023 +
+
+
+
+
+ + ♻ ☆ ERM++: An Improved Baseline for Domain Generalization + + +
+ Multi-source Domain Generalization (DG) measures a classifier's ability to +generalize to new distributions of data it was not trained on, given several +training domains. While several multi-source DG methods have been proposed, +they incur additional complexity during training by using domain labels. Recent +work has shown that a well-tuned Empirical Risk Minimization (ERM) training +procedure, that is simply minimizing the empirical risk on the source domains, +can outperform most existing DG methods. We identify several key candidate +techniques to further improve ERM performance, such as better utilization of +training data, model parameter selection, and weight-space regularization. We +call the resulting method ERM++, and show it significantly improves the +performance of DG on five multi-source datasets by over 5% compared to standard +ERM, and beats state-of-the-art despite being less computationally expensive. +Additionally, we demonstrate the efficacy of ERM++ on the WILDS-FMOW dataset, a +challenging DG benchmark. We hope that ERM++ becomes a strong baseline for +future DG research. Code is released at +https://github.com/piotr-teterwak/erm_plusplus. + +
+
+ comment: An improved baseline for Domain Generalization +
+
+
+
+
+ + ♻ ☆ Accurate Eye Tracking from Dense 3D Surface Reconstructions using + Single-Shot Deflectometry + + +
+ Eye-tracking plays a crucial role in the development of virtual reality +devices, neuroscience research, and psychology. Despite its significance in +numerous applications, achieving an accurate, robust, and fast eye-tracking +solution remains a considerable challenge for current state-of-the-art methods. +While existing reflection-based techniques (e.g., "glint tracking") are +considered the most accurate, their performance is limited by their reliance on +sparse 3D surface data acquired solely from the cornea surface. In this paper, +we rethink the way how specular reflections can be used for eye tracking: We +propose a novel method for accurate and fast evaluation of the gaze direction +that exploits teachings from single-shot phase-measuring-deflectometry (PMD). +In contrast to state-of-the-art reflection-based methods, our method acquires +dense 3D surface information of both cornea and sclera within only one single +camera frame (single-shot). Improvements in acquired reflection surface +points("glints") of factors $>3300 \times$ are easily achievable. We show the +feasibility of our approach with experimentally evaluated gaze errors of only +$\leq 0.25^\circ$ demonstrating a significant improvement over the current +state-of-the-art. + +
+
+
+
+
+ + ♻ ☆ Pseudo Supervised Metrics: Evaluating Unsupervised Image to Image + Translation Models In Unsupervised Cross-Domain Classification Frameworks + + +
+ The ability to classify images accurately and efficiently is dependent on +having access to large labeled datasets and testing on data from the same +domain that the model is trained on. Classification becomes more challenging +when dealing with new data from a different domain, where collecting a large +labeled dataset and training a new classifier from scratch is time-consuming, +expensive, and sometimes infeasible or impossible. Cross-domain classification +frameworks were developed to handle this data domain shift problem by utilizing +unsupervised image-to-image (UI2I) translation models to translate an input +image from the unlabeled domain to the labeled domain. The problem with these +unsupervised models lies in their unsupervised nature. For lack of annotations, +it is not possible to use the traditional supervised metrics to evaluate these +translation models to pick the best-saved checkpoint model. In this paper, we +introduce a new method called Pseudo Supervised Metrics that was designed +specifically to support cross-domain classification applications contrary to +other typically used metrics such as the FID which was designed to evaluate the +model in terms of the quality of the generated image from a human-eye +perspective. We show that our metric not only outperforms unsupervised metrics +such as the FID, but is also highly correlated with the true supervised +metrics, robust, and explainable. Furthermore, we demonstrate that it can be +used as a standard metric for future research in this field by applying it to a +critical real-world problem (the boiling crisis problem). + +
+
+ comment: arXiv admin note: text overlap with arXiv:2212.09107 +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ Investigation Toward The Economic Feasibility of Personalized Medicine + For Healthcare Service Providers: The Case of Bladder Cancer + + +
+ In today's complex healthcare landscape, the pursuit of delivering optimal +patient care while navigating intricate economic dynamics poses a significant +challenge for healthcare service providers (HSPs). Amid these already complex +dynamics, the emergence of clinically promising personalized-medicine-based +treatments aims to revolutionize care. While personalized medicine holds +tremendous potential for enhancing therapeutic outcomes, its integration within +resource-constrained HSPs presents formidable challenges. In this study, we +investigate the economic feasibility of implementing personalized medicine. The +central objective is to strike a balance between catering to individual patient +needs and making economically viable decisions. Unlike conventional binary +approaches to personalized treatment, we propose a more nuanced perspective by +treating personalization as a spectrum. This approach allows for greater +flexibility in decision-making and resource allocation. To this end, we propose +a mathematical framework to investigate our proposal, focusing on Bladder +Cancer (BC) as a case study. Our results show that while it is feasible to +introduce personalized medicine, a highly effective but highly expensive +treatment would be short-lived relative to a less effective but cheaper +alternative, as the latter can be provided to a larger cohort of patients and +thus better optimizes the HSP's objective. + +
+
+
+
+
+ + ☆ Synthesizing Political Zero-Shot Relation Classification via Codebook + Knowledge, NLI, and ChatGPT + + +
+ Recent supervised models for event coding vastly outperform pattern-matching +methods. However, their reliance solely on new annotations disregards the vast +knowledge within expert databases, hindering their applicability to +fine-grained classification. To address these limitations, we explore zero-shot +approaches for political event ontology relation classification, by leveraging +knowledge from established annotation codebooks. Our study encompasses both +ChatGPT and a novel natural language inference (NLI) based approach named ZSP. +ZSP adopts a tree-query framework that deconstructs the task into context, +modality, and class disambiguation levels. This framework improves +interpretability, efficiency, and adaptability to schema changes. By conducting +extensive experiments on our newly curated datasets, we pinpoint the +instability issues within ChatGPT and highlight the superior performance of +ZSP. ZSP achieves an impressive 40% improvement in F1 score for fine-grained +Rootcode classification. ZSP demonstrates competitive performance compared to +supervised BERT models, positioning it as a valuable tool for event record +validation and ontology development. Our work underscores the potential of +leveraging transfer learning and existing expertise to enhance the efficiency +and scalability of research in the field. + +
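The general mechanism ZSP builds on, NLI-based zero-shot classification, is available off the shelf; ZSP itself adds the tree-query framework with context, modality, and class disambiguation levels. A minimal usage sketch is below; the model choice, example sentence, and candidate labels are placeholders, not the paper's ontology.

```python
from transformers import pipeline

# Off-the-shelf NLI-based zero-shot classification.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

text = "Protesters clashed with police outside the parliament building."
candidate_labels = ["protest", "armed conflict", "diplomatic statement", "election"]
result = classifier(text, candidate_labels=candidate_labels)
print(result["labels"][0], result["scores"][0])   # top predicted label and score
```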
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Impression-Aware Recommender Systems + + +
+ Novel data sources bring new opportunities to improve the quality of +recommender systems. Impressions are a novel data source containing past +recommendations (shown items) and traditional interactions. Researchers may use +impressions to refine user preferences and overcome the current limitations in +recommender systems research. The relevance and interest of impressions have +increased over the years; hence, the need for a review of relevant work on this +type of recommenders. We present a systematic literature review on recommender +systems using impressions, focusing on three fundamental angles in research: +recommenders, datasets, and evaluation methodologies. We provide three +categorizations of papers describing recommenders using impressions, present +each reviewed paper in detail, describe datasets with impressions, and analyze +the existing evaluation methodologies. Lastly, we present open questions and +future directions of interest, highlighting aspects missing in the literature +that can be addressed in future works. + +
+
+ comment: 34 pages, 103 references, 6 tables, 2 figures, ACM UNDER REVIEW +
+
+
+
+
+ + ☆ Dynamic Embedding Size Search with Minimum Regret for Streaming + Recommender System CIKM2023 + + +
+ With the continuous increase of users and items, conventional recommender +systems trained on static datasets can hardly adapt to changing environments. +The high-throughput data requires the model to be updated in a timely manner +for capturing the user interest dynamics, which leads to the emergence of +streaming recommender systems. Due to the prevalence of deep learning-based +recommender systems, the embedding layer is widely adopted to represent the +characteristics of users, items, and other features in low-dimensional vectors. +However, it has been proved that setting an identical and static embedding size +is sub-optimal in terms of recommendation performance and memory cost, +especially for streaming recommendations. To tackle this problem, we first +rethink the streaming model update process and model the dynamic embedding size +search as a bandit problem. Then, we analyze and quantify the factors that +influence the optimal embedding sizes from the statistics perspective. Based on +this, we propose the \textbf{D}ynamic \textbf{E}mbedding \textbf{S}ize +\textbf{S}earch (\textbf{DESS}) method to minimize the embedding size selection +regret on both user and item sides in a non-stationary manner. Theoretically, +we obtain a sublinear regret upper bound superior to previous methods. +Empirical results across two recommendation tasks on four public datasets also +demonstrate that our approach can achieve better streaming recommendation +performance with lower memory cost and higher time efficiency. + +
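Modeling embedding size selection as a bandit can be pictured with a generic UCB loop over candidate dimensions; DESS itself additionally handles non-stationarity and makes decisions on both the user and item sides, so the class below is only an illustrative sketch, and the candidate sizes and reward signal are assumptions.

```python
import numpy as np

class EmbeddingSizeBandit:
    """UCB-style bandit over candidate embedding sizes.

    Each arm is an embedding dimension; the reward is whatever streaming
    metric the system optimizes (e.g., recent recall minus a memory cost).
    """
    def __init__(self, sizes=(8, 16, 32, 64, 128), c=2.0):
        self.sizes = list(sizes)
        self.c = c
        self.counts = np.zeros(len(self.sizes))
        self.values = np.zeros(len(self.sizes))
        self.t = 0

    def select(self):
        self.t += 1
        ucb = self.values + self.c * np.sqrt(
            np.log(self.t) / np.maximum(self.counts, 1e-8))
        ucb[self.counts == 0] = np.inf          # try every size once first
        return int(np.argmax(ucb))

    def update(self, arm, reward):
        self.counts[arm] += 1
        self.values[arm] += (reward - self.values[arm]) / self.counts[arm]

# Usage: pick a size for the next streaming update, observe the metric, update.
bandit = EmbeddingSizeBandit()
arm = bandit.select()
dim = bandit.sizes[arm]
bandit.update(arm, reward=0.31)   # e.g., observed recall on the next data block
```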
+
+ comment: Accepted for publication on CIKM2023 +
+
+
+
+
+ + ☆ Self-Supervised Dynamic Hypergraph Recommendation based on + Hyper-Relational Knowledge Graph + + +
+ Knowledge graphs (KGs) are commonly used as side information to enhance +collaborative signals and improve recommendation quality. In the context of +knowledge-aware recommendation (KGR), graph neural networks (GNNs) have emerged +as promising solutions for modeling factual and semantic information in KGs. +However, the long-tail distribution of entities leads to sparsity in +supervision signals, which weakens the quality of item representation when +utilizing KG enhancement. Additionally, the binary relation representation of +KGs simplifies hyper-relational facts, making it challenging to model complex +real-world information. Furthermore, the over-smoothing phenomenon results in +indistinguishable representations and information loss. To address these +challenges, we propose the SDK (Self-Supervised Dynamic Hypergraph +Recommendation based on Hyper-Relational Knowledge Graph) framework. This +framework establishes a cross-view hypergraph self-supervised learning +mechanism for KG enhancement. Specifically, we model hyper-relational facts in +KGs to capture interdependencies between entities under complete semantic +conditions. With the refined representation, a hypergraph is dynamically +constructed to preserve features in the deep vector space, thereby alleviating +the over-smoothing problem. Furthermore, we mine external supervision signals +from both the global perspective of the hypergraph and the local perspective of +collaborative filtering (CF) to guide the model prediction process. Extensive +experiments conducted on different datasets demonstrate the superiority of the +SDK framework over state-of-the-art models. The results showcase its ability to +alleviate the effects of over-smoothing and supervision signal sparsity. + +
+
+
+
+
+ + ☆ SPM: Structured Pretraining and Matching Architectures for Relevance + Modeling in Meituan Search CIKM '23 + + +
+ In e-commerce search, relevance between query and documents is an essential +requirement for satisfying user experience. Different from traditional +e-commerce platforms that offer products, users search on life service +platforms such as Meituan mainly for product providers, which usually have +abundant structured information, e.g. name, address, category, thousands of +products. Modeling search relevance with these rich structured contents is +challenging due to the following issues: (1) there is a language distribution +discrepancy among different fields of a structured document, making it difficult +to directly adopt off-the-shelf pretrained language model based methods like +BERT. (2) different fields usually have different importance and their lengths +vary greatly, making it difficult to extract document information helpful for +relevance matching. + To tackle these issues, in this paper we propose a novel two-stage +pretraining and matching architecture for relevance matching with rich +structured documents. At the pretraining stage, we propose an effective pretraining +method that employs both query and multiple fields of document as inputs, +including an effective information compression method for lengthy fields. At the +relevance matching stage, a novel matching method is proposed by leveraging +domain knowledge in the search query to generate more effective document +representations for relevance scoring. Extensive offline experiments and online +A/B tests on millions of users verify that the proposed architectures +effectively improve the performance of relevance modeling. The model has +already been deployed online, serving the search traffic of Meituan for over a +year. + 
+
+ comment: Accepted by CIKM '23 +
+
+
+
+
+ + ☆ Learning from All Sides: Diversified Positive Augmentation via + Self-distillation in Recommendation + + +
+ Personalized recommendation relies on user historical behaviors to provide +user-interested items, and thus seriously struggles with the data sparsity +issue. A powerful positive item augmentation is beneficial to address the +sparsity issue, yet few works jointly consider both the accuracy and +diversity of these augmented training labels. In this work, we propose a novel +model-agnostic Diversified self-distillation guided positive augmentation +(DivSPA) for accurate and diverse positive item augmentations. Specifically, +DivSPA first employs three types of retrieval strategies to collect +high-quality and diverse positive item candidates according to users' overall +interests, short-term intentions, and similar users. Next, a self-distillation +module is applied to double-check and rerank these candidates as the final +positive augmentations. Extensive offline and online evaluations verify the +effectiveness of our proposed DivSPA on both accuracy and diversity. DivSPA is +simple and effective, and can be conveniently adapted to other base models +and systems. Currently, DivSPA has been deployed on multiple widely-used +real-world recommender systems. + 
+
+
+
+
+ + ☆ Delphic Costs and Benefits in Web Search: A utilitarian and historical + analysis + + +
+ We present a new framework to conceptualize and operationalize the total user +experience of search, by studying the entirety of a search journey from a +utilitarian point of view. + Web search engines are widely perceived as "free". But search requires time +and effort: in reality there are many intermingled non-monetary costs (e.g. +time costs, cognitive costs, interactivity costs) and the benefits may be +marred by various impairments, such as misunderstanding and misinformation. +This characterization of costs and benefits appears to be inherent to the human +search for information within the pursuit of some larger task: most of the +costs and impairments can be identified in interactions with any web search +engine, interactions with public libraries, and even in interactions with +ancient oracles. To emphasize this innate connection, we call these costs and +benefits Delphic, in contrast to explicitly financial costs and benefits. + Our main thesis is that the users' satisfaction with a search engine mostly +depends on their experience of Delphic costs and benefits, in other words on +their utility. The consumer utility is correlated with classic measures of +search engine quality, such as ranking, precision, recall, etc., but is not +completely determined by them. To argue our thesis, we catalog the Delphic +costs and benefits and show how the development of search engines over the last +quarter century, from classic Information Retrieval roots to the integration of +Large Language Models, was driven to a great extent by the quest to decrease +Delphic costs and increase Delphic benefits. + We hope that the Delphic costs framework will engender new ideas and new +research for evaluating and improving the web experience for everyone. + 
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Decentralized Graph Neural Network for Privacy-Preserving Recommendation + + +
+ Building a graph neural network (GNN)-based recommender system without +violating user privacy proves challenging. Existing methods can be divided into +federated GNNs and decentralized GNNs. However, both kinds of methods have undesirable +drawbacks, i.e., low communication efficiency and privacy leakage. This paper +proposes DGREC, a novel decentralized GNN for privacy-preserving +recommendations, where users can choose to publicize their interactions. It +includes three stages, i.e., graph construction, local gradient calculation, +and global gradient passing. The first stage builds a local inner-item +hypergraph for each user and a global inter-user graph. The second stage models +user preference and calculates gradients on each local device. The third stage +designs a local differential privacy mechanism named secure gradient-sharing, +which provably preserves the privacy of users' private data. We conduct +extensive experiments on three public datasets to validate the consistent +superiority of our framework. + 
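The secure gradient-sharing mechanism itself is not spelled out in the abstract; as a rough illustration of the local differential privacy idea, a client could clip and noise its gradients before the global gradient passing stage. The clipping bound and noise calibration below are placeholder choices, not the paper's mechanism.

```python
# Generic local-DP-style gradient perturbation before sharing (illustrative only).
import numpy as np

def privatize_gradient(grad, clip_norm=1.0, epsilon=1.0, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    # 1) Clip the gradient to bound its sensitivity.
    norm = np.linalg.norm(grad)
    if norm > clip_norm:
        grad = grad * (clip_norm / norm)
    # 2) Add Laplace noise calibrated to the clipping bound (sensitivity ~ 2 * clip_norm).
    scale = 2.0 * clip_norm / epsilon
    noise = rng.laplace(loc=0.0, scale=scale, size=grad.shape)
    return grad + noise

# Each user would privatize local gradients like this before passing them to peers.
```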
+
+
+
+
+ + ☆ Temporal Interest Network for Click-Through Rate Prediction + + +
+ The history of user behaviors constitutes one of the most significant +characteristics in predicting the click-through rate (CTR), owing to their +strong semantic and temporal correlation with the target item. While the +literature has individually examined each of these correlations, research has +yet to analyze them in combination, that is, the quadruple correlation of +(behavior semantics, target semantics, behavior temporal, and target temporal). +The effect of this correlation on performance and the extent to which existing +methods learn it remain unknown. To address this gap, we empirically measure +the quadruple correlation and observe intuitive yet robust quadruple patterns. +We measure the learned correlation of several representative user behavior +methods, but to our surprise, none of them learn such a pattern, especially the +temporal one. + In this paper, we propose the Temporal Interest Network (TIN) to capture the +quadruple semantic and temporal correlation between behaviors and the target. +We achieve this by incorporating target-aware temporal encoding, in addition to +semantic embedding, to represent behaviors and the target. Furthermore, we +deploy target-aware attention, along with target-aware representation, to +explicitly conduct the 4-way interaction. We performed comprehensive +evaluations on the Amazon and Alibaba datasets. Our proposed TIN outperforms +the best-performing baselines by 0.43\% and 0.29\% on two datasets, +respectively. Comprehensive analysis and visualization show that TIN is indeed +capable of learning the quadruple correlation effectively, while all existing +methods fail to do so. We provide our implementation of TIN in Tensorflow. + +
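A minimal PyTorch sketch of combining target-aware temporal encoding with target-aware attention follows. The bucketized time-gap embedding and the simple 4-way interaction shown here are illustrative simplifications, not the exact TIN architecture or hyperparameters.

```python
# Minimal sketch of target-aware temporal encoding + target-aware attention.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TargetAwareInterest(nn.Module):
    def __init__(self, n_items, n_time_buckets, dim):
        super().__init__()
        self.item_emb = nn.Embedding(n_items, dim)          # semantic embedding
        self.time_emb = nn.Embedding(n_time_buckets, dim)   # target-relative time encoding

    def forward(self, behavior_ids, time_gap_buckets, target_ids):
        # behavior_ids: (B, L) past items; time_gap_buckets: (B, L) bucketized time gaps
        # between each behavior and the target; target_ids: (B,) target items.
        beh = self.item_emb(behavior_ids) + self.time_emb(time_gap_buckets)   # (B, L, d)
        tgt = self.item_emb(target_ids).unsqueeze(1)                          # (B, 1, d)
        # Target-aware attention: score each temporally encoded behavior against the target.
        scores = (beh * tgt).sum(-1) / beh.size(-1) ** 0.5                    # (B, L)
        attn = F.softmax(scores, dim=-1).unsqueeze(-1)                        # (B, L, 1)
        # Target-aware representation: weighted behaviors interacted with the target.
        interest = (attn * beh * tgt).sum(dim=1)                              # (B, d)
        return interest  # fed into a CTR prediction head
```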
+
+
+
+
+ + ♻ ☆ Data augmentation and refinement for recommender system: A + semi-supervised approach using maximum margin matrix factorization + + +
+ Collaborative filtering (CF) has become a popular method for developing +recommender systems (RSs) where ratings of a user for new items are predicted +based on her past preferences and available preference information of other +users. Despite the popularity of CF-based methods, their performance is often +greatly limited by the sparsity of observed entries. In this study, we explore +the data augmentation and refinement aspects of Maximum Margin Matrix +Factorization (MMMF), a widely accepted CF technique for rating predictions, +which has not been investigated before. We exploit the inherent characteristics +of CF algorithms to assess the confidence level of individual ratings and +propose a semi-supervised approach for rating augmentation based on +self-training. We hypothesize that any CF algorithm's predictions with low +confidence are due to some deficiency in the training data and hence, the +performance of the algorithm can be improved by adopting a systematic data +augmentation strategy. We iteratively use some of the ratings predicted with +high confidence to augment the training data and remove low-confidence entries +through a refinement process. By repeating this process, the system learns to +improve prediction accuracy. Our method is experimentally evaluated on several +state-of-the-art CF algorithms and leads to informative rating augmentation, +improving the performance of the baseline approaches. + +
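The augmentation-and-refinement loop described above can be sketched generically as self-training over a rating matrix. The model interface (`fit`, `predict_with_confidence`) and the thresholds below are hypothetical placeholders rather than the paper's concrete MMMF-based procedure.

```python
# Generic self-training loop for rating augmentation (illustrative, hypothetical API).
def self_training_augmentation(model, ratings, candidate_cells,
                               high_conf=0.9, low_conf=0.3, max_rounds=5):
    train = dict(ratings)  # observed (user, item) -> rating
    for _ in range(max_rounds):
        model.fit(train)
        added = 0
        for cell in list(candidate_cells):
            pred, conf = model.predict_with_confidence(*cell)  # hypothetical API
            if conf >= high_conf:
                train[cell] = pred            # augment with a confident pseudo-rating
                candidate_cells.remove(cell)
                added += 1
        # Refinement: drop previously added pseudo-ratings that became unreliable.
        for cell in [c for c in train if c not in ratings]:
            _, conf = model.predict_with_confidence(*cell)
            if conf < low_conf:
                del train[cell]
        if added == 0:
            break
    return model, train
```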
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Large Language Models for Information Retrieval: A Survey + + +
+ As a primary means of information acquisition, information retrieval (IR) +systems, such as search engines, have integrated themselves into our daily +lives. These systems also serve as components of dialogue, question-answering, +and recommender systems. The trajectory of IR has evolved dynamically from its +origins in term-based methods to its integration with advanced neural models. +While the neural models excel at capturing complex contextual signals and +semantic nuances, thereby reshaping the IR landscape, they still face +challenges such as data scarcity, interpretability, and the generation of +contextually plausible yet potentially inaccurate responses. This evolution +requires a combination of both traditional methods (such as term-based sparse +retrieval methods with rapid response) and modern neural architectures (such as +language models with powerful language understanding capacity). Meanwhile, the +emergence of large language models (LLMs), typified by ChatGPT and GPT-4, has +revolutionized natural language processing due to their remarkable language +understanding, generation, generalization, and reasoning abilities. +Consequently, recent research has sought to leverage LLMs to improve IR +systems. Given the rapid evolution of this research trajectory, it is necessary +to consolidate existing methodologies and provide nuanced insights through a +comprehensive overview. In this survey, we delve into the confluence of LLMs +and IR systems, including crucial aspects such as query rewriters, retrievers, +rerankers, and readers. Additionally, we explore promising directions within +this expanding field. + +
+
+
+
+
+ + ♻ ☆ Probe: Learning Users' Personalized Projection Bias in Intertemporal + Bundle Choices + + +
+ Intertemporal choices involve making decisions that require weighing the +costs in the present against the benefits in the future. One specific type of +intertemporal choice is the decision between purchasing an individual item and +opting for a bundle that includes that item. Previous research assumes that +individuals have accurate expectations of the factors involved in these +choices. However, in reality, users' perceptions of these factors are often +biased, leading to irrational and suboptimal decision-making. In this work, we +specifically focus on two commonly observed biases: projection bias and the +reference-point effect. To address these biases, we propose a novel +bias-embedded preference model called Probe. The Probe incorporates a weight +function to capture users' projection bias and a value function to account for +the reference-point effect, and introduces prospect theory from behavioral +economics to combine the weight and value functions. This allows us to +determine the probability of users selecting the bundle or a single item. We +provide a thorough theoretical analysis to demonstrate the impact of projection +bias on the design of bundle sales strategies. Through experimental results, we +show that the proposed Probe model outperforms existing methods and contributes +to a better understanding of users' irrational behaviors in bundle purchases. +This investigation can facilitate a deeper comprehension of users' +decision-making mechanisms, enable the provision of personalized services, and +assist users in making more rational and optimal decisions. + 
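For readers unfamiliar with the prospect-theory ingredients mentioned above, the following sketch shows the standard Tversky-Kahneman value and probability-weighting functions as plausible instantiations; Probe's actual parameterization of projection bias and the reference point is not reproduced here, and the parameter values are conventional defaults, not the paper's.

```python
# Standard prospect-theory building blocks (Tversky & Kahneman 1992 forms), for illustration.
import numpy as np

def value_function(x, reference=0.0, alpha=0.88, lam=2.25):
    # Outcomes are valued relative to a reference point; losses loom larger than gains.
    d = np.atleast_1d(np.asarray(x, dtype=float)) - reference
    out = np.empty_like(d)
    gains = d >= 0
    out[gains] = d[gains] ** alpha
    out[~gains] = -lam * (-d[~gains]) ** alpha
    return out

def weight_function(p, gamma=0.61):
    # Inverse-S shaped probability weighting: small probabilities are overweighted.
    return p ** gamma / (p ** gamma + (1 - p) ** gamma) ** (1 / gamma)

# A bundle-vs-item choice probability could then be modeled, e.g., via a softmax over
# the prospect-theoretic utilities of the two options.
```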
+
+
+
+
+
+
+
+ + Machine Learning 145 + +
+
+
+ + ☆ RAVEN: In-Context Learning with Retrieval Augmented Encoder-Decoder + Language Models + + +
+ In this paper, we investigate the in-context learning ability of +retrieval-augmented encoder-decoder language models. We first conduct a +comprehensive analysis of the state-of-the-art ATLAS model and identify its +limitations in in-context learning, primarily due to a mismatch between +pretraining and testing, as well as a restricted context length. To address +these issues, we propose RAVEN, a model that combines retrieval-augmented +masked language modeling and prefix language modeling. We further introduce +Fusion-in-Context Learning to enhance the few-shot performance by enabling the +model to leverage more in-context examples without requiring additional +training or model modifications. Through extensive experiments, we demonstrate +that RAVEN significantly outperforms ATLAS and achieves results comparable to +the most advanced language models in certain scenarios, despite having +substantially fewer parameters. Our work underscores the potential of +retrieval-augmented encoder-decoder language models for in-context learning and +encourages further research in this direction. + +
+
+
+
+
+ + ☆ The Regular Expression Inference Challenge + + +
+ We propose \emph{regular expression inference (REI)} as a challenge for +code/language modelling, and the wider machine learning community. REI is a +supervised machine learning (ML) and program synthesis task, and poses the +problem of finding minimal regular expressions from examples: Given two finite +sets of strings $P$ and $N$ and a cost function $\text{cost}(\cdot)$, the task +is to generate an expression $r$ that accepts all strings in $P$ and rejects +all strings in $N$, while no other such expression $r'$ exists with +$\text{cost}(r')<\text{cost}(r)$. + REI has advantages as a challenge problem: (i) regular expressions are +well-known, widely used, and a natural idealisation of code; (ii) REI's +asymptotic worst-case complexity is well understood; (iii) REI has a small +number of easy to understand parameters (e.g.~$P$ or $N$ cardinality, string +lengths of examples, or the cost function); this lets us easily finetune +REI-hardness; (iv) REI is an unsolved problem for deep learning based ML. + Recently, an REI solver was implemented on GPUs, using program synthesis +techniques. This enabled, for the first time, fast generation of minimal +expressions for complex REI instances. Building on this advance, we generate +and publish the first large-scale datasets for REI, and devise and evaluate +several initial heuristic and machine learning baselines. + We invite the community to participate and explore ML methods that learn to +solve REI problems. We believe that progress in REI directly translates to +code/language modelling. + +
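The verification side of an REI instance is easy to sketch: check that a candidate expression accepts every string in $P$, rejects every string in $N$, and score it with a cost function. The length-based cost and the toy examples below are placeholders; finding a provably minimal expression is the hard part of the challenge and is not attempted here.

```python
# Checking a candidate solution for a toy REI instance (cost function is a simple proxy).
import re

def accepts_all(pattern, strings):
    return all(re.fullmatch(pattern, s) is not None for s in strings)

def rejects_all(pattern, strings):
    return all(re.fullmatch(pattern, s) is None for s in strings)

def cost(pattern):
    return len(pattern)  # placeholder cost; the challenge allows other cost functions

P = {"ab", "aab", "aaab"}   # positive examples
N = {"b", "ba", ""}         # negative examples

candidate = r"a+b"
valid = accepts_all(candidate, P) and rejects_all(candidate, N)
print(candidate, "valid:", valid, "cost:", cost(candidate))
```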
+
+ comment: 7 pages, 3 pages appendix, 6 tables +
+
+
+
+
+ + ☆ SciRE-Solver: Efficient Sampling of Diffusion Probabilistic Models by + Score-integrand Solver with Recursive Derivative Estimation + + +
+ Diffusion probabilistic models (DPMs) are a powerful class of generative +models known for their ability to generate high-fidelity image samples. A major +challenge in the implementation of DPMs is the slow sampling process. In this +work, we introduce a high-efficiency sampler for DPMs. Specifically, we propose a +score-based exact solution paradigm for the diffusion ODEs corresponding to the +sampling process of DPMs, which introduces a new perspective on developing +numerical algorithms for solving diffusion ODEs. To achieve an efficient +sampler, we propose a recursive derivative estimation (RDE) method to reduce +the estimation error. With our proposed solution paradigm and RDE method, we +propose the score-integrand solver (SciRE-Solver), an efficient solver with a +convergence-order guarantee for solving diffusion ODEs. The SciRE-Solver +attains state-of-the-art (SOTA) sampling performance with a limited number of +score function evaluations (NFE) on both discrete-time and continuous-time DPMs +in comparison to existing training-free sampling algorithms. For example, we +achieve $3.48$ FID with $12$ NFE and $2.42$ FID with $20$ NFE for +continuous-time DPMs on CIFAR10, respectively. Different from other samplers, +SciRE-Solver has the promising potential to surpass the FIDs achieved in the +original papers of some pre-trained models with even fewer NFEs. For example, +we reach an SOTA value of $2.40$ FID with $100$ NFE for continuous-time DPM and of +$3.15$ FID with $84$ NFE for discrete-time DPM on CIFAR-10, as well as of +$2.17$ ($2.02$) FID with $18$ ($50$) NFE for discrete-time DPM on CelebA +64$\times$64. + 
+
+ comment: 42 pages, 23 figures. arXiv admin note: text overlap with + arXiv:2206.00927 by other authors +
+
+
+
+
+ + ☆ On regularized Radon-Nikodym differentiation + + +
+ We discuss the problem of estimating Radon-Nikodym derivatives. This problem +appears in various applications, such as covariate shift adaptation, +likelihood-ratio testing, mutual information estimation, and conditional +probability estimation. To address the above problem, we employ the general +regularization scheme in reproducing kernel Hilbert spaces. The convergence +rate of the corresponding regularized algorithm is established by taking into +account both the smoothness of the derivative and the capacity of the space in +which it is estimated. This is done in terms of general source conditions and +the regularized Christoffel functions. We also find that the reconstruction of +Radon-Nikodym derivatives at any particular point can be done with high order +of accuracy. Our theoretical results are illustrated by numerical simulations. + +
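As one concrete instance of kernel-regularized Radon-Nikodym (density-ratio) estimation, the sketch below implements an unconstrained least-squares importance fitting style estimator with an RBF kernel. It conveys the flavor of regularization in a reproducing kernel Hilbert space, but it is not the specific regularization scheme or source conditions analyzed in the paper.

```python
# uLSIF-style regularized density-ratio estimation with an RBF kernel (illustrative).
import numpy as np

def rbf_kernel(X, Y, sigma=1.0):
    d2 = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2 * sigma ** 2))

def estimate_ratio(Xp, Xq, lam=1e-2, sigma=1.0):
    """Estimate r = dP/dQ using kernel centers taken from the P-sample Xp."""
    centers = Xp
    Kq = rbf_kernel(Xq, centers, sigma)                 # (n_q, n_c)
    Kp = rbf_kernel(Xp, centers, sigma)                 # (n_p, n_c)
    H = Kq.T @ Kq / Xq.shape[0]                         # empirical second moment under Q
    h = Kp.mean(axis=0)                                 # empirical first moment under P
    alpha = np.linalg.solve(H + lam * np.eye(len(centers)), h)
    return lambda X: np.maximum(rbf_kernel(X, centers, sigma) @ alpha, 0.0)

# Example: P = N(1, 1), Q = N(0, 1); the true ratio is exp(x - 0.5).
rng = np.random.default_rng(0)
Xp = rng.normal(1.0, 1.0, size=(200, 1))
Xq = rng.normal(0.0, 1.0, size=(200, 1))
r_hat = estimate_ratio(Xp, Xq)
```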
+
+ comment: arXiv admin note: text overlap with arXiv:2307.11503 +
+
+
+
+
+ + ☆ Back to Basics: A Sanity Check on Modern Time Series Classification + Algorithms + + +
+ The state-of-the-art in time series classification has come a long way, from +the 1NN-DTW algorithm to the ROCKET family of classifiers. However, in the +current fast-paced development of new classifiers, taking a step back and +performing simple baseline checks is essential. These checks are often +overlooked, as researchers are focused on establishing new state-of-the-art +results, developing scalable algorithms, and making models explainable. +Nevertheless, there are many datasets that look like time series at first +glance, but classic algorithms such as tabular methods with no time ordering +may perform better on such problems. For example, for spectroscopy datasets, +tabular methods tend to significantly outperform recent time series methods. In +this study, we compare the performance of tabular models using classic machine +learning approaches (e.g., Ridge, LDA, RandomForest) with the ROCKET family of +classifiers (e.g., Rocket, MiniRocket, MultiRocket). Tabular models are simple +and very efficient, while the ROCKET family of classifiers are more complex and +have state-of-the-art accuracy and efficiency among recent time series +classifiers. We find that tabular models outperform the ROCKET family of +classifiers on approximately 19% of univariate and 28% of multivariate datasets +in the UCR/UEA benchmark and achieve accuracy within 10 percentage points on +about 50% of datasets. Our results suggest that it is important to consider +simple tabular models as baselines when developing time series classifiers. +These models are very fast, can be as effective as more complex methods and may +be easier to understand and deploy. + +
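A minimal example of the kind of tabular baseline advocated above: treat each time step as an ordinary feature and fit a ridge classifier with scikit-learn. The toy data are synthetic and the pipeline is far simpler than the paper's benchmark over the UCR/UEA archives.

```python
# Tabular baseline: each time step is just another feature for a linear classifier.
import numpy as np
from sklearn.linear_model import RidgeClassifierCV
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
# Toy dataset: two classes of length-100 series differing only in mean level,
# i.e., time ordering carries no information -- the regime where tabular methods shine.
X = np.vstack([rng.normal(0.0, 1.0, (100, 100)), rng.normal(0.5, 1.0, (100, 100))])
y = np.array([0] * 100 + [1] * 100)
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=0)

clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)).fit(Xtr, ytr)
print("tabular ridge accuracy:", clf.score(Xte, yte))
```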
+
+
+
+
+ + ☆ The Challenge of Fetal Cardiac MRI Reconstruction Using Deep Learning + + +
+ Dynamic free-breathing fetal cardiac MRI is one of the most challenging +modalities, which requires high temporal and spatial resolution to depict rapid +changes in a small fetal heart. The ability of deep learning methods to recover +undersampled data could help to optimise the kt-SENSE acquisition strategy and +improve non-gated kt-SENSE reconstruction quality. In this work, we explore +supervised deep learning networks for reconstruction of kt-SENSE style acquired +data using an extensive in vivo dataset. Having access to fully-sampled +low-resolution multi-coil fetal cardiac MRI, we study the ability of the +networks to recover fully-sampled data from undersampled data. We consider +model architectures together with training strategies taking into account their +application in the real clinical setup used to collect the dataset to enable +networks to recover prospectively undersampled data. We explore a set of +modifications to form a baseline performance evaluation for dynamic fetal +cardiac MRI on real data. We systematically evaluate the models on +coil-combined data to reveal the effect of the suggested changes to the +architecture in the context of fetal heart properties. We show that the +best-performers recover a detailed depiction of the maternal anatomy on a large +scale, but the dynamic properties of the fetal heart are under-represented. +Training directly on multi-coil data improves the performance of the models, +allows their prospective application to undersampled data and makes them +outperform CTFNet introduced for adult cardiac cine MRI. However, these models +deliver similar qualitative performances, recovering the maternal body very well +but underestimating the dynamic properties of the fetal heart. The fast and +highly localised dynamics of the fetal heart suggest that more targeted +training and evaluation methods might be needed for fetal cardiac +applications. + 
+
+
+
+
+ + ☆ Towards Temporal Edge Regression: A Case Study on Agriculture Trade + Between Nations + + +
+ Recently, Graph Neural Networks (GNNs) have shown promising performance in +tasks on dynamic graphs such as node classification, link prediction and graph +regression. However, little work has studied the temporal edge regression task, +which has important real-world applications. In this paper, we explore the +application of GNNs to edge regression tasks in both static and dynamic +settings, focusing on predicting food and agriculture trade values between +nations. We introduce three simple yet strong baselines and comprehensively +evaluate one static and three dynamic GNN models using the UN Trade dataset. +Our experimental results reveal that the baselines exhibit remarkably strong +performance across various settings, highlighting the inadequacy of existing +GNNs. We also find that TGN outperforms other GNN models, suggesting TGN is a +more appropriate choice for edge regression tasks. Moreover, we note that the +proportion of negative edges in the training samples significantly affects the +test performance. The companion source code can be found at: +https://github.com/scylj1/GNN_Edge_Regression. + 
+
+ comment: 12 pages, 4 figures, 4 tables +
+
+
+
+
+ + ☆ Synthesizing Political Zero-Shot Relation Classification via Codebook + Knowledge, NLI, and ChatGPT + + +
+ Recent supervised models for event coding vastly outperform pattern-matching +methods. However, their reliance solely on new annotations disregards the vast +knowledge within expert databases, hindering their applicability to +fine-grained classification. To address these limitations, we explore zero-shot +approaches for political event ontology relation classification, by leveraging +knowledge from established annotation codebooks. Our study encompasses both +ChatGPT and a novel natural language inference (NLI) based approach named ZSP. +ZSP adopts a tree-query framework that deconstructs the task into context, +modality, and class disambiguation levels. This framework improves +interpretability, efficiency, and adaptability to schema changes. By conducting +extensive experiments on our newly curated datasets, we pinpoint the +instability issues within ChatGPT and highlight the superior performance of +ZSP. ZSP achieves an impressive 40% improvement in F1 score for fine-grained +Rootcode classification. ZSP demonstrates competitive performance compared to +supervised BERT models, positioning it as a valuable tool for event record +validation and ontology development. Our work underscores the potential of +leveraging transfer learning and existing expertise to enhance the efficiency +and scalability of research in the field. + +
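A rough illustration of staged, NLI-based zero-shot classification in the spirit of the tree-query framework is shown below, using the Hugging Face zero-shot pipeline. The model choice, labels, and hypothesis templates are placeholders and do not reflect ZSP's released implementation or the actual codebook-derived queries.

```python
# Staged zero-shot classification with an off-the-shelf NLI model (placeholder labels).
from transformers import pipeline

nli = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def classify(text):
    # Level 1: context disambiguation (is a relevant event present at all?).
    ctx = nli(text,
              candidate_labels=["political conflict", "political cooperation", "unrelated"],
              hypothesis_template="This text describes {}.")
    top = ctx["labels"][0]
    if top == "unrelated":
        return "no event"
    # Level 2: finer class disambiguation within the selected branch of the tree.
    fine_labels = (["protest", "assault", "threaten"] if top == "political conflict"
                   else ["consult", "provide aid", "sign agreement"])
    fine = nli(text,
               candidate_labels=fine_labels,
               hypothesis_template="One actor did the following to another actor: {}.")
    return fine["labels"][0]

print(classify("Demonstrators gathered outside parliament to oppose the new law."))
```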
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Emotion Embeddings $\unicode{x2014}$ Learning Stable and Homogeneous + Abstractions from Heterogeneous Affective Datasets + + +
+ Human emotion is expressed in many communication modalities and media formats +and so its computational study is equally diversified into natural language +processing, audio signal analysis, computer vision, etc. Similarly, the large +variety of representation formats used in previous research to describe +emotions (polarity scales, basic emotion categories, dimensional approaches, +appraisal theory, etc.) has led to an ever-proliferating diversity of +datasets, predictive models, and software tools for emotion analysis. Because +of these two distinct types of heterogeneity, at the expressional and +representational level, there is a dire need to unify previous work on +increasingly diverging data and label types. This article presents such a +unifying computational model. We propose a training procedure that learns a +shared latent representation for emotions, so-called emotion embeddings, +independent of different natural languages, communication modalities, media or +representation label formats, and even disparate model architectures. +Experiments on a wide range of heterogeneous affective datasets indicate that +this approach yields the desired interoperability for the sake of reusability, +interpretability and flexibility, without penalizing prediction quality. Code +and data are archived under https://doi.org/10.5281/zenodo.7405327 . + 
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ☆ Brain-Inspired Computational Intelligence via Predictive Coding + + +
+ Artificial intelligence (AI) is rapidly becoming one of the key technologies +of this century. The majority of results in AI thus far have been achieved +using deep neural networks trained with the error backpropagation learning +algorithm. However, the ubiquitous adoption of this approach has highlighted +some important limitations such as substantial computational cost, difficulty +in quantifying uncertainty, lack of robustness, unreliability, and biological +implausibility. It is possible that addressing these limitations may require +schemes that are inspired and guided by neuroscience theories. One such theory, +called predictive coding (PC), has shown promising performance in machine +intelligence tasks, exhibiting exciting properties that make it potentially +valuable for the machine learning community: PC can model information +processing in different brain areas, can be used in cognitive control and +robotics, and has a solid mathematical grounding in variational inference, +offering a powerful inversion scheme for a specific class of continuous-state +generative models. With the hope of foregrounding research in this direction, +we survey the literature that has contributed to this perspective, highlighting +the many ways that PC might play a role in the future of machine learning and +computational intelligence at large. + +
+
+ comment: 37 Pages, 9 Figures +
+
+
+
+
+ + ☆ Graph-Structured Kernel Design for Power Flow Learning using Gaussian + Processes + + +
+ This paper presents a physics-inspired graph-structured kernel designed for +power flow learning using Gaussian Process (GP). The kernel, named the +vertex-degree kernel (VDK), relies on a latent decomposition of the voltage-injection +relationship based on the network graph or topology. Notably, the VDK design avoids +the need to solve optimization problems for kernel search. To enhance +efficiency, we also explore a graph-reduction approach to obtain a VDK +representation with fewer terms. Additionally, we propose a novel +network-swipe active learning scheme, which intelligently selects sequential +training inputs to accelerate the learning of VDK. Leveraging the additive +structure of VDK, the active learning algorithm performs a block-descent type +procedure on GP's predictive variance, serving as a proxy for information gain. +Simulations demonstrate that the proposed VDK-GP achieves a more than twofold +reduction in sample complexity, compared to full GP on medium scale 500-Bus and +large scale 1354-Bus power systems. The network-swipe algorithm outperforms the +mean performance of 500 random trials on test predictions by a factor of two for +medium-sized 500-Bus systems, and the best performance of 25 random trials for +large-scale 1354-Bus systems by 10%. Moreover, we demonstrate the proposed +method's performance for uncertainty quantification applications with +distributionally shifted testing data sets. + 
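One way to picture an additive, topology-based kernel is as a sum of RBF kernels, each restricted to the injections at a bus and its graph neighbors. The sketch below is only meant to convey that additive structure; it is not the paper's exact VDK construction, and the neighborhood grouping is an assumption made for illustration.

```python
# Additive kernel over graph neighborhoods (illustrative, not the exact VDK).
import numpy as np

def neighborhood_groups(adjacency):
    # adjacency: (n, n) 0/1 matrix; group i = bus i plus its neighbors.
    n = adjacency.shape[0]
    return [np.flatnonzero(adjacency[i] + np.eye(n)[i]) for i in range(n)]

def additive_rbf_kernel(X, Y, groups, lengthscale=1.0):
    # X: (a, n) nodal injections, Y: (b, n); sum of per-neighborhood RBF kernels.
    K = np.zeros((X.shape[0], Y.shape[0]))
    for g in groups:
        d2 = ((X[:, None, g] - Y[None, :, g]) ** 2).sum(-1)
        K += np.exp(-d2 / (2 * lengthscale ** 2))
    return K

# The resulting kernel matrix can be plugged into any standard GP regression routine
# to learn, e.g., a voltage magnitude as a function of nodal injections.
```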
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Impression-Aware Recommender Systems + + +
+ Novel data sources bring new opportunities to improve the quality of +recommender systems. Impressions are a novel data source containing past +recommendations (shown items) and traditional interactions. Researchers may use +impressions to refine user preferences and overcome the current limitations in +recommender systems research. The relevance and interest of impressions have +increased over the years; hence the need for a review of relevant work on this +type of recommender. We present a systematic literature review on recommender +systems using impressions, focusing on three fundamental angles in research: +recommenders, datasets, and evaluation methodologies. We provide three +categorizations of papers describing recommenders using impressions, present +each reviewed paper in detail, describe datasets with impressions, and analyze +the existing evaluation methodologies. Lastly, we present open questions and +future directions of interest, highlighting aspects missing in the literature +that can be addressed in future works. + 
+
+ comment: 34 pages, 103 references, 6 tables, 2 figures, ACM UNDER REVIEW +
+
+
+
+
+ + ☆ Dyadic Reinforcement Learning + + +
+ Mobile health aims to enhance health outcomes by delivering interventions to +individuals as they go about their daily life. The involvement of care partners +and social support networks often proves crucial in helping individuals +manage burdensome medical conditions. This presents opportunities in mobile +health to design interventions that target the dyadic relationship -- the +relationship between a target person and their care partner -- with the aim of +enhancing social support. In this paper, we develop dyadic RL, an online +reinforcement learning algorithm designed to personalize intervention delivery +based on contextual factors and past responses of a target person and their +care partner. Here, multiple sets of interventions impact the dyad across +multiple time intervals. The developed dyadic RL is Bayesian and hierarchical. +We formally introduce the problem setup, develop dyadic RL and establish a +regret bound. We demonstrate dyadic RL's empirical performance through +simulation studies on both toy scenarios and on a realistic test bed +constructed from data collected in a mobile health study. + 
+
+
+
+
+ + ☆ Simple and Efficient Partial Graph Adversarial Attack: A New Perspective + + +
+ As the study of graph neural networks becomes more intensive and +comprehensive, their robustness and security have received great research +interest. The existing global attack methods treat all nodes in the graph as +their attack targets. Although existing methods have achieved excellent +results, there is still considerable room for improvement. The key problem is +that the current approaches rigidly follow the definition of global attacks. +They ignore an important issue, i.e., different nodes have different robustness +and are not equally resilient to attacks. From a global attacker's view, we +should arrange the attack budget wisely, rather than wasting it on highly +robust nodes. To this end, we propose a new method named partial graph +attack (PGA), which selects the vulnerable nodes as attack targets. First, to +select the vulnerable nodes, we propose a hierarchical target selection policy, +which allows attackers to focus only on easy-to-attack nodes. Then, we propose +a cost-effective anchor-picking policy to pick the most promising anchors for +adding or removing edges, and a more aggressive iterative greedy-based attack +method to perform more efficient attacks. Extensive experimental results +demonstrate that PGA can achieve significant improvements in both attack effect +and attack efficiency compared to other existing graph global attack methods. + 
+
+
+
+
+ + ☆ REFORMS: Reporting Standards for Machine Learning Based Science + + +
+ Machine learning (ML) methods are proliferating in scientific research. +However, the adoption of these methods has been accompanied by failures of +validity, reproducibility, and generalizability. These failures can hinder +scientific progress, lead to false consensus around invalid claims, and +undermine the credibility of ML-based science. ML methods are often applied and +fail in similar ways across disciplines. Motivated by this observation, our +goal is to provide clear reporting standards for ML-based science. Drawing from +an extensive review of past literature, we present the REFORMS checklist +($\textbf{Re}$porting Standards $\textbf{For}$ $\textbf{M}$achine Learning +Based $\textbf{S}$cience). It consists of 32 questions and a paired set of +guidelines. REFORMS was developed based on a consensus of 19 researchers across +computer science, data science, mathematics, social sciences, and biomedical +sciences. REFORMS can serve as a resource for researchers when designing and +implementing a study, for referees when reviewing papers, and for journals when +enforcing standards for transparency and reproducibility. + +
+
+
+
+
+ + ☆ Cerberus: A Deep Learning Hybrid Model for Lithium-Ion Battery Aging + Estimation and Prediction Based on Relaxation Voltage Curves + + +
+ The degradation process of lithium-ion batteries is intricately linked to +their entire lifecycle as power sources and energy storage devices, +encompassing aspects such as performance delivery and cycling utilization. +Consequently, the accurate and expedient estimation or prediction of the aging +state of lithium-ion batteries has garnered extensive attention. Nonetheless, +prevailing research predominantly concentrates on either aging estimation or +prediction, neglecting the dynamic fusion of both facets. This paper proposes a +hybrid model for capacity aging estimation and prediction based on deep +learning, wherein salient features highly pertinent to aging are extracted from +charge and discharge relaxation processes. By amalgamating historical capacity +decay data, the model dynamically furnishes estimations of the present capacity +and forecasts of future capacity for lithium-ion batteries. Our approach is +validated against a novel dataset involving charge and discharge cycles at +varying rates. Specifically, under a charging condition of 0.25C, a mean +absolute percentage error (MAPE) of 0.29% is achieved. This outcome underscores +the model's adeptness in harnessing relaxation processes commonly encountered +in the real world and synergizing with historical capacity records within +battery management systems (BMS), thereby affording estimations and +prognostications of capacity decline with heightened precision. + +
+
+ comment: 3 figures, 1 table, 9 pages +
+
+
+
+
+ + ☆ Deep reinforcement learning for process design: Review and perspective + + +
+ The transformation towards renewable energy and feedstock supply in the +chemical industry requires new conceptual process design approaches. Recently, +breakthroughs in artificial intelligence offer opportunities to accelerate this +transition. Specifically, deep reinforcement learning, a subclass of machine +learning, has shown the potential to solve complex decision-making problems and +aid sustainable process design. We survey state-of-the-art research in +reinforcement learning for process design through three major elements: (i) +information representation, (ii) agent architecture, and (iii) environment and +reward. Moreover, we discuss perspectives on underlying challenges and +promising future works to unfold the full potential of reinforcement learning +for process design in chemical engineering. + +
+
+
+
+
+ + ☆ Quantifying the Cost of Learning in Queueing Systems + + +
+ Queueing systems are widely applicable stochastic models with use cases in +communication networks, healthcare, service systems, etc. Although their +optimal control has been extensively studied, most existing approaches assume +perfect knowledge of system parameters. Of course, this assumption rarely holds +in practice where there is parameter uncertainty, thus motivating a recent line +of work on bandit learning for queueing systems. This nascent stream of +research focuses on the asymptotic performance of the proposed algorithms. + In this paper, we argue that an asymptotic metric, which focuses on +late-stage performance, is insufficient to capture the intrinsic statistical +complexity of learning in queueing systems which typically occurs in the early +stage. Instead, we propose the Cost of Learning in Queueing (CLQ), a new metric +that quantifies the maximum increase in time-averaged queue length caused by +parameter uncertainty. We characterize the CLQ of a single-queue multi-server +system, and then extend these results to multi-queue multi-server systems and +networks of queues. In establishing our results, we propose a unified analysis +framework for CLQ that bridges Lyapunov and bandit analysis, which could be of +independent interest. + +
+
+
+
+
+ + ☆ Fairness and Privacy in Federated Learning and Their Implications in + Healthcare + + +
+ Currently, many contexts exist where distributed learning is difficult or +otherwise constrained by security and communication limitations. One common +domain where this is a consideration is healthcare, where data are often +governed by data-use ordinances such as HIPAA. On the other hand, larger sample +sizes and shared data models are necessary to allow models to better generalize +on account of the potential for more variability and balancing underrepresented +classes. Federated learning is a type of distributed learning model that allows +models to be trained in a decentralized manner. This, in turn, addresses data +security, privacy, and vulnerability considerations, as the data itself is not +shared across the nodes of a given learning network. Three main challenges in +federated learning are that node data are not independent and identically +distributed (iid), that clients require high levels of communication overhead +between peers, and that clients within a network are heterogeneous with respect +to dataset bias and size. As the field has grown, the +notion of fairness in federated learning has also been introduced through novel +implementations. Fairness approaches differ from the standard form of federated +learning and also have distinct challenges and considerations for the +healthcare domain. This paper endeavors to outline the typical lifecycle of +fair federated learning in research as well as provide an updated taxonomy to +account for the current state of fairness in implementations. Lastly, this +paper provides added insight into the implications and challenges of +implementing and supporting fairness in federated learning in the healthcare +domain. + 
+
+
+
+
+ + ☆ Adaptive Noise Covariance Estimation under Colored Noise using Dynamic + Expectation Maximization + + +
+ The accurate estimation of the noise covariance matrix (NCM) in a dynamic +system is critical for state estimation and control, as it has a major +influence on their optimality. Although a large number of NCM estimation +methods have been developed, most of them assume the noises to be white. +However, in many real-world applications, the noises are colored (e.g., they +exhibit temporal autocorrelations), resulting in suboptimal solutions. Here, we +introduce a novel brain-inspired algorithm that accurately and adaptively +estimates the NCM for dynamic systems subjected to colored noise. Particularly, +we extend the Dynamic Expectation Maximization algorithm to perform both online +noise covariance and state estimation by optimizing the free energy objective. +We mathematically prove that our NCM estimator converges to the global optimum +of this free energy objective. Using randomized numerical simulations, we show +that our estimator outperforms nine baseline methods with minimal noise +covariance estimation error under colored noise conditions. Notably, we show +that our method outperforms the best baseline (Variational Bayes) in joint +noise and state estimation under highly colored noise. We foresee that the accuracy +and the adaptive nature of our estimator make it suitable for online estimation +in real-world applications. + 
+
+ comment: 62nd IEEE Conference on Decision and Control +
+
+
+
+
+ + ☆ Informed Named Entity Recognition Decoding for Generative Language + Models + + +
+ Ever-larger language models with ever-increasing capabilities are by now +well-established text processing tools. Alas, information extraction tasks such +as named entity recognition are still largely unaffected by this progress as +they are primarily based on the previous generation of encoder-only transformer +models. Here, we propose a simple yet effective approach, Informed Named Entity +Recognition Decoding (iNERD), which treats named entity recognition as a +generative process. It leverages the language understanding capabilities of +recent generative models in a future-proof manner and employs an informed +decoding scheme incorporating the restricted nature of information extraction +into open-ended text generation, improving performance and eliminating any risk +of hallucinations. We coarse-tune our model on a merged named entity corpus to +strengthen its performance, evaluate five generative language models on eight +named entity recognition datasets, and achieve remarkable results, especially +in an environment with an unknown entity class set, demonstrating the +adaptability of the approach. + +
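The core of an informed decoding scheme can be sketched as masking the next-token distribution down to an allowed set (for example, entity-type labels) so that open-ended generation cannot drift off the tagging format. The token ids and label set below are placeholders, not iNERD's actual decoding constraints.

```python
# Constrained ("informed") decoding sketch: mask logits outside the allowed label set.
import torch

def constrain_logits(logits, allowed_token_ids):
    # logits: (vocab,) next-token scores; everything outside the allowed set is set
    # to -inf so greedy/beam decoding cannot hallucinate arbitrary strings.
    mask = torch.full_like(logits, float("-inf"))
    mask[allowed_token_ids] = 0.0
    return logits + mask

# Example: suppose the tokenizer maps the labels {PER, ORG, LOC, O} to these ids.
allowed = torch.tensor([1001, 1002, 1003, 1004])   # placeholder ids
logits = torch.randn(32000)
constrained = constrain_logits(logits, allowed)
next_token = int(constrained.argmax())             # guaranteed to be a label id
```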
+
+ comment: 12 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ DiffV2S: Diffusion-based Video-to-Speech Synthesis with Vision-guided + Speaker Embedding ICCV 2023 + + +
+ Recent research has demonstrated impressive results in video-to-speech +synthesis which involves reconstructing speech solely from visual input. +However, previous works have struggled to accurately synthesize speech due to a +lack of sufficient guidance for the model to infer the correct content with the +appropriate sound. To resolve the issue, they have adopted an extra speaker +embedding as a speaking style guidance from a reference auditory information. +Nevertheless, it is not always possible to obtain the audio information from +the corresponding video input, especially during the inference time. In this +paper, we present a novel vision-guided speaker embedding extractor using a +self-supervised pre-trained model and prompt tuning technique. In doing so, the +rich speaker embedding information can be produced solely from input visual +information, and the extra audio information is not necessary during the +inference time. Using the extracted vision-guided speaker embedding +representations, we further develop a diffusion-based video-to-speech synthesis +model, so called DiffV2S, conditioned on those speaker embeddings and the +visual representation extracted from the input video. The proposed DiffV2S not +only maintains phoneme details contained in the input video frames, but also +creates a highly intelligible mel-spectrogram in which the speaker identities +of the multiple speakers are all preserved. Our experimental results show that +DiffV2S achieves the state-of-the-art performance compared to the previous +video-to-speech synthesis technique. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Hierarchical generative modelling for autonomous robots + + +
+ Humans can produce complex whole-body motions when interacting with their +surroundings, by planning, executing and combining individual limb movements. +We investigated this fundamental aspect of motor control in the setting of +autonomous robotic operations. We approach this problem with hierarchical +generative modelling equipped with multi-level planning (for autonomous task +completion) that mimics the deep temporal architecture of human motor control. +Here, temporal depth refers to the nested time scales at which successive +levels of a forward or generative model unfold; for example, delivering an +object requires a global plan to contextualise the fast coordination of +multiple local movements of limbs. This separation of temporal scales also +motivates hierarchical designs in robotics and control. Specifically, to achieve versatile sensorimotor +control, it is advantageous to hierarchically structure the planning and +low-level motor control of individual limbs. We use numerical and physical +simulation to conduct experiments and to establish the efficacy of this +formulation. Using a hierarchical generative model, we show how a humanoid +robot can autonomously complete a complex task that necessitates a holistic use +of locomotion, manipulation, and grasping. Specifically, we demonstrate a +humanoid robot that can retrieve and transport a box, open and +walk through a door to reach its destination, and approach and kick a football, +while showing robust performance in the presence of body damage and ground +irregularities. Our findings demonstrate the effectiveness of using +human-inspired motor control algorithms, and our method provides a viable +hierarchical architecture for the autonomous completion of challenging +goal-directed tasks. + 
+
+
+
+
+ + ☆ A Graph Encoder-Decoder Network for Unsupervised Anomaly Detection + + +
+ A key component of many graph neural networks (GNNs) is the pooling +operation, which seeks to reduce the size of a graph while preserving important +structural information. However, most existing graph pooling strategies rely on +an assignment matrix obtained by employing a GNN layer, which is characterized +by trainable parameters, often leading to significant computational complexity +and a lack of interpretability in the pooling process. In this paper, we +propose an unsupervised graph encoder-decoder model to detect abnormal nodes +from graphs by learning an anomaly scoring function to rank nodes based on +their degree of abnormality. In the encoding stage, we design a novel pooling +mechanism, named LCPool, which leverages locality-constrained linear coding for +feature encoding to find a cluster assignment matrix by solving a least-squares +optimization problem with a locality regularization term. By enforcing locality +constraints during the coding process, LCPool is designed to be free from +learnable parameters, capable of efficiently handling large graphs, and can +effectively generate a coarser graph representation while retaining the most +significant structural characteristics of the graph. In the decoding stage, we +propose an unpooling operation, called LCUnpool, to reconstruct both the +structure and nodal features of the original graph. We conduct empirical +evaluations of our method on six benchmark datasets using several evaluation +metrics, and the results demonstrate its superiority over state-of-the-art +anomaly detection approaches. + +
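To give a flavor of locality-constrained linear coding, the sketch below solves a regularized least-squares problem per node, penalizing codes placed on distant cluster centers. This unconstrained, closed-form variant is illustrative and may differ from the exact objective and parameter-free pooling pipeline used by LCPool.

```python
# Locality-constrained linear coding of node features against cluster centers (illustrative).
import numpy as np

def llc_codes(X, centers, lam=1e-2, sigma=1.0):
    """X: (n, d) node features, centers: (k, d). Returns (n, k) soft assignments."""
    n, k = X.shape[0], centers.shape[0]
    codes = np.zeros((n, k))
    for i in range(n):
        diff = centers - X[i]                                       # (k, d)
        locality = np.exp(np.linalg.norm(diff, axis=1) / sigma)     # penalize far centers
        A = centers @ centers.T + lam * np.diag(locality ** 2)
        b = centers @ X[i]
        codes[i] = np.linalg.solve(A, b)
    return codes

# A cluster assignment matrix of this kind can coarsen a graph: X_pooled = codes.T @ X,
# with a coarsened adjacency obtained as codes.T @ A_graph @ codes.
```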
+
+
+
+
+ + ☆ MOLE: MOdular Learning FramEwork via Mutual Information Maximization + + +
+ This paper introduces an asynchronous and local learning framework for +neural networks, named Modular Learning Framework (MOLE). This framework +modularizes neural networks by layers, defines the training objective via +mutual information for each module, and sequentially trains each module by +mutual information maximization. MOLE turns training into local +optimization with gradients isolated across modules, a scheme that is more +biologically plausible than backpropagation (BP). We run experiments on vector-, grid- and +graph-type data. In particular, this framework is capable of solving both +graph- and node-level tasks for graph-type data. Therefore, MOLE has been +experimentally proven to be universally applicable to different types of data. + 
+
+ comment: accepted by icml llw +
+
+
+
+
+ + ☆ NeFL: Nested Federated Learning for Heterogeneous Clients + + +
+ Federated learning (FL) is a promising approach to privacy-preserving distributed +learning. However, during the training pipeline of FL, slow or incapable +clients (i.e., stragglers) slow down the total training time and degrade +performance. System heterogeneity, including heterogeneous computing and +network bandwidth, has been addressed to mitigate the impact of stragglers. +Previous studies split models to tackle the issue, but with limited +degrees of freedom in terms of model architecture. We propose nested federated +learning (NeFL), a generalized framework that efficiently divides a model into +submodels using both depthwise and widthwise scaling. NeFL is implemented by +interpreting models as solving ordinary differential equations (ODEs) with +adaptive step sizes. To address the inconsistency that arises when training +multiple submodels with different architectures, we decouple a few parameters. +NeFL enables resource-constrained clients to effectively join the FL pipeline +and the model to be trained with a larger amount of data. Through a series of +experiments, we demonstrate that NeFL leads to significant gains, especially +for the worst-case submodel (e.g., 8.33 improvement on CIFAR-10). Furthermore, +we demonstrate that NeFL aligns with recent studies in FL. + 
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ Backward Reasoning in Large Language Models for Verification + + +
+ Chain-of-Thought (CoT) prompting has shown promising performance in various +reasoning tasks. Recently, Self-Consistency \citep{wang2023selfconsistency} +proposes to sample a diverse set of reasoning chains that may lead to +different answers, and the answer that receives the most votes is selected. In +this paper, we propose a novel method to use backward reasoning in verifying +candidate answers. We mask a token in the question by ${\bf x}$ and ask the LLM +to predict the masked token when a candidate answer is provided by \textit{a +simple template}, i.e., ``\textit{\textbf{If we know the answer of the above +question is \{a candidate answer\}, what is the value of unknown variable ${\bf +x}$?}}'' Intuitively, the LLM is expected to predict the masked token +successfully if the provided candidate answer is correct. We further propose +FOBAR to combine forward and backward reasoning for estimating the probability +of candidate answers. We conduct extensive experiments on six data sets and +three LLMs. Experimental results demonstrate that FOBAR achieves +state-of-the-art performance on various reasoning benchmarks. + 
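The backward-verification template quoted above is straightforward to operationalize: build the masked prompt and score candidates by how often the model recovers the masked value. In the sketch below the `llm` callable is a placeholder, and the simple vote count stands in for the paper's actual combination of forward and backward probabilities.

```python
# Backward verification sketch: does the model recover the masked value x, given a
# candidate answer? `llm` is a placeholder callable returning a short text completion.
from collections import Counter

def backward_prompt(question_with_x, candidate_answer):
    return (f"{question_with_x}\n"
            f"If we know the answer to the above question is {candidate_answer}, "
            f"what is the value of unknown variable x?")

def verify(llm, question_with_x, true_masked_value, candidates, n_samples=8):
    scores = Counter()
    for ans in candidates:
        prompt = backward_prompt(question_with_x, ans)
        # Count how often backward reasoning recovers the masked value.
        hits = sum(llm(prompt).strip() == str(true_masked_value) for _ in range(n_samples))
        scores[ans] = hits
    return scores.most_common(1)[0][0]   # candidate best supported by backward reasoning
```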
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Exploiting Sparsity in Automotive Radar Object Detection Networks + + +
+ Having precise perception of the environment is crucial for ensuring the +secure and reliable functioning of autonomous driving systems. Radar object +detection networks are one fundamental part of such systems. CNN-based object +detectors showed good performance in this context, but they require large +compute resources. This paper investigates sparse convolutional object +detection networks, which combine powerful grid-based detection with low +compute resources. We investigate radar specific challenges and propose sparse +kernel point pillars (SKPP) and dual voxel point convolutions (DVPC) as +remedies for the grid rendering and sparse backbone architectures. We evaluate +our SKPP-DPVCN architecture on nuScenes, which outperforms the baseline by +5.89% and the previous state of the art by 4.19% in Car AP4.0. Moreover, +SKPP-DPVCN reduces the average scale error (ASE) by 21.41% over the baseline. + +
+
+
+
+
+ + ☆ Real Robot Challenge 2022: Learning Dexterous Manipulation from Offline + Data in the Real World + + +
+ Experimentation on real robots is demanding in terms of time and costs. For +this reason, a large part of the reinforcement learning (RL) community uses +simulators to develop and benchmark algorithms. However, insights gained in +simulation do not necessarily translate to real robots, in particular for tasks +involving complex interactions with the environment. The Real Robot Challenge +2022 therefore served as a bridge between the RL and robotics communities by +allowing participants to experiment remotely with a real robot - as easily as +in simulation. + In the last years, offline reinforcement learning has matured into a +promising paradigm for learning from pre-collected datasets, alleviating the +reliance on expensive online interactions. We therefore asked the participants +to learn two dexterous manipulation tasks involving pushing, grasping, and +in-hand orientation from provided real-robot datasets. An extensive software +documentation and an initial stage based on a simulation of the real set-up +made the competition particularly accessible. By giving each team plenty of +access budget to evaluate their offline-learned policies on a cluster of seven +identical real TriFinger platforms, we organized an exciting competition for +machine learners and roboticists alike. + In this work we state the rules of the competition, present the methods used +by the winning teams and compare their results with a benchmark of +state-of-the-art offline RL algorithms on the challenge datasets. + +
+
+
+
+
+ + ☆ Domain-Aware Fine-Tuning: Enhancing Neural Network Adaptability + + +
+ Fine-tuning pre-trained neural network models has become a widely adopted +approach across various domains. However, it can lead to the distortion of +pre-trained feature extractors that already possess strong generalization +capabilities. Mitigating feature distortion during adaptation to new target +domains is crucial. Recent studies have shown promising results in handling +feature distortion by aligning the head layer on in-distribution datasets +before performing fine-tuning. Nonetheless, a significant limitation arises +from the treatment of batch normalization layers during fine-tuning, leading to +suboptimal performance. In this paper, we propose Domain-Aware Fine-Tuning +(DAFT), a novel approach that incorporates batch normalization conversion and +the integration of linear probing and fine-tuning. Our batch normalization +conversion method effectively mitigates feature distortion by reducing +modifications to the neural network during fine-tuning. Additionally, we +introduce the integration of linear probing and fine-tuning to optimize the +head layer with gradual adaptation of the feature extractor. By leveraging +batch normalization layers and integrating linear probing and fine-tuning, our +DAFT significantly mitigates feature distortion and achieves improved model +performance on both in-distribution and out-of-distribution datasets. Extensive +experiments show that our method outperforms other baseline methods, +demonstrating its effectiveness in both improving performance and +mitigating feature distortion. + +
+
+
+
+
+ + ☆ Fast Machine Unlearning Without Retraining Through Selective Synaptic + Dampening + + +
+ Machine unlearning, the ability for a machine learning model to forget, is +becoming increasingly important to comply with data privacy regulations, as +well as to remove harmful, manipulated, or outdated information. The key +challenge lies in forgetting specific information while protecting model +performance on the remaining data. While current state-of-the-art methods +perform well, they typically require some level of retraining over the retained +data, in order to protect or restore model performance. This adds computational +overhead and mandates that the training data remain available and accessible, +which may not be feasible. In contrast, other methods employ a retrain-free +paradigm; however, these approaches are prohibitively computationally expensive +and do not perform on par with their retrain-based counterparts. We present +Selective Synaptic Dampening (SSD), a novel two-step, post hoc, retrain-free +approach to machine unlearning which is fast, performant, and does not require +long-term storage of the training data. First, SSD uses the Fisher information +matrix of the training and forgetting data to select parameters that are +disproportionately important to the forget set. Second, SSD induces forgetting +by dampening these parameters proportional to their relative importance to the +forget set with respect to the wider training data. We evaluate our method +against several existing unlearning methods in a range of experiments using +ResNet18 and Vision Transformer. Results show that the performance of SSD is +competitive with retrain-based post hoc methods, demonstrating the viability of +retrain-free post hoc unlearning approaches. + +
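+ A rough PyTorch sketch of the two-step dampening idea described above. The diagonal Fisher approximation via squared gradients, the selection threshold `alpha`, and the dampening rule are assumptions made for illustration, not the paper's exact procedure or hyperparameters.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def diag_fisher(model, loader, device="cpu"):
+     """Diagonal Fisher information, approximated by averaged squared gradients."""
+     fisher = {n: torch.zeros_like(p) for n, p in model.named_parameters()}
+     model.eval()
+     for x, y in loader:
+         model.zero_grad()
+         loss = F.cross_entropy(model(x.to(device)), y.to(device))
+         loss.backward()
+         for n, p in model.named_parameters():
+             if p.grad is not None:
+                 fisher[n] += p.grad.detach() ** 2
+     return {n: f / max(len(loader), 1) for n, f in fisher.items()}
+
+ @torch.no_grad()
+ def selective_dampening(model, fisher_full, fisher_forget, alpha=10.0, lam=1.0):
+     """Step 1: select parameters disproportionately important to the forget set.
+     Step 2: shrink them in proportion to that relative importance."""
+     for n, p in model.named_parameters():
+         ratio = fisher_forget[n] / (fisher_full[n] + 1e-12)
+         mask = ratio > alpha                                   # selection
+         scale = torch.clamp(lam / (ratio + 1e-12), max=1.0)    # dampening factor
+         p[mask] = p[mask] * scale[mask]
+ ```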
+
+
+
+
+ + ☆ Exploring Transfer Learning in Medical Image Segmentation using + Vision-Language Models + + +
+ Medical Image Segmentation is crucial in various clinical applications within +the medical domain. While state-of-the-art segmentation models have proven +effective, integrating textual guidance to enhance visual features for this +task remains an area with limited progress. Existing segmentation models that +utilize textual guidance are primarily trained on open-domain images, raising +concerns about their direct applicability in the medical domain without manual +intervention or fine-tuning. + To address these challenges, we propose using multimodal vision-language +models for capturing semantic information from image descriptions and images, +enabling the segmentation of diverse medical images. This study comprehensively +evaluates existing vision language models across multiple datasets to assess +their transferability from the open domain to the medical field. Furthermore, +we introduce variations of image descriptions for previously unseen images in +the dataset, revealing notable variations in model performance based on the +generated prompts. + Our findings highlight the distribution shift between the open-domain images +and the medical domain and show that the segmentation models trained on +open-domain images are not directly transferable to the medical field. However, +their performance can be improved by fine-tuning them on the medical datasets. +We report the zero-shot and fine-tuned segmentation performance of 4 Vision +Language Models (VLMs) on 11 medical datasets using 9 types of prompts derived +from 14 attributes. + +
+
+ comment: 25 pages, 9 figures +
+
+
+
+
+ + ☆ Parametric entropy based Cluster Centroid Initialization for k-means + clustering of various Image datasets SP + + +
+ One of the most widely used yet simplest algorithms for cluster analysis is the +k-means algorithm. k-means has seen successful use in artificial +intelligence, market segmentation, fraud detection, data mining, and psychology, +to name only a few areas. The k-means algorithm, however, does not always yield +the best quality results. Its performance heavily depends upon the number of +clusters supplied and the proper initialization of the cluster centroids or +seeds. In this paper, we conduct an analysis of the performance of k-means on +image data by employing parametric entropies in an entropy based centroid +initialization method and propose the best fitting entropy measures for general +image datasets. We use several entropies such as Taneja entropy, Kapur entropy, +Aczel-Daroczy entropy, and Sharma-Mittal entropy. We observe that for different +datasets, different entropies provide better results than the conventional +methods. We have applied our proposed algorithm to these datasets: Satellite, +Toys, Fruits, Cars, Brain MRI, and Covid X-Ray. + +
+
+ comment: 6 Pages, 2 tables, one algorithm. Accepted for publication in IEEE + International Conference on Signal Processing and Computer Vision (SPCV-2023) +
+
+
+
+
+ + ☆ Enhancing Network Initialization for Medical AI Models Using + Large-Scale, Unlabeled Natural Images + + +
+ Pre-training datasets, like ImageNet, have become the gold standard in +medical image analysis. However, the emergence of self-supervised learning +(SSL), which leverages unlabeled data to learn robust features, presents an +opportunity to bypass the intensive labeling process. In this study, we +explored if SSL for pre-training on non-medical images can be applied to chest +radiographs and how it compares to supervised pre-training on non-medical +images and on medical images. We utilized a vision transformer and initialized +its weights based on (i) SSL pre-training on natural images (DINOv2), (ii) SL +pre-training on natural images (ImageNet dataset), and (iii) SL pre-training on +chest radiographs from the MIMIC-CXR database. We tested our approach on over +800,000 chest radiographs from six large global datasets, diagnosing more than +20 different imaging findings. Our SSL pre-training on curated images not only +outperformed ImageNet-based pre-training (P<0.001 for all datasets) but, in +certain cases, also exceeded SL on the MIMIC-CXR dataset. Our findings suggest +that selecting the right pre-training strategy, especially with SSL, can be +pivotal for improving artificial intelligence (AI)'s diagnostic accuracy in +medical imaging. By demonstrating the promise of SSL in chest radiograph +analysis, we underline a transformative shift towards more efficient and +accurate AI models in medical imaging. + +
+
+
+
+
+ + ☆ DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using + Pre-trained Diffusion Models ICCV2023 + + +
+ Given a classifier, the inherent property of semantic Out-of-Distribution +(OOD) samples is that their contents differ from all legal classes in terms of +semantics, namely semantic mismatch. There is a recent work that directly +applies it to OOD detection, which employs a conditional Generative Adversarial +Network (cGAN) to enlarge semantic mismatch in the image space. While achieving +remarkable OOD detection performance on small datasets, it is not applicable to +ImageNet-scale datasets due to the difficulty in training cGANs with both input +images and labels as conditions. As diffusion models are much easier to train +and amenable to various conditions compared to cGANs, in this work, we propose +to directly use pre-trained diffusion models for semantic mismatch-guided OOD +detection, named DiffGuard. Specifically, given an OOD input image and the +predicted label from the classifier, we try to enlarge the semantic difference +between the reconstructed OOD image under these conditions and the original +input image. We also present several test-time techniques to further strengthen +such differences. Experimental results show that DiffGuard is effective on both +Cifar-10 and hard cases of the large-scale ImageNet, and it can be easily +combined with existing OOD detection techniques to achieve state-of-the-art OOD +detection results. + +
+
+ comment: Accepted by ICCV2023, with supplementary materials +
+
+
+
+
+ + ☆ Gradient-Based Post-Training Quantization: Challenging the Status Quo + + +
+ Quantization has become a crucial step for the efficient deployment of deep +neural networks, where floating point operations are converted to simpler fixed +point operations. In its most naive form, it simply consists of a combination +of scaling and rounding transformations, leading to either a limited +compression rate or a significant accuracy drop. Recently, Gradient-based +post-training quantization (GPTQ) methods appear to constitute a suitable +trade-off between such simple methods and more powerful, yet expensive +Quantization-Aware Training (QAT) approaches, particularly when attempting to +quantize LLMs, where scalability of the quantization process is of paramount +importance. GPTQ essentially consists of learning the rounding operation using +a small calibration set. In this work, we challenge common choices in GPTQ +methods. In particular, we show that the process is, to a certain extent, +robust to a number of variables (weight selection, feature augmentation, choice +of calibration set). More importantly, we derive a number of best practices for +designing more efficient and scalable GPTQ methods, regarding the problem +formulation (loss, degrees of freedom, use of non-uniform quantization schemes) +or optimization process (choice of variable and optimizer). Lastly, we propose +a novel importance-based mixed-precision technique. These guidelines lead to +significant performance improvements on all the tested state-of-the-art GPTQ +methods and networks (e.g. +6.819 points on ViT for 4-bit quantization), paving +the way for the design of scalable, yet effective quantization methods. + +
+
+
+
+
+ + ☆ Attention Is Not All You Need Anymore + + +
+ In recent years, the popular Transformer architecture has achieved great +success in many application areas, including natural language processing and +computer vision. Many existing works aim to reduce the computational and memory +complexity of the self-attention mechanism in the Transformer by trading off +performance. However, performance is key for the continuing success of the +Transformer. In this paper, a drop-in replacement for the self-attention +mechanism in the Transformer, called the Extractor, is proposed. Experimental +results show that replacing the self-attention mechanism with the Extractor +improves the performance of the Transformer. Furthermore, the proposed +Extractor has the potential to run faster than the self-attention since it has +a much shorter critical path of computation. Additionally, the sequence +prediction problem in the context of text generation is formulated using +variable-length discrete-time Markov chains, and the Transformer is reviewed +based on our understanding. + +
+
+
+
+
+ + ☆ From Commit Message Generation to History-Aware Commit Message + Completion + + +
+ Commit messages are crucial to software development, allowing developers to +track changes and collaborate effectively. Despite their utility, most commit +messages lack important information since writing high-quality commit messages +is tedious and time-consuming. The active research on commit message generation +(CMG) has not yet led to wide adoption in practice. We argue that if we could +shift the focus from commit message generation to commit message completion and +use previous commit history as additional context, we could significantly +improve the quality and the personal nature of the resulting commit messages. + In this paper, we propose and evaluate both of these novel ideas. Since the +existing datasets lack historical data, we collect and share a novel dataset +called CommitChronicle, containing 10.7M commits across 20 programming +languages. We use this dataset to evaluate the completion setting and the +usefulness of the historical context for state-of-the-art CMG models and +GPT-3.5-turbo. Our results show that in some contexts, commit message +completion shows better results than generation, and that while in general +GPT-3.5-turbo performs worse, it shows potential for long and detailed +messages. As for the history, the results show that historical information +improves the performance of CMG models in the generation task, and the +performance of GPT-3.5-turbo in both generation and completion. + +
+
+ comment: Accepted to ASE'23. 13 pages, 5 figures +
+
+
+
+
+ + ☆ Ternary Singular Value Decomposition as a Better Parameterized Form in + Linear Mapping + + +
+ We present a simple yet novel parameterized form of linear mapping that +achieves remarkable network compression performance: a pseudo SVD called +Ternary SVD (TSVD). + Unlike vanilla SVD, TSVD limits the $U$ and $V$ matrices in SVD to ternary +matrices with entries in $\{\pm 1, 0\}$. This means that instead of using the expensive +multiplication instructions, TSVD only requires addition instructions when +computing $U(\cdot)$ and $V(\cdot)$. + We provide direct and training transition algorithms for TSVD, analogous to Post +Training Quantization and Quantization Aware Training respectively. +Additionally, we analyze the convergence of the direct transition algorithms +theoretically. + In experiments, we demonstrate that TSVD can achieve state-of-the-art network +compression performance in various types of networks and tasks, including +current baseline models such as ConvNext, Swin, and BERT, and large language models +such as OPT. + +
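+ An illustrative NumPy sketch of a ternary-factor decomposition in the spirit described above: $U$ and $V$ are restricted to entries in $\{\pm 1, 0\}$ and only the diagonal scale stays real-valued. The quantile thresholding and least-squares rescaling below are assumptions for demonstration, not the paper's direct or training transition algorithms.
+
+ ```python
+ import numpy as np
+
+ def ternarize(M, sparsity=0.3):
+     """Keep only the sign of entries whose magnitude clears a per-column quantile."""
+     thresh = np.quantile(np.abs(M), sparsity, axis=0, keepdims=True)
+     return np.sign(M) * (np.abs(M) >= thresh)
+
+ def ternary_svd(W, rank):
+     U, s, Vt = np.linalg.svd(W, full_matrices=False)
+     U_t = ternarize(U[:, :rank])
+     V_t = ternarize(Vt[:rank].T).T
+     # Refit the diagonal scale by least squares so that U_t @ diag(s_new) @ V_t ~= W.
+     A = np.stack([np.outer(U_t[:, k], V_t[k]).ravel() for k in range(rank)], axis=1)
+     s_new, *_ = np.linalg.lstsq(A, W.ravel(), rcond=None)
+     return U_t, s_new, V_t
+
+ if __name__ == "__main__":
+     W = np.random.default_rng(0).normal(size=(64, 32))
+     U_t, s, V_t = ternary_svd(W, rank=16)
+     approx = U_t @ np.diag(s) @ V_t
+     print("relative error:", np.linalg.norm(W - approx) / np.linalg.norm(W))
+ ```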
+
+
+
+
+ + ☆ Backpropagation Path Search On Adversarial Transferability ICCV2023 + + +
+ Deep neural networks are vulnerable to adversarial examples, making it +imperative to test a model's robustness before deployment. Transfer-based +attackers craft adversarial examples against surrogate models and transfer them +to victim models deployed in black-box settings. To enhance the +adversarial transferability, structure-based attackers adjust the +backpropagation path to prevent the attack from overfitting the surrogate model. +However, existing structure-based attackers fail to explore the convolution +module in CNNs and modify the backpropagation graph heuristically, leading to +limited effectiveness. In this paper, we propose backPropagation pAth Search +(PAS) to solve these two problems. We first propose SkipConv to +adjust the backpropagation path of convolution by structural +reparameterization. To overcome the drawback of heuristically designed +backpropagation paths, we further construct a DAG-based search space, utilize +one-step approximation for path evaluation and employ Bayesian Optimization to +search for the optimal path. We conduct comprehensive experiments in a wide +range of transfer settings, showing that PAS improves the attack success rate +by a large margin for both normally trained and defense models. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ A Multilayer Perceptron-based Fast Sunlight Assessment for the + Conceptual Design of Residential Neighborhoods under Chinese Policy + + +
+ In Chinese building codes, it is required that residential buildings receive +a minimum number of hours of natural, direct sunlight on a specified winter +day, which represents the worst sunlight condition in a year. This requirement +is a prerequisite for obtaining a building permit during the conceptual design +of a residential project. Thus, officially sanctioned software is usually used +to assess the sunlight performance of buildings. These software programs +predict sunlight hours based on repeated shading calculations, which is +time-consuming. This paper proposes a multilayer perceptron-based method, a +one-stage prediction approach, which outputs the shading time interval caused by +an input cuboid-form building. The sunlight hours of a site can be obtained +by calculating the union of the sunlight time intervals (complement of shading +time interval) of all the buildings. Three numerical experiments, i.e., +horizontal-level analysis, slope analysis, and simulation-based optimization, are +carried out; the results show that the method reduces the computation time to +between 1/84 and 1/50 of the original while achieving 96.5% to 98% accuracy. A residential neighborhood layout planning +plug-in for Rhino 7/Grasshopper is also developed based on the proposed model. +This paper indicates that deep learning techniques can be adopted to accelerate +sunlight hour simulations at the conceptual design phase. + +
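+ The interval bookkeeping behind the sentence above can be sketched in a few lines of Python: given one shading time interval per building (as the MLP would predict), the sunlight hours at a site are the daylight window minus the union of the shading intervals. The daylight window and example intervals are made-up values; the MLP itself is out of scope here.
+
+ ```python
+ def merge(intervals):
+     """Union of possibly overlapping (start, end) intervals."""
+     merged = []
+     for s, e in sorted(i for i in intervals if i[1] > i[0]):
+         if merged and s <= merged[-1][1]:
+             merged[-1] = (merged[-1][0], max(merged[-1][1], e))
+         else:
+             merged.append((s, e))
+     return merged
+
+ def sunlight_hours(shading_intervals, daylight=(8.0, 16.0)):
+     """Daylight window length minus the total length of the merged shading intervals."""
+     start, end = daylight
+     clipped = [(max(s, start), min(e, end)) for s, e in shading_intervals]
+     shaded = sum(e - s for s, e in merge(clipped))
+     return (end - start) - shaded
+
+ if __name__ == "__main__":
+     # Two buildings shade the site 9:00-11:30 and 10:45-13:00.
+     print(sunlight_hours([(9.0, 11.5), (10.75, 13.0)]))  # 4.0 hours of sun
+ ```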
+
+
+
+
+ + ☆ Searching for Novel Chemistry in Exoplanetary Atmospheres using Machine + Learning for Anomaly Detection + + +
+ The next generation of telescopes will yield a substantial increase in the +availability of high-resolution spectroscopic data for thousands of exoplanets. +The sheer volume of data and number of planets to be analyzed greatly motivate +the development of new, fast and efficient methods for flagging interesting +planets for reobservation and detailed analysis. We advocate the application of +machine learning (ML) techniques for anomaly (novelty) detection to exoplanet +transit spectra, with the goal of identifying planets with unusual chemical +composition and even searching for unknown biosignatures. We successfully +demonstrate the feasibility of two popular anomaly detection methods (Local +Outlier Factor and One Class Support Vector Machine) on a large public database +of synthetic spectra. We consider several test cases, each with different +levels of instrumental noise. In each case, we use ROC curves to quantify and +compare the performance of the two ML techniques. + +
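+ A hedged scikit-learn sketch of the two detectors named above applied to spectra represented as fixed-length feature vectors; the synthetic arrays stand in for the public database of synthetic spectra and do not reproduce the paper's setup.
+
+ ```python
+ import numpy as np
+ from sklearn.neighbors import LocalOutlierFactor
+ from sklearn.svm import OneClassSVM
+ from sklearn.metrics import roc_auc_score
+
+ rng = np.random.default_rng(0)
+ X_train = rng.normal(size=(500, 20))                      # "ordinary" spectra
+ X_test = np.vstack([rng.normal(size=(100, 20)),           # ordinary
+                     rng.normal(loc=3.0, size=(10, 20))])  # anomalous chemistry
+ y_test = np.r_[np.zeros(100), np.ones(10)]
+
+ for name, det in [("LOF", LocalOutlierFactor(n_neighbors=20, novelty=True)),
+                   ("OC-SVM", OneClassSVM(nu=0.05, gamma="scale"))]:
+     det.fit(X_train)
+     # score_samples returns higher values for more "normal" points, so negate it.
+     scores = -det.score_samples(X_test)
+     print(name, "ROC AUC:", round(roc_auc_score(y_test, scores), 3))
+ ```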
+
+ comment: Submitted to AAS Journals, 30 pages, 14 figures +
+
+
+
+
+ + ☆ Generating Personas for Games with Multimodal Adversarial Imitation + Learning + + +
+ Reinforcement learning has been widely successful in producing agents capable +of playing games at a human level. However, this requires complex reward +engineering, and the agent's resulting policy is often unpredictable. Going +beyond reinforcement learning is necessary to model a wide range of human +playstyles, which can be difficult to represent with a reward function. This +paper presents a novel imitation learning approach to generate multiple persona +policies for playtesting. Multimodal Generative Adversarial Imitation Learning +(MultiGAIL) uses an auxiliary input parameter to learn distinct personas using +a single-agent model. MultiGAIL is based on generative adversarial imitation +learning and uses multiple discriminators as reward models, inferring the +environment reward by comparing the agent and distinct expert policies. The +reward from each discriminator is weighted according to the auxiliary input. +Our experimental analysis demonstrates the effectiveness of our technique in +two environments with continuous and discrete action spaces. + +
+
+ comment: Published in CoG 2023 +
+
+
+
+
+ + ☆ High-Probability Risk Bounds via Sequential Predictors + + +
+ Online learning methods yield sequential regret bounds under minimal +assumptions and provide in-expectation risk bounds for statistical learning. +However, despite the apparent advantage of online guarantees over their +statistical counterparts, recent findings indicate that in many important +cases, regret bounds may not guarantee tight high-probability risk bounds in +the statistical setting. In this work we show that online to batch conversions +applied to general online learning algorithms can bypass this limitation. Via a +general second-order correction to the loss function defining the regret, we +obtain nearly optimal high-probability risk bounds for several classical +statistical estimation problems, such as discrete distribution estimation, +linear regression, logistic regression, and conditional density estimation. Our +analysis relies on the fact that many online learning algorithms are improper, +as they are not restricted to use predictors from a given reference class. The +improper nature of our estimators enables significant improvements in the +dependencies on various problem parameters. Finally, we discuss some +computational advantages of our sequential algorithms over their existing batch +counterparts. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Story Visualization by Online Text Augmentation with Context Memory ICCV 2023 + + +
+ Story visualization (SV) is a challenging text-to-image generation task due to +the difficulty of not only rendering visual details from the text descriptions +but also encoding a long-term context across multiple sentences. While prior +efforts mostly focus on generating a semantically relevant image for each +sentence, encoding a context spread across the given paragraph to generate +contextually convincing images (e.g., with a correct character or with a proper +background of the scene) remains a challenge. To this end, we propose a novel +memory architecture for the Bi-directional Transformers with an online text +augmentation that generates multiple pseudo-descriptions as supplementary +supervision during training, for better generalization to the language +variation at inference. In extensive experiments on the two popular SV +benchmarks, i.e., the Pororo-SV and Flintstones-SV, the proposed method +significantly outperforms the state of the art on various evaluation metrics +including FID, character F1, frame accuracy, BLEU-2/3, and R-precision with +similar or less computational complexity. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Ske2Grid: Skeleton-to-Grid Representation Learning for Action + Recognition ICML 2023 + + +
+ This paper presents Ske2Grid, a new representation learning framework for +improved skeleton-based action recognition. In Ske2Grid, we define a regular +convolution operation upon a novel grid representation of human skeleton, which +is a compact image-like grid patch constructed and learned through three novel +designs. Specifically, we propose a graph-node index transform (GIT) to +construct a regular grid patch through assigning the nodes in the skeleton +graph one by one to the desired grid cells. To ensure that GIT is a bijection +and enrich the expressiveness of the grid representation, an up-sampling +transform (UPT) is learned to interpolate the skeleton graph nodes for filling +the grid patch to the full. To resolve the problem when the one-step UPT is +aggressive and further exploit the representation capability of the grid patch +with increasing spatial size, a progressive learning strategy (PLS) is proposed +which decouples the UPT into multiple steps and aligns them to multiple paired +GITs through a compact cascaded design learned progressively. We construct +networks upon prevailing graph convolution networks and conduct experiments on +six mainstream skeleton-based action recognition datasets. Experiments show +that our Ske2Grid significantly outperforms existing GCN-based solutions under +different benchmark settings, without bells and whistles. Code and models are +available at https://github.com/OSVAI/Ske2Grid + +
+
+ comment: The paper of Ske2Grid is published at ICML 2023. Code and models are + available at https://github.com/OSVAI/Ske2Grid +
+
+
+
+
+ + ☆ Semi-Supervised Learning with Multiple Imputations on Non-Random Missing + Labels + + +
+ Semi-Supervised Learning (SSL) is implemented when algorithms are trained on +both labeled and unlabeled data. This is a very common application of ML as it +is unrealistic to obtain a fully labeled dataset. Researchers have tackled +three main issues: missing at random (MAR), missing completely at random +(MCAR), and missing not at random (MNAR). The MNAR problem is the most +challenging of the three as one cannot safely assume that all class +distributions are equal. Existing methods, including Class-Aware Imputation +(CAI) and Class-Aware Propensity (CAP), mostly overlook the non-randomness in +the unlabeled data. This paper proposes two new methods of combining multiple +imputation models to achieve higher accuracy and less bias. 1) We use multiple +imputation models, create confidence intervals, and apply a threshold to ignore +pseudo-labels with low confidence. 2) Our new method, SSL with De-biased +Imputations (SSL-DI), aims to reduce bias by filtering out inaccurate data and +finding a subset that is accurate and reliable. This subset of the larger +dataset could be imputed into another SSL model, which will be less biased. The +proposed models have been shown to be effective in both MCAR and MNAR +situations, and experimental results show that our methodology outperforms +existing methods in terms of classification accuracy and reducing bias. + +
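+ A toy sketch of idea (1) above: train several imputation models on resampled labeled data, form a simple confidence interval over their pseudo-label probabilities, and keep only unlabeled points whose interval is confidently on one side. The base model, the two-sigma interval, and the 0.8 cut-off are illustrative assumptions, not the paper's exact models or thresholds.
+
+ ```python
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.utils import resample
+
+ def confident_pseudo_labels(X_lab, y_lab, X_unlab, n_models=5, cutoff=0.8):
+     probs = []
+     for seed in range(n_models):
+         Xb, yb = resample(X_lab, y_lab, random_state=seed)   # one imputation model per resample
+         clf = LogisticRegression(max_iter=1000).fit(Xb, yb)
+         probs.append(clf.predict_proba(X_unlab)[:, 1])
+     probs = np.array(probs)                       # (n_models, n_unlabeled)
+     mean, std = probs.mean(axis=0), probs.std(axis=0)
+     lower, upper = mean - 2 * std, mean + 2 * std
+     keep = (lower > cutoff) | (upper < 1 - cutoff)  # confidently positive or negative
+     return X_unlab[keep], (mean[keep] > 0.5).astype(int)
+
+ if __name__ == "__main__":
+     rng = np.random.default_rng(0)
+     X_lab = rng.normal(size=(200, 5)); y_lab = (X_lab[:, 0] > 0).astype(int)
+     X_unlab = rng.normal(size=(1000, 5))
+     Xp, yp = confident_pseudo_labels(X_lab, y_lab, X_unlab)
+     print("kept", len(yp), "of", len(X_unlab), "unlabeled points")
+ ```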
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ A User-Centered Evaluation of Spanish Text Simplification + + +
+ We present an evaluation of text simplification (TS) in Spanish for a +production system, by means of two corpora focused in both complex-sentence and +complex-word identification. We compare the most prevalent Spanish-specific +readability scores with neural networks, and show that the latter are +consistently better at predicting user preferences regarding TS. As part of our +analysis, we find that multilingual models underperform against equivalent +Spanish-only models on the same task, yet all models focus too often on +spurious statistical features, such as sentence length. We release the corpora +in our evaluation to the broader community with the hopes of pushing forward +the state-of-the-art in Spanish natural language processing. + +
+
+ comment: Data at https://github.com/microsoft/BrevE-CLaro +
+
+
+
+
+ + ☆ Enhancing the Antidote: Improved Pointwise Certifications against + Poisoning Attacks + + +
+ Poisoning attacks can disproportionately influence model behaviour by making +small changes to the training corpus. While defences against specific poisoning +attacks do exist, they in general do not provide any guarantees, leaving them +potentially countered by novel attacks. In contrast, by examining worst-case +behaviours Certified Defences make it possible to provide guarantees of the +robustness of a sample against adversarial attacks modifying a finite number of +training samples, known as pointwise certification. We achieve this by +exploiting both Differential Privacy and the Sampled Gaussian Mechanism to +ensure the invariance of prediction for each testing instance against finite +numbers of poisoned examples. In doing so, our model provides guarantees of +adversarial robustness that are more than twice as large as those provided by +prior certifications. + +
+
+
+
+
+ + ☆ Domain Adaptation via Minimax Entropy for Real/Bogus Classification of + Astronomical Alerts + + +
+ Time domain astronomy is advancing towards the analysis of multiple massive +datasets in real time, prompting the development of multi-stream machine +learning models. In this work, we study Domain Adaptation (DA) for real/bogus +classification of astronomical alerts using four different datasets: HiTS, DES, +ATLAS, and ZTF. We study the domain shift between these datasets, and improve a +naive deep learning classification model by using a fine-tuning approach and +semi-supervised deep DA via Minimax Entropy (MME). We compare the balanced +accuracy of these models for different source-target scenarios. We find that +both the fine-tuning and MME models significantly improve on the base model with +as few as one labeled item per class coming from the target dataset, but that +MME does not compromise its performance on the source dataset. + +
+
+
+
+
+ + ☆ Projection-Free Methods for Stochastic Simple Bilevel Optimization with + Convex Lower-level Problem + + +
+ In this paper, we study a class of stochastic bilevel optimization problems, +also known as stochastic simple bilevel optimization, where we minimize a +smooth stochastic objective function over the optimal solution set of another +stochastic convex optimization problem. We introduce novel stochastic bilevel +optimization methods that locally approximate the solution set of the +lower-level problem via a stochastic cutting plane, and then run a conditional +gradient update with variance reduction techniques to control the error induced +by using stochastic gradients. For the case that the upper-level function is +convex, our method requires +$\tilde{\mathcal{O}}(\max\{1/\epsilon_f^{2},1/\epsilon_g^{2}\}) $ stochastic +oracle queries to obtain a solution that is $\epsilon_f$-optimal for the +upper-level and $\epsilon_g$-optimal for the lower-level. This guarantee +improves the previous best-known complexity of +$\mathcal{O}(\max\{1/\epsilon_f^{4},1/\epsilon_g^{4}\})$. Moreover, for the +case that the upper-level function is non-convex, our method requires at most +$\tilde{\mathcal{O}}(\max\{1/\epsilon_f^{3},1/\epsilon_g^{3}\}) $ stochastic +oracle queries to find an $(\epsilon_f, \epsilon_g)$-stationary point. In the +finite-sum setting, we show that the number of stochastic oracle calls required +by our method are $\tilde{\mathcal{O}}(\sqrt{n}/\epsilon)$ and +$\tilde{\mathcal{O}}(\sqrt{n}/\epsilon^{2})$ for the convex and non-convex +settings, respectively, where $\epsilon=\min \{\epsilon_f,\epsilon_g\}$. + +
+
+
+
+
+ + ☆ FeatGeNN: Improving Model Performance for Tabular Data with + Correlation-based Feature Extraction + + +
+ Automated Feature Engineering (AutoFE) has become an important task for any +machine learning project, as it can help improve model performance and gain +more information for statistical analysis. However, most current approaches for +AutoFE rely on manual feature creation or use methods that can generate a large +number of features, which can be computationally intensive and lead to +overfitting. To address these challenges, we propose a novel convolutional +method called FeatGeNN that extracts and creates new features using correlation +as a pooling function. Unlike traditional pooling functions like max-pooling, +correlation-based pooling considers the linear relationship between the +features in the data matrix, making it more suitable for tabular data. We +evaluate our method on various benchmark datasets and demonstrate that FeatGeNN +outperforms existing AutoFE approaches regarding model performance. Our results +suggest that correlation-based pooling can be a promising alternative to +max-pooling for AutoFE in tabular data applications. + +
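+ A NumPy sketch contrasting max-pooling with a correlation-based pooling over groups of tabular features, in the spirit of the description above. The specific rule used here (an average weighted by each feature's mean absolute correlation with the rest of its group) is an assumption for illustration, not FeatGeNN's exact operator.
+
+ ```python
+ import numpy as np
+
+ def max_pool(X, group_size):
+     """Classic max-pooling over consecutive groups of features."""
+     groups = [X[:, i:i + group_size] for i in range(0, X.shape[1], group_size)]
+     return np.column_stack([g.max(axis=1) for g in groups])
+
+ def corr_pool(X, group_size):
+     """Pool each feature group by a correlation-weighted average (assumed rule)."""
+     pooled = []
+     for i in range(0, X.shape[1], group_size):
+         g = X[:, i:i + group_size]
+         if g.shape[1] == 1:
+             pooled.append(g[:, 0]); continue
+         corr = np.abs(np.corrcoef(g, rowvar=False))
+         np.fill_diagonal(corr, 0.0)
+         weights = corr.mean(axis=1)
+         weights = weights / (weights.sum() + 1e-12)
+         pooled.append(g @ weights)
+     return np.column_stack(pooled)
+
+ if __name__ == "__main__":
+     X = np.random.default_rng(0).normal(size=(100, 8))
+     print(max_pool(X, 4).shape, corr_pool(X, 4).shape)  # (100, 2) (100, 2)
+ ```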
+
+
+
+
+ + ☆ Potential of Deep Operator Networks in Digital Twin-enabling Technology + for Nuclear System + + +
+ This research introduces the Deep Operator Network (DeepONet) as a robust +surrogate modeling method within the context of digital twin (DT) systems for +nuclear engineering. With the increasing importance of nuclear energy as a +carbon-neutral solution, adopting DT technology has become crucial to enhancing +operational efficiencies, safety, and predictive capabilities in nuclear +engineering applications. DeepONet exhibits remarkable prediction accuracy, +outperforming traditional ML methods. Through extensive benchmarking and +evaluation, this study showcases the scalability and computational efficiency +of DeepONet in solving a challenging particle transport problem. By taking +functions as input data and constructing the operator $G$ from training data, +DeepONet can handle diverse and complex scenarios effectively. However, the +application of DeepONet also reveals challenges related to optimal sensor +placement and model evaluation, critical aspects of real-world implementation. +Addressing these challenges will further enhance the method's practicality and +reliability. Overall, DeepONet presents a promising and transformative tool for +nuclear engineering research and applications. Its accurate prediction and +computational efficiency capabilities can revolutionize DT systems, advancing +nuclear engineering research. This study marks an important step towards +harnessing the power of surrogate modeling techniques in critical engineering +domains. + +
+
+
+
+
+ + ☆ Nonlinearity, Feedback and Uniform Consistency in Causal Structural + Learning + + +
+ The goal of Causal Discovery is to find automated search methods for learning +causal structures from observational data. In some cases, all variables of the +causal mechanism of interest are measured, and the task is to predict the +effects one measured variable has on another. In contrast, sometimes the +variables of primary interest are not directly observable but instead inferred +from their manifestations in the data. These are referred to as latent +variables. One commonly known example is the psychological construct of +intelligence, which cannot be directly measured, so researchers try to assess it +through various indicators such as IQ tests. In this case, causal discovery +algorithms can uncover underlying patterns and structures to reveal the causal +connections between the latent variables and between the latent and observed +variables. This thesis focuses on two questions in causal discovery: providing +an alternative definition of k-Triangle Faithfulness that (i) is weaker than +strong faithfulness when applied to the Gaussian family of distributions, (ii) +can be applied to non-Gaussian families of distributions, and (iii) under the +assumption that the modified version of Strong Faithfulness holds, can be used +to show the uniform consistency of a modified causal discovery algorithm; and +relaxing the sufficiency assumption to learn causal structures with latent +variables. Given the importance of inferring cause-and-effect relationships for +understanding and forecasting complex systems, the work in this thesis on +relaxing various simplifying assumptions is expected to extend causal +discovery methods to a wider range of causal +mechanisms and statistical phenomena. + +
+
+
+
+
+ + ☆ Distilling Knowledge from Resource Management Algorithms to Neural + Networks: A Unified Training Assistance Approach + + +
+ The optimization of the signal-to-interference-plus-noise ratio (SINR) in a +multi-user setting is a fundamental problem to which numerous methods have been +dedicated. Although traditional model-based optimization methods achieve strong +performance, their high complexity has motivated research into neural network (NN) +based approaches that trade off performance and complexity. To fully leverage +the high performance of traditional model-based methods and the low complexity +of NN-based methods, a knowledge distillation (KD) based algorithm +distillation (AD) method is proposed in this paper to improve the performance +and convergence speed of the NN-based method, where traditional SINR +optimization methods are employed as ``teachers'' to assist the training of NNs, +which are ``students'', thus enhancing the performance of unsupervised and +reinforcement learning techniques. This approach aims to alleviate common +issues encountered in each of these training paradigms, including the +infeasibility of obtaining optimal solutions as labels and overfitting in +supervised learning, ensuring higher convergence performance in unsupervised +learning, and improving training efficiency in reinforcement learning. +Simulation results demonstrate the enhanced performance of the proposed +AD-based methods compared to traditional learning methods. Remarkably, this +research paves the way for the integration of traditional optimization insights +and emerging NN techniques in wireless communication system optimization. + +
+
+
+
+
+ + ☆ Data Race Detection Using Large Language Models + + +
+ Large language models (LLMs) are demonstrating significant promise as an +alternate strategy to facilitate analyses and optimizations of high-performance +computing programs, circumventing the need for resource-intensive manual tool +creation. In this paper, we explore a novel LLM-based data race detection +approach combining prompt engineering and fine-tuning techniques. We create +a dedicated dataset named DRB-ML, which is derived from DataRaceBench, with +fine-grained labels showing the presence of data race pairs and their associated +variables, line numbers, and read/write information. DRB-ML is then used to +evaluate representative LLMs and fine-tune open-source ones. Our experiments +show that LLMs can be a viable approach to data race detection. However, they +still cannot compete with traditional data race detection tools when we need +detailed information about variable pairs causing data races. + +
+
+
+
+
+ + ☆ Decentralized Graph Neural Network for Privacy-Preserving Recommendation + + +
+ Building a graph neural network (GNN)-based recommender system without +violating user privacy proves challenging. Existing methods can be divided into +federated GNNs and decentralized GNNs. However, both approaches have undesirable +drawbacks, i.e., low communication efficiency and privacy leakage. This paper +proposes DGREC, a novel decentralized GNN for privacy-preserving +recommendations, where users can choose to publicize their interactions. It +includes three stages, i.e., graph construction, local gradient calculation, +and global gradient passing. The first stage builds a local inner-item +hypergraph for each user and a global inter-user graph. The second stage models +user preference and calculates gradients on each local device. The third stage +designs a local differential privacy mechanism named secure gradient-sharing, +which provably preserves the privacy of users' private data. We conduct +extensive experiments on three public datasets to validate the consistent +superiority of our framework. + +
+
+
+
+
+ + ☆ Freshness or Accuracy, Why Not Both? Addressing Delayed Feedback via + Dynamic Graph Neural Networks + + +
+ The delayed feedback problem is one of the most pressing challenges in +predicting the conversion rate since users' conversions are always delayed in +online commercial systems. Although new data are beneficial for continuous +training, without complete feedback information, i.e., conversion labels, +training algorithms may suffer from overwhelming fake negatives. Existing +methods tend to use multitask learning or design data pipelines to solve the +delayed feedback problem. However, these methods have a trade-off between data +freshness and label accuracy. In this paper, we propose Delayed Feedback +Modeling by Dynamic Graph Neural Network (DGDFEM). It includes three stages, +i.e., preparing a data pipeline, building a dynamic graph, and training a CVR +prediction model. In the model training, we propose a novel graph convolutional +method named HLGCN, which leverages both high-pass and low-pass filters to deal +with conversion and non-conversion relationships. The proposed method achieves +both data freshness and label accuracy. We conduct extensive experiments on +three industry datasets, which validate the consistent superiority of our +method. + +
+
+
+
+
+ + ☆ Max-affine regression via first-order methods + + +
+ We consider regression of a max-affine model that produces a piecewise linear +model by combining affine models via the max function. The max-affine model +ubiquitously arises in applications in signal processing and statistics +including multiclass classification, auction problems, and convex regression. +It also generalizes phase retrieval and learning rectified linear unit (ReLU) +activation functions. We present a non-asymptotic convergence analysis of +gradient descent (GD) and mini-batch stochastic gradient descent (SGD) for +max-affine regression when the model is observed at random locations satisfying +sub-Gaussianity and an anti-concentration property, with additive sub-Gaussian noise. +Under these assumptions, suitably initialized GD and SGD converge linearly to +a neighborhood of the ground truth specified by the corresponding error bound. +We provide numerical results that corroborate the theoretical findings. +Importantly, SGD not only converges faster in run time with fewer observations +than alternating minimization and GD in the noiseless scenario but also +outperforms them in low-sample scenarios with noise. + +
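+ A NumPy sketch of fitting a max-affine model $f(x) = \max_k (a_k^\top x + b_k)$ by (sub)gradient descent on squared loss: each sample's gradient flows only to the affine piece attaining the maximum. The step size, initialization, and synthetic data are illustrative and unrelated to the schedules analyzed above.
+
+ ```python
+ import numpy as np
+
+ def fit_max_affine(X, y, k=3, lr=0.05, epochs=500, seed=0):
+     rng = np.random.default_rng(seed)
+     n, d = X.shape
+     A = rng.normal(scale=0.1, size=(k, d))
+     b = np.zeros(k)
+     for _ in range(epochs):
+         scores = X @ A.T + b                 # (n, k)
+         active = scores.argmax(axis=1)       # which affine piece is maximal per sample
+         resid = scores[np.arange(n), active] - y
+         for j in range(k):
+             idx = active == j
+             if idx.any():
+                 A[j] -= lr * (resid[idx][:, None] * X[idx]).mean(axis=0)
+                 b[j] -= lr * resid[idx].mean()
+     return A, b
+
+ if __name__ == "__main__":
+     rng = np.random.default_rng(1)
+     X = rng.normal(size=(2000, 2))
+     A_true = np.array([[1.0, 0.0], [-1.0, 2.0], [0.5, -1.5]])
+     b_true = np.array([0.0, 1.0, -0.5])
+     y = (X @ A_true.T + b_true).max(axis=1) + 0.05 * rng.normal(size=2000)
+     A, b = fit_max_affine(X, y)
+     print("train MSE:", np.mean(((X @ A.T + b).max(axis=1) - y) ** 2))
+ ```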
+
+
+
+
+ + ☆ A Reinforcement Learning Approach for Performance-aware Reduction in + Power Consumption of Data Center Compute Nodes + + +
+ As Exascale computing becomes a reality, the energy needs of compute nodes in +cloud data centers will continue to grow. A common approach to reducing this +energy demand is to limit the power consumption of hardware components when +workloads are experiencing bottlenecks elsewhere in the system. However, +designing a resource controller capable of detecting and limiting power +consumption on-the-fly is a complex issue and can also adversely impact +application performance. In this paper, we explore the use of Reinforcement +Learning (RL) to design a power capping policy on cloud compute nodes using +observations on current power consumption and instantaneous application +performance (heartbeats). By leveraging the Argo Node Resource Management (NRM) +software stack in conjunction with the Intel Running Average Power Limit (RAPL) +hardware control mechanism, we design an agent to control the maximum supplied +power to processors without compromising on application performance. Employing +a Proximal Policy Optimization (PPO) agent to learn an optimal policy on a +mathematical model of the compute nodes, we demonstrate and evaluate using the +STREAM benchmark how a trained agent running on actual hardware can take +actions by balancing power consumption and application performance. + +
+
+ comment: This manuscript consists of a total of 10 pages with 8 figures and 3 + tables and is awaiting its publication at IC2E-2023 +
+
+
+
+
+ + ☆ The Costly Dilemma: Generalization, Evaluation and Cost-Optimal + Deployment of Large Language Models + + +
+ When deploying machine learning models in production for any +product/application, there are three properties that are commonly desired. +First, the models should be generalizable, in that we can extend them to further +use cases as our knowledge of the domain area develops. Second, they should be +evaluable, so that there are clear metrics for performance and the calculation +of those metrics in production settings is feasible. Finally, the deployment +should be cost-optimal as far as possible. In this paper we propose that these +three objectives (i.e. generalization, evaluation and cost-optimality) can +often be relatively orthogonal and that for large language models, despite +their performance over conventional NLP models, enterprises need to carefully +assess all three factors before making substantial investments in this +technology. We propose a framework for generalization, evaluation and +cost-modeling specifically tailored to large language models, offering insights +into the intricacies of development, deployment and management for these large +language models. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Robust Bayesian Tensor Factorization with Zero-Inflated Poisson Model + and Consensus Aggregation + + +
+ Tensor factorizations (TF) are powerful tools for the efficient +representation and analysis of multidimensional data. However, classic TF +methods based on maximum likelihood estimation underperform when applied to +zero-inflated count data, such as single-cell RNA sequencing (scRNA-seq) data. +Additionally, the stochasticity inherent in TFs results in factors that vary +across repeated runs, making interpretation and reproducibility of the results +challenging. In this paper, we introduce Zero Inflated Poisson Tensor +Factorization (ZIPTF), a novel approach for the factorization of +high-dimensional count data with excess zeros. To address the challenge of +stochasticity, we introduce Consensus Zero Inflated Poisson Tensor +Factorization (C-ZIPTF), which combines ZIPTF with a consensus-based +meta-analysis. We evaluate our proposed ZIPTF and C-ZIPTF on synthetic +zero-inflated count data and synthetic and real scRNA-seq data. ZIPTF +consistently outperforms baseline matrix and tensor factorization methods in +terms of reconstruction accuracy for zero-inflated data. When the probability +of excess zeros is high, ZIPTF achieves up to $2.4\times$ better accuracy. +Additionally, C-ZIPTF significantly improves the consistency and accuracy of +the factorization. When tested on both synthetic and real scRNA-seq data, ZIPTF +and C-ZIPTF consistently recover known and biologically meaningful gene +expression programs. + +
+
+
+
+
+ + ☆ Simple online learning with consistency oracle + + +
+ We consider online learning in the model where a learning algorithm can +access the class only via a consistency oracle -- an oracle that, at any +moment, can return a function from the class that agrees with all examples seen +so far. This model was recently considered by Assos et al. (COLT'23). It is +motivated by the fact that standard methods of online learning rely on +computing the Littlestone dimension of subclasses, a problem that is +computationally intractable. Assos et al. gave an online learning algorithm in +this model that makes at most $C^d$ mistakes on classes of Littlestone +dimension $d$, for some absolute unspecified constant $C > 0$. We give a novel +algorithm that makes at most $O(256^d)$ mistakes. Our proof is significantly +simpler and uses only very basic properties of the Littlestone dimension. We +also observe that there exists no algorithm in this model that makes at most +$2^{d+1}-2$ mistakes. We also observe that our algorithm (as well as the +algorithm of Assos et al.) solves an open problem by Hasrati and Ben-David +(ALT'23). Namely, it demonstrates that every class of finite Littlestone +dimension with a recursively enumerable representation admits a computable online +learner (that may be undefined on unrealizable samples). + +
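+ A toy Python sketch of the access model itself (not of the mistake-bounded algorithm above): the learner keeps the examples seen so far, asks the oracle for any function in the class consistent with them, and predicts with it. The brute-force oracle over a tiny explicit hypothesis class is purely for illustration.
+
+ ```python
+ def make_oracle(hypothesis_class):
+     """Consistency oracle: return any hypothesis agreeing with all examples so far."""
+     def oracle(examples):
+         for h in hypothesis_class:
+             if all(h[x] == y for x, y in examples):
+                 return h
+         return None
+     return oracle
+
+ def online_learn(stream, oracle):
+     seen, mistakes = [], 0
+     for x, y in stream:
+         h = oracle(seen)
+         if h is None or h[x] != y:   # prediction error (or unrealizable prefix)
+             mistakes += 1
+         seen.append((x, y))
+     return mistakes
+
+ if __name__ == "__main__":
+     domain = range(4)
+     # Class of threshold functions on {0, 1, 2, 3}.
+     hypothesis_class = [{x: int(x >= t) for x in domain} for t in range(5)]
+     stream = [(0, 0), (3, 1), (2, 1), (1, 0)]   # realizable by the threshold t = 2
+     print("mistakes:", online_learn(stream, make_oracle(hypothesis_class)))
+ ```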
+
+ comment: submitted to conference +
+
+
+
+
+ + ☆ Natural Evolution Strategies as a Black Box Estimator for Stochastic + Variational Inference + + +
+ Stochastic variational inference and its derivatives in the form of +variational autoencoders enjoy the ability to perform Bayesian inference on +large datasets in an efficient manner. However, performing inference with a VAE +requires a certain design choice (i.e. reparameterization trick) to allow +unbiased and low variance gradient estimation, restricting the types of models +that can be created. To overcome this challenge, an alternative estimator based +on natural evolution strategies is proposed. This estimator does not make +assumptions about the kind of distributions used, allowing for the creation of +models that would otherwise not have been possible under the VAE framework. + +
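+ A minimal NumPy sketch of a natural-evolution-strategies style gradient estimator for a black-box objective over variational parameters: perturb the parameters with Gaussian noise, weight the noise by the (standardized) objective values, and average. The toy quadratic objective stands in for an ELBO; nothing below relies on the reparameterization trick.
+
+ ```python
+ import numpy as np
+
+ def nes_gradient(objective, theta, sigma=0.1, n_samples=200, seed=0):
+     """Black-box ascent-direction estimate: E[f(theta + sigma*eps) * eps] / sigma."""
+     rng = np.random.default_rng(seed)
+     eps = rng.normal(size=(n_samples, theta.size))
+     values = np.array([objective(theta + sigma * e) for e in eps])
+     values = (values - values.mean()) / (values.std() + 1e-12)  # variance control
+     return (eps * values[:, None]).mean(axis=0) / sigma
+
+ if __name__ == "__main__":
+     # Toy objective: negative squared distance to a target "posterior mode".
+     target = np.array([1.0, -2.0])
+     objective = lambda th: -np.sum((th - target) ** 2)
+     theta = np.zeros(2)
+     for _ in range(200):
+         theta += 0.05 * nes_gradient(objective, theta)   # ascend the objective
+     print(theta)  # should approach [1, -2]
+ ```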
+
+
+
+
+ + ☆ Unbiased Decisions Reduce Regret: Adversarial Domain Adaptation for the + Bank Loan Problem + + +
+ In many real world settings binary classification decisions are made based on +limited data in near real-time, e.g. when assessing a loan application. We +focus on a class of these problems that share a common feature: the true label +is only observed when a data point is assigned a positive label by the +principal, e.g. we only find out whether an applicant defaults if we accepted +their loan application. As a consequence, the false rejections become +self-reinforcing and cause the labelled training set, that is being +continuously updated by the model decisions, to accumulate bias. Prior work +mitigates this effect by injecting optimism into the model, however this comes +at the cost of increased false acceptance rate. We introduce adversarial +optimism (AdOpt) to directly address bias in the training set using adversarial +domain adaptation. The goal of AdOpt is to learn an unbiased but informative +representation of past data, by reducing the distributional shift between the +set of accepted data points and all data points seen thus far. AdOpt +significantly exceeds state-of-the-art performance on a set of challenging +benchmark problems. Our experiments also provide initial evidence that the +introduction of adversarial domain adaptation improves fairness in this +setting. + +
+
+
+
+
+ + ☆ Regret Lower Bounds in Multi-agent Multi-armed Bandit + + +
+ The Multi-armed Bandit problem motivates methods with provable upper bounds on +regret, and the corresponding lower bounds have also been extensively studied in this +context. Recently, Multi-agent Multi-armed Bandit has gained significant +traction in various domains, where individual clients face bandit problems in a +distributed manner and the objective is the overall system performance, +typically measured by regret. While efficient algorithms with regret upper +bounds have emerged, limited attention has been given to the corresponding +regret lower bounds, except for a recent lower bound for adversarial settings, +which, however, has a gap with the best known upper bounds. To this end, we herein +provide the first comprehensive study on regret lower bounds across different +settings and establish their tightness. Specifically, when the graphs exhibit +good connectivity properties and the rewards are stochastically distributed, we +demonstrate a lower bound of order $O(\log T)$ for instance-dependent bounds +and $\sqrt{T}$ for mean-gap independent bounds, both of which are tight. Assuming +adversarial rewards, we establish a lower bound $O(T^{\frac{2}{3}})$ for +connected graphs, thereby bridging the gap between the lower and upper bound in +the prior work. We also show a linear regret lower bound when the graph is +disconnected. While previous works have explored these settings with upper +bounds, we provide a thorough study on tight lower bounds. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Classification of Data Generated by Gaussian Mixture Models Using Deep + ReLU Networks + + +
+ This paper studies the binary classification of unbounded data from ${\mathbb +R}^d$ generated under Gaussian Mixture Models (GMMs) using deep ReLU neural +networks. We obtain, for the first time, +non-asymptotic upper bounds and convergence rates of the excess risk (excess +misclassification error) for the classification without restrictions on model +parameters. The convergence rates we derive do not depend on dimension $d$, +demonstrating that deep ReLU networks can overcome the curse of dimensionality +in classification. While the majority of existing generalization analysis of +classification algorithms relies on a bounded domain, we consider an unbounded +domain by leveraging the analyticity and fast decay of Gaussian distributions. +To facilitate our analysis, we give a novel approximation error bound for +general analytic functions using ReLU networks, which may be of independent +interest. Gaussian distributions can be adopted nicely to model data arising in +applications, e.g., speeches, images, and texts; our results provide a +theoretical verification of the observed efficiency of deep neural networks in +practical classification problems. + +
+
+
+
+
+ + ☆ Planning to Learn: A Novel Algorithm for Active Learning during + Model-Based Planning + + +
+ Active Inference is a recent framework for modeling planning under +uncertainty. Empirical and theoretical work have now begun to evaluate the +strengths and weaknesses of this approach and how it might be improved. A +recent extension - the sophisticated inference (SI) algorithm - improves +performance on multi-step planning problems through recursive decision tree +search. However, little work to date has been done to compare SI to other +established planning algorithms. SI was also developed with a focus on +inference as opposed to learning. The present paper has two aims. First, we +compare performance of SI to Bayesian reinforcement learning (RL) schemes +designed to solve similar problems. Second, we present an extension of SI - +sophisticated learning (SL) - that more fully incorporates active learning +during planning. SL maintains beliefs about how model parameters would change +under the future observations expected under each policy. This allows a form of +counterfactual retrospective inference in which the agent considers what could +be learned from current or past observations given different future +observations. To accomplish these aims, we make use of a novel, biologically +inspired environment designed to highlight the problem structure for which SL +offers a unique solution. Here, an agent must continually search for available +(but changing) resources in the presence of competing affordances for +information gain. Our simulations show that SL outperforms all other algorithms +in this context - most notably, Bayes-adaptive RL and upper confidence bound +algorithms, which aim to solve multi-step planning problems using similar +principles (i.e., directed exploration and counterfactual reasoning). These +results provide added support for the utility of Active Inference in solving +this class of biologically-relevant problems and offer added tools for testing +hypotheses about human cognition. + +
+
+ comment: 31 pages, 5 figures +
+
+
+
+
+ + ☆ Potential Energy Advantage of Quantum Economy + + +
+ Energy cost is increasingly crucial in the modern computing industry with the +wide deployment of large-scale machine learning models and language models. For +the firms that provide computing services, low energy consumption is important +both from the perspective of their own market growth and the government's +regulations. In this paper, we study the energy benefits of quantum computing +vis-a-vis classical computing. Deviating from the conventional notion of +quantum advantage based solely on computational complexity, we redefine +advantage in an energy efficiency context. Through a Cournot competition model +constrained by energy usage, we demonstrate quantum computing firms can +outperform classical counterparts in both profitability and energy efficiency +at Nash equilibrium. Therefore quantum computing may represent a more +sustainable pathway for the computing industry. Moreover, we discover that the +energy benefits of quantum computing economies are contingent on large-scale +computation. Based on real physical parameters, we further illustrate the scale +of operation necessary for realizing this energy efficiency advantage. + +
+
+ comment: 23 pages, many figures +
+
+
+
+
+ + ☆ Active Inverse Learning in Stackelberg Trajectory Games + + +
+ Game-theoretic inverse learning is the problem of inferring the players' +objectives from their actions. We formulate an inverse learning problem in a +Stackelberg game between a leader and a follower, where each player's action is +the trajectory of a dynamical system. We propose an active inverse learning +method for the leader to infer which hypothesis among a finite set of +candidates describes the follower's objective function. Instead of using +passively observed trajectories like existing methods, the proposed method +actively maximizes the differences in the follower's trajectories under +different hypotheses to accelerate the leader's inference. We demonstrate the +proposed method in a receding-horizon repeated trajectory game. Compared with +uniformly random inputs, the leader inputs provided by the proposed method +accelerate the convergence of the probability of different hypotheses +conditioned on the follower's trajectory by orders of magnitude. + +
+
+
+
+
+ + ☆ GRINN: A Physics-Informed Neural Network for solving hydrodynamic + systems in the presence of self-gravity + + +
+ Modeling self-gravitating gas flows is essential to answering many +fundamental questions in astrophysics. This spans many topics including +planet-forming disks, star-forming clouds, galaxy formation, and the +development of large-scale structures in the Universe. However, the nonlinear +interaction between gravity and fluid dynamics offers a formidable challenge to +solving the resulting time-dependent partial differential equations (PDEs) in +three dimensions (3D). By leveraging the universal approximation capabilities +of a neural network within a mesh-free framework, physics informed neural +networks (PINNs) offer a new way of addressing this challenge. We introduce the +gravity-informed neural network (GRINN), a PINN-based code, to simulate 3D +self-gravitating hydrodynamic systems. Here, we specifically study +gravitational instability and wave propagation in an isothermal gas. Our +results match a linear analytic solution to within 1\% in the linear regime and +a conventional grid code solution to within 5\% as the disturbance grows into +the nonlinear regime. We find that the computation time of the GRINN does not +scale with the number of dimensions. This is in contrast to the scaling of the +grid-based code for the hydrodynamic and self-gravity calculations as the +number of dimensions is increased. Our results show that the GRINN computation +time is longer than the grid code in one- and two- dimensional calculations but +is an order of magnitude lesser than the grid code in 3D with similar accuracy. +Physics-informed neural networks like GRINN thus show promise for advancing our +ability to model 3D astrophysical flows. + +
+
+
+
+
+ + ☆ BI-LAVA: Biocuration with Hierarchical Image Labeling through Active + Learning and Visual Analysis + + +
+ In the biomedical domain, taxonomies organize the acquisition modalities of +scientific images in hierarchical structures. Such taxonomies leverage large +sets of correct image labels and provide essential information about the +importance of a scientific publication, which could then be used in biocuration +tasks. However, the hierarchical nature of the labels, the overhead of +processing images, the absence or incompleteness of labeled data, and the +expertise required to label this type of data impede the creation of useful +datasets for biocuration. From a multi-year collaboration with biocurators and +text-mining researchers, we derive an iterative visual analytics and active +learning strategy to address these challenges. We implement this strategy in a +system called BI-LAVA Biocuration with Hierarchical Image Labeling through +Active Learning and Visual Analysis. BI-LAVA leverages a small set of image +labels, a hierarchical set of image classifiers, and active learning to help +model builders deal with incomplete ground-truth labels, target a hierarchical +taxonomy of image modalities, and classify a large pool of unlabeled images. +BI-LAVA's front end uses custom encodings to represent data distributions, +taxonomies, image projections, and neighborhoods of image thumbnails, which +help model builders explore an unfamiliar image dataset and taxonomy and +correct and generate labels. An evaluation with machine learning practitioners +shows that our mixed human-machine approach successfully supports domain +experts in understanding the characteristics of classes within the taxonomy, as +well as validating and improving data quality in labeled and unlabeled +collections. + +
+
+ comment: 15 pages, 6 figures +
+
+
+
+
+ + ☆ Monte Carlo guided Diffusion for Bayesian linear inverse problems + + +
+ Ill-posed linear inverse problems that combine knowledge of the forward +measurement model with prior models arise frequently in various applications, +from computational photography to medical imaging. Recent research has focused +on solving these problems with score-based generative models (SGMs) that +produce perceptually plausible images, especially in inpainting problems. In +this study, we exploit the particular structure of the prior defined in the SGM +to formulate recovery in a Bayesian framework as a Feynman--Kac model adapted +from the forward diffusion model used to construct score-based diffusion. To +solve this Feynman--Kac problem, we propose the use of Sequential Monte Carlo +methods. The proposed algorithm, MCGdiff, is shown to be theoretically grounded +and we provide numerical simulations showing that it outperforms competing +baselines when dealing with ill-posed inverse problems. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ An Adaptive Approach for Probabilistic Wind Power Forecasting Based on + Meta-Learning + + +
+ This paper studies an adaptive approach for probabilistic wind power +forecasting (WPF) including offline and online learning procedures. In the +offline learning stage, a base forecast model is trained via inner and outer +loop updates of meta-learning, which endows the base forecast model with +excellent adaptability to different forecast tasks, i.e., probabilistic WPF +with different lead times or locations. In the online learning stage, the base +forecast model is applied to online forecasting combined with incremental +learning techniques. On this basis, the online forecast takes full advantage of +recent information and the adaptability of the base forecast model. Two +applications are developed based on our proposed approach concerning +forecasting with different lead times (temporal adaptation) and forecasting for +newly established wind farms (spatial adaptation), respectively. Numerical +tests were conducted on real-world wind power data sets. Simulation results +validate the advantages in adaptivity of the proposed methods compared with +existing alternatives. + +
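+
+ A rough illustration of the inner/outer-loop structure described above: the Python sketch below meta-trains a linear quantile forecaster across synthetic tasks and then adapts it to a new task with a few gradient steps. It uses a Reptile-style first-order meta-update on a pinball loss; the function names and toy data are illustrative assumptions, not the paper's model or training scheme.

```python
import numpy as np

rng = np.random.default_rng(0)

def pinball_grad(w, X, y, q):
    """Gradient of the pinball (quantile) loss for a linear forecaster."""
    err = y - X @ w
    dpred = np.where(err > 0, -q, 1.0 - q)          # d loss / d prediction
    return X.T @ dpred / len(y)

def inner_update(w, task, q=0.5, lr=0.05, steps=5):
    """Task-specific adaptation (inner loop)."""
    X, y = task
    for _ in range(steps):
        w = w - lr * pinball_grad(w, X, y, q)
    return w

def meta_train(tasks, dim, meta_lr=0.1, epochs=200):
    """Outer loop: Reptile-style meta-update toward task-adapted weights."""
    w_meta = np.zeros(dim)
    for _ in range(epochs):
        task = tasks[rng.integers(len(tasks))]
        w_task = inner_update(w_meta, task)
        w_meta = w_meta + meta_lr * (w_task - w_meta)
    return w_meta

# Synthetic "forecast tasks": same features, different response scales.
tasks = []
for scale in (0.5, 1.0, 2.0):
    X = rng.normal(size=(200, 3))
    tasks.append((X, scale * X[:, 0] + 0.1 * rng.normal(size=200)))

w_meta = meta_train(tasks, dim=3)
# Online stage: quickly adapt the meta-initialization to a new task.
w_adapted = inner_update(w_meta, tasks[0], steps=3)
print(w_meta, w_adapted)
```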
+
+
+
+
+ + ☆ MultiSChuBERT: Effective Multimodal Fusion for Scholarly Document + Quality Prediction + + +
+ Automatic assessment of the quality of scholarly documents is a difficult +task with high potential impact. Multimodality, in particular the addition of +visual information next to text, has been shown to improve the performance on +scholarly document quality prediction (SDQP) tasks. We propose the multimodal +predictive model MultiSChuBERT. It combines a textual model based on chunking +full paper text and aggregating computed BERT chunk-encodings (SChuBERT), with +a visual model based on Inception V3. Our work contributes to the current +state-of-the-art in SDQP in three ways. First, we show that the method of +combining visual and textual embeddings can substantially influence the +results. Second, we demonstrate that gradual unfreezing of the weights of the +visual sub-model reduces its tendency to overfit the data, improving results. +Third, we show the retained benefit of multimodality when replacing standard +BERT$_{\textrm{BASE}}$ embeddings with more recent state-of-the-art text +embedding models. + Using BERT$_{\textrm{BASE}}$ embeddings, on the (log) number of citations +prediction task with the ACL-BiblioMetry dataset, our MultiSChuBERT +(text+visual) model obtains an $R^{2}$ score of 0.454 compared to 0.432 for the +SChuBERT (text only) model. Similar improvements are obtained on the PeerRead +accept/reject prediction task. In our experiments using SciBERT, SciNCL, +SPECTER and SPECTER2.0 embeddings, we show that each of these tailored +embeddings adds further improvements over the standard BERT$_{\textrm{BASE}}$ +embeddings, with the SPECTER2.0 embeddings performing best. + +
+
+
+
+
+ + ♻ ☆ Data augmentation and refinement for recommender system: A + semi-supervised approach using maximum margin matrix factorization + + +
+ Collaborative filtering (CF) has become a popular method for developing +recommender systems (RSs) where ratings of a user for new items are predicted +based on her past preferences and available preference information of other +users. Despite the popularity of CF-based methods, their performance is often +greatly limited by the sparsity of observed entries. In this study, we explore +the data augmentation and refinement aspects of Maximum Margin Matrix +Factorization (MMMF), a widely accepted CF technique for rating predictions, +which has not been investigated before. We exploit the inherent characteristics +of CF algorithms to assess the confidence level of individual ratings and +propose a semi-supervised approach for rating augmentation based on +self-training. We hypothesize that any CF algorithm's predictions with low +confidence are due to some deficiency in the training data and hence, the +performance of the algorithm can be improved by adopting a systematic data +augmentation strategy. We iteratively use some of the ratings predicted with +high confidence to augment the training data and remove low-confidence entries +through a refinement process. By repeating this process, the system learns to +improve prediction accuracy. Our method is experimentally evaluated on several +state-of-the-art CF algorithms and leads to informative rating augmentation, +improving the performance of the baseline approaches. + +
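+
+ A minimal sketch of the self-training loop described above, assuming plain SGD matrix factorization as a stand-in for MMMF and a simple confidence proxy (distance of a prediction from the nearest discrete rating level) in place of the paper's confidence measure: fit, add high-confidence predictions to the training set, and refit.

```python
import numpy as np

def sgd_mf(R, mask, k=8, lr=0.01, reg=0.1, epochs=30, seed=0):
    """Plain SGD matrix factorization; stands in for MMMF in this sketch."""
    rng = np.random.default_rng(seed)
    n, m = R.shape
    U, V = rng.normal(0, 0.1, (n, k)), rng.normal(0, 0.1, (m, k))
    rows, cols = np.nonzero(mask)
    for _ in range(epochs):
        for i, j in zip(rows, cols):
            e = R[i, j] - U[i] @ V[j]
            U[i], V[j] = U[i] + lr * (e * V[j] - reg * U[i]), V[j] + lr * (e * U[i] - reg * V[j])
    return U, V

def self_train(R, mask, rounds=3, conf_thresh=0.2):
    """Iteratively augment the observed ratings with high-confidence predictions."""
    R_aug, mask_aug = R.copy(), mask.copy()
    for _ in range(rounds):
        U, V = sgd_mf(R_aug, mask_aug)
        pred = U @ V.T
        # Confidence proxy: closeness of a prediction to a discrete rating level.
        conf = np.abs(pred - np.rint(pred))
        new = (~mask_aug.astype(bool)) & (conf < conf_thresh)
        R_aug[new], mask_aug[new] = np.rint(pred[new]), 1
    return R_aug, mask_aug

rng = np.random.default_rng(1)
R_true = rng.integers(1, 6, size=(30, 20)).astype(float)
mask = (rng.random((30, 20)) < 0.3).astype(int)     # ~30% of ratings observed
R_aug, mask_aug = self_train(R_true * mask, mask)
print(mask.sum(), "->", mask_aug.sum(), "training ratings")
```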
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Tirtha -- An Automated Platform to Crowdsource Images and Create 3D + Models of Heritage Sites + + +
+ Digital preservation of Cultural Heritage (CH) sites is crucial to protect +them against damage from natural disasters or human activities. Creating 3D +models of CH sites has become a popular method of digital preservation thanks +to advancements in computer vision and photogrammetry. However, the process is +time-consuming, expensive, and typically requires specialized equipment and +expertise, posing challenges in resource-limited developing countries. +Additionally, the lack of an open repository for 3D models hinders research and +public engagement with their heritage. To address these issues, we propose +Tirtha, a web platform for crowdsourcing images of CH sites and creating their +3D models. Tirtha utilizes state-of-the-art Structure from Motion (SfM) and +Multi-View Stereo (MVS) techniques. It is modular, extensible and +cost-effective, allowing for the incorporation of new techniques as +photogrammetry advances. Tirtha is accessible through a web interface at +https://tirtha.niser.ac.in and can be deployed on-premise or in a cloud +environment. In our case studies, we demonstrate the pipeline's effectiveness +by creating 3D models of temples in Odisha, India, using crowdsourced images. +These models are available for viewing, interaction, and download on the Tirtha +website. Our work aims to provide a dataset of crowdsourced images and 3D +reconstructions for research in computer vision, heritage conservation, and +related domains. Overall, Tirtha is a step towards democratizing digital +preservation, primarily in resource-limited developing countries. + +
+
+ comment: Accepted at The 28th International ACM Conference on 3D Web + Technology (Web3D 2023) +
+
+
+
+
+ + ♻ ☆ Subset-Based Instance Optimality in Private Estimation + + +
+ We propose a new definition of instance optimality for differentially private +estimation algorithms. Our definition requires an optimal algorithm to compete, +simultaneously for every dataset $D$, with the best private benchmark algorithm +that (a) knows $D$ in advance and (b) is evaluated by its worst-case +performance on large subsets of $D$. That is, the benchmark algorithm need not +perform well when potentially extreme points are added to $D$; it only has to +handle the removal of a small number of real data points that already exist. +This makes our benchmark significantly stronger than those proposed in prior +work. We nevertheless show, for real-valued datasets, how to construct private +algorithms that achieve our notion of instance optimality when estimating a +broad class of dataset properties, including means, quantiles, and +$\ell_p$-norm minimizers. For means in particular, we provide a detailed +analysis and show that our algorithm simultaneously matches or exceeds the +asymptotic performance of existing algorithms under a range of distributional +assumptions. + +
+
+
+
+
+ + ♻ ☆ Whose Emotion Matters? Speaking Activity Localisation without Prior + Knowledge + + +
+ The task of emotion recognition in conversations (ERC) benefits from the +availability of multiple modalities, as provided, for example, in the +video-based Multimodal EmotionLines Dataset (MELD). However, only a few +research approaches use both acoustic and visual information from the MELD +videos. There are two reasons for this: First, label-to-video alignments in +MELD are noisy, making those videos an unreliable source of emotional speech +data. Second, conversations can involve several people in the same scene, which +requires the localisation of the utterance source. In this paper, we introduce +MELD with Fixed Audiovisual Information via Realignment (MELD-FAIR). By using +recent active speaker detection and automatic speech recognition models, we are +able to realign the videos of MELD and capture the facial expressions from +speakers in 96.92% of the utterances provided in MELD. Experiments with a +self-supervised voice recognition model indicate that the realigned MELD-FAIR +videos more closely match the transcribed utterances given in the MELD dataset. +Finally, we devise a model for emotion recognition in conversations trained on +the realigned MELD-FAIR videos, which outperforms state-of-the-art models for +ERC based on vision alone. This indicates that localising the source of +speaking activities is indeed effective for extracting facial expressions from +the uttering speakers and that faces provide more informative visual cues than +the visual features state-of-the-art models have been using so far. The +MELD-FAIR realignment data, and the code of the realignment procedure and of +the emotion recognition, are available at +https://github.com/knowledgetechnologyuhh/MELD-FAIR. + +
+
+ comment: 17 pages, 8 figures, 7 tables, Published in Neurocomputing +
+
+
+
+
+ + ♻ ☆ Undersampling and Cumulative Class Re-decision Methods to Improve + Detection of Agitation in People with Dementia + + +
+ Agitation is one of the most prevalent symptoms in people with dementia (PwD) +and can place their own safety and that of their caregivers at risk. Developing +objective agitation detection approaches is important to support the health and +safety of PwD living in a residential setting. In a previous study, we +collected multimodal wearable sensor data from 17 participants for 600 days and +developed machine learning models for detecting agitation in one-minute +windows. However, there are significant limitations in the dataset, such as the +class-imbalance problem and potentially imprecise labels, as the occurrence of +agitation is much rarer in comparison to the normal behaviours. In this paper, +we first implemented different undersampling methods to address the imbalance +problem, and came to the conclusion that only 20% of normal behaviour data were +adequate to train a competitive agitation detection model. Then, we designed a +weighted undersampling method to evaluate the manual labeling mechanism given +the ambiguous time interval assumption. After that, the postprocessing method of +cumulative class re-decision (CCR) was proposed based on the historical +sequential information and continuity characteristic of agitation, improving +the decision-making performance for the potential application of an agitation +detection system. The results showed that a combination of undersampling and +CCR improved the F1-score and other metrics to varying degrees with less +training time and data. + +
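+
+ The sketch below illustrates the two ingredients in Python: undersampling the majority (normal-behaviour) class and a cumulative re-decision step that revises each one-minute decision from the recent history of scores. The specific rule used here (mean score over a sliding window) is only one plausible reading of CCR, not the paper's exact mechanism.

```python
import numpy as np

def undersample(X, y, keep_frac=0.2, seed=0):
    """Keep all positive (agitation) windows and a fraction of negative ones."""
    rng = np.random.default_rng(seed)
    pos, neg = np.where(y == 1)[0], np.where(y == 0)[0]
    neg_keep = rng.choice(neg, size=int(keep_frac * len(neg)), replace=False)
    idx = np.sort(np.concatenate([pos, neg_keep]))
    return X[idx], y[idx]

def cumulative_redecision(probs, window=5, threshold=0.5):
    """Re-decide each window's label from the cumulative evidence in recent windows."""
    probs = np.asarray(probs, dtype=float)
    out = np.zeros(len(probs), dtype=int)
    for t in range(len(probs)):
        hist = probs[max(0, t - window + 1): t + 1]
        out[t] = int(hist.mean() >= threshold)
    return out

# Isolated spikes are damped; sustained evidence carries the decision.
print(cumulative_redecision([0.1, 0.7, 0.1, 0.1, 0.8, 0.9, 0.7, 0.1], window=3))
```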
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Stack More Layers Differently: High-Rank Training Through Low-Rank + Updates + + +
+ Despite the dominance and effectiveness of scaling, resulting in large +networks with hundreds of billions of parameters, the necessity to train +overparametrized models remains poorly understood, and alternative approaches +do not necessarily make it cheaper to train high-performance models. In this +paper, we explore low-rank training techniques as an alternative approach to +training large neural networks. We introduce a novel method called ReLoRA, +which utilizes low-rank updates to train high-rank networks. We apply ReLoRA to +pre-training transformer language models with up to 350M parameters and +demonstrate comparable performance to regular neural network training. +Furthermore, we observe that the efficiency of ReLoRA increases with model +size, making it a promising approach for training multi-billion-parameter +networks efficiently. Our findings shed light on the potential of low-rank +training techniques and their implications for scaling laws. + +
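+
+ A schematic of the low-rank-update idea in PyTorch: a frozen base weight plus a trainable LoRA-style factorized update that is periodically merged into the base weight and re-initialized, so the accumulated update can exceed the adapter's rank. Class and method names are illustrative; ReLoRA additionally manages optimizer state and the learning-rate schedule around each reset, which is omitted here.

```python
import torch
import torch.nn as nn

class ReLoRALinear(nn.Module):
    """Frozen base weight plus a trainable low-rank (LoRA-style) update."""
    def __init__(self, d_in, d_out, rank=4):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(d_out, d_in) * 0.02, requires_grad=False)
        self.A = nn.Parameter(torch.randn(rank, d_in) * 0.01)
        self.B = nn.Parameter(torch.zeros(d_out, rank))

    def forward(self, x):
        return x @ (self.weight + self.B @ self.A).T

    def merge_and_restart(self):
        """Fold the low-rank update into the base weight, then re-initialize it.
        Repeating this lets the accumulated update become high-rank."""
        with torch.no_grad():
            self.weight += self.B @ self.A
            self.A.normal_(std=0.01)
            self.B.zero_()

layer = ReLoRALinear(16, 16)
opt = torch.optim.Adam([layer.A, layer.B], lr=1e-3)
x, y = torch.randn(64, 16), torch.randn(64, 16)
for step in range(1, 301):
    loss = ((layer(x) - y) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()
    if step % 100 == 0:        # periodic merge-and-restart
        layer.merge_and_restart()
```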
+
+
+
+
+ + ♻ ☆ Policy Regularization with Dataset Constraint for Offline Reinforcement + Learning ICML 2023 + + +
+ We consider the problem of learning the best possible policy from a fixed +dataset, known as offline Reinforcement Learning (RL). A common taxonomy of +existing offline RL works is policy regularization, which typically constrains +the learned policy by distribution or support of the behavior policy. However, +distribution and support constraints are overly conservative since they both +force the policy to choose similar actions as the behavior policy when +considering particular states. It will limit the learned policy's performance, +especially when the behavior policy is sub-optimal. In this paper, we find that +regularizing the policy towards the nearest state-action pair can be more +effective and thus propose Policy Regularization with Dataset Constraint +(PRDC). When updating the policy in a given state, PRDC searches the entire +dataset for the nearest state-action sample and then restricts the policy with +the action of this sample. Unlike previous works, PRDC can guide the policy +with proper behaviors from the dataset, allowing it to choose actions that do +not appear in the dataset along with the given state. It is a softer constraint +but still keeps enough conservatism from out-of-distribution actions. Empirical +evidence and theoretical analysis show that PRDC can alleviate offline RL's +fundamentally challenging value overestimation issue with a bounded performance +gap. Moreover, on a set of locomotion and navigation tasks, PRDC achieves +state-of-the-art performance compared with existing methods. Code is available +at https://github.com/LAMDA-RL/PRDC + +
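+
+ The dataset-constraint idea can be sketched as follows: for each policy action, find the nearest (state, action) pair in the offline dataset and penalize the distance to that pair's action. This numpy version uses brute-force nearest-neighbour search and an assumed state-weighting factor beta; a full PRDC implementation would differentiate this term inside the actor loss and use a faster search structure.

```python
import numpy as np

def prdc_regularizer(states, policy_actions, dataset_s, dataset_a, beta=2.0):
    """Dataset-constraint penalty: pull each policy action toward the action of
    the nearest (state, action) pair in the offline dataset."""
    keys = np.concatenate([beta * dataset_s, dataset_a], axis=1)        # (N, ds+da)
    queries = np.concatenate([beta * states, policy_actions], axis=1)   # (B, ds+da)
    d2 = ((queries[:, None, :] - keys[None, :, :]) ** 2).sum(-1)        # (B, N)
    nearest_a = dataset_a[d2.argmin(axis=1)]                            # (B, da)
    return ((policy_actions - nearest_a) ** 2).sum(axis=1).mean()

# Toy check: the penalty vanishes when the policy proposes actions that some
# dataset transition with a similar state actually took.
rng = np.random.default_rng(0)
ds_s, ds_a = rng.normal(size=(500, 4)), rng.normal(size=(500, 2))
s = ds_s[:8]
print(prdc_regularizer(s, ds_a[:8], ds_s, ds_a))         # exactly 0.0
print(prdc_regularizer(s, ds_a[:8] + 1.0, ds_s, ds_a))   # positive penalty
```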
+
+ comment: Accepted to ICML 2023 +
+
+
+
+
+ + ♻ ☆ A Framework For Refining Text Classification and Object Recognition from + Academic Articles + + +
+ With the widespread use of the internet, it has become increasingly crucial +to extract specific information from vast amounts of academic articles +efficiently. Data mining techniques are generally employed to solve this issue. +However, data mining for academic articles is challenging since it requires +automatically extracting specific patterns in complex and unstructured layout +documents. Current data mining methods for academic articles employ +rule-based (RB) or machine learning (ML) approaches. However, using rule-based +methods incurs a high coding cost for articles with complex typesetting. On the +other hand, simply using machine learning methods requires annotation work for +complex content types within the paper, which can be costly. Furthermore, only +using machine learning can lead to cases where patterns easily recognized by +rule-based methods are mistakenly extracted. To overcome these issues, from the +perspective of analyzing the standard layout and typesetting used in the +specified publication, we emphasize implementing specific methods for specific +characteristics in academic articles. We have developed a novel Text Block +Refinement Framework (TBRF), a hybrid machine learning and rule-based scheme. +We used the well-known ACL proceedings articles as experimental data for the +validation experiment. The experiment shows that our approach achieved over 95% +classification accuracy and 90% detection accuracy for tables and figures. + +
+
+ comment: This paper has been accepted at 'The International Symposium on + Innovations in Intelligent Systems and Applications 2023 (INISTA 2023)' +
+
+
+
+
+ + ♻ ☆ A Recipe for Well-behaved Graph Neural Approximations of Complex + Dynamics + + +
+ Data-driven approximations of ordinary differential equations offer a +promising alternative to classical methods in discovering a dynamical system +model, particularly in complex systems lacking explicit first principles. This +paper focuses on a complex system whose dynamics is described with a system of +ordinary differential equations, coupled via a network adjacency matrix. +Numerous real-world systems, including financial, social, and neural systems, +belong to this class of dynamical models. We propose essential elements for +approximating such dynamical systems using neural networks, including necessary +biases and an appropriate neural architecture. Emphasizing the differences from +static supervised learning, we advocate for evaluating generalization beyond +classical assumptions of statistical learning theory. To estimate confidence in +prediction during inference time, we introduce a dedicated null model. By +studying various complex network dynamics, we demonstrate the neural network's +ability to approximate various dynamics, generalize across complex network +structures, sizes, and statistical properties of inputs. Our comprehensive +framework enables deep learning approximations of high-dimensional, +non-linearly coupled complex dynamical systems. + +
+
+
+
+
+ + ♻ ☆ Why Batch Normalization Damage Federated Learning on Non-IID Data? + + +
+ As a promising distributed learning paradigm, federated learning (FL) +involves training deep neural network (DNN) models at the network edge while +protecting the privacy of the edge clients. To train a large-scale DNN model, +batch normalization (BN) has been regarded as a simple and effective means to +accelerate the training and improve the generalization capability. However, +recent findings indicate that BN can significantly impair the performance of FL +in the presence of non-i.i.d. data. While several FL algorithms have been +proposed to address this issue, their performance still falls significantly short +of the centralized scheme. Furthermore, none of them have +provided a theoretical explanation of how BN damages FL convergence. In +this paper, we present the first convergence analysis to show that under +non-i.i.d. data, the mismatch between the local and global statistical +parameters in BN causes gradient deviation between the local and global +models, which, as a result, slows down and biases the FL convergence. In view +of this, we develop a new FL algorithm that is tailored to BN, called FedTAN, +which is capable of achieving robust FL performance under a variety of data +distributions via iterative layer-wise parameter aggregation. Comprehensive +experimental results demonstrate the superiority of the proposed FedTAN over +existing baselines for training BN-based DNN models. + +
+
+
+
+
+ + ♻ ☆ Source-free Domain Adaptive Human Pose Estimation ICCV 2023 + + +
+ Human Pose Estimation (HPE) is widely used in various fields, including +motion analysis, healthcare, and virtual reality. However, the great expenses +of labeled real-world datasets present a significant challenge for HPE. To +overcome this, one approach is to train HPE models on synthetic datasets and +then perform domain adaptation (DA) on real-world data. Unfortunately, existing +DA methods for HPE neglect data privacy and security by using both source and +target data in the adaptation process. To this end, we propose a new task, +named source-free domain adaptive HPE, which aims to address the challenges of +cross-domain learning of HPE without access to source data during the +adaptation process. We further propose a novel framework that consists of three +models: source model, intermediate model, and target model, which explores the +task from both source-protect and target-relevant perspectives. The +source-protect module preserves source information more effectively while +resisting noise, and the target-relevant module reduces the sparsity of spatial +representations by building a novel spatial probability space, and +pose-specific contrastive learning and information maximization are proposed on +the basis of this space. Comprehensive experiments on several domain adaptive +HPE benchmarks show that the proposed method outperforms existing approaches by +a considerable margin. The codes are available at +https://github.com/davidpengucf/SFDAHPE. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ ANTONIO: Towards a Systematic Method of Generating NLP Benchmarks for + Verification + + +
+ Verification of machine learning models used in Natural Language Processing +(NLP) is known to be a hard problem. In particular, many known neural network +verification methods that work for computer vision and other numeric datasets +do not work for NLP. Here, we study technical reasons that underlie this +problem. Based on this analysis, we propose practical methods and heuristics +for preparing NLP datasets and models in a way that renders them amenable to +known verification methods based on abstract interpretation. We implement these +methods as a Python library called ANTONIO that links to the neural network +verifiers ERAN and Marabou. We perform evaluation of the tool using an NLP +dataset R-U-A-Robot suggested as a benchmark for verifying legally critical NLP +applications. We hope that, thanks to its general applicability, this work will +open novel possibilities for including NLP verification problems into neural +network verification competitions, and will popularise NLP problems within this +community. + +
+
+ comment: To appear in proceedings of 6th Workshop on Formal Methods for + ML-Enabled Autonomous Systems (Affiliated with CAV 2023) +
+
+
+
+
+ + ♻ ☆ Mixed Regression via Approximate Message Passing AISTATS 2023 + + +
+ We study the problem of regression in a generalized linear model (GLM) with +multiple signals and latent variables. This model, which we call a matrix GLM, +covers many widely studied problems in statistical learning, including mixed +linear regression, max-affine regression, and mixture-of-experts. In mixed +linear regression, each observation comes from one of $L$ signal vectors +(regressors), but we do not know which one; in max-affine regression, each +observation comes from the maximum of $L$ affine functions, each defined via a +different signal vector. The goal in all these problems is to estimate the +signals, and possibly some of the latent variables, from the observations. We +propose a novel approximate message passing (AMP) algorithm for estimation in a +matrix GLM and rigorously characterize its performance in the high-dimensional +limit. This characterization is in terms of a state evolution recursion, which +allows us to precisely compute performance measures such as the asymptotic +mean-squared error. The state evolution characterization can be used to tailor +the AMP algorithm to take advantage of any structural information known about +the signals. Using state evolution, we derive an optimal choice of AMP +`denoising' functions that minimizes the estimation error in each iteration. + The theoretical results are validated by numerical simulations for mixed +linear regression, max-affine regression, and mixture-of-experts. For +max-affine regression, we propose an algorithm that combines AMP with +expectation-maximization to estimate intercepts of the model along with the +signals. The numerical results show that AMP significantly outperforms other +estimators for mixed linear regression and max-affine regression in most +parameter regimes. + +
+
+ comment: 44 pages. To appear in the Journal of Machine Learning Research. A + shorter version of this paper appeared in the proceedings of AISTATS 2023 +
+
+
+
+
+ + ♻ ☆ FedICT: Federated Multi-task Distillation for Multi-access Edge + Computing + + +
+ The growing interest in intelligent services and privacy protection for +mobile devices has given rise to the widespread application of federated +learning in Multi-access Edge Computing (MEC). Diverse user behaviors call for +personalized services with heterogeneous Machine Learning (ML) models on +different devices. Federated Multi-task Learning (FMTL) is proposed to train +related but personalized ML models for different devices, whereas previous +works suffer from excessive communication overhead during training and neglect +the model heterogeneity among devices in MEC. Introducing knowledge +distillation into FMTL can simultaneously enable efficient communication and +model heterogeneity among clients, whereas existing methods rely on a public +dataset, which is impractical in reality. To tackle this dilemma, Federated +MultI-task Distillation for Multi-access Edge CompuTing (FedICT) is proposed. +FedICT direct local-global knowledge aloof during bi-directional distillation +processes between clients and the server, aiming to enable multi-task clients +while alleviating client drift derived from divergent optimization directions +of client-side local models. Specifically, FedICT includes Federated Prior +Knowledge Distillation (FPKD) and Local Knowledge Adjustment (LKA). FPKD is +proposed to reinforce the clients' fitting of local data by introducing prior +knowledge of local data distributions. Moreover, LKA is proposed to correct the +distillation loss of the server, making the transferred local knowledge better +match the generalized representation. Experiments on three datasets show that +FedICT significantly outperforms all compared benchmarks in various data +heterogeneous and model architecture settings, achieving improved accuracy with +less than 1.2% training communication overhead compared with FedAvg and no more +than 75% training communication round compared with FedGKT. + +
+
+ comment: Accepted by IEEE TRANSACTIONS ON PARALLEL AND DISTRIBUTED SYSTEMS +
+
+
+
+
+ + ♻ ☆ Rigorous dynamical mean field theory for stochastic gradient descent + methods + + +
+ We prove closed-form equations for the exact high-dimensional asymptotics of +a family of first order gradient-based methods, learning an estimator (e.g. +M-estimator, shallow neural network, ...) from observations on Gaussian data +with empirical risk minimization. This includes widely used algorithms such as +stochastic gradient descent (SGD) or Nesterov acceleration. The obtained +equations match those resulting from the discretization of dynamical mean-field +theory (DMFT) equations from statistical physics when applied to gradient flow. +Our proof method allows us to give an explicit description of how memory +kernels build up in the effective dynamics, and to include non-separable update +functions, allowing datasets with non-identity covariance matrices. Finally, we +provide numerical implementations of the equations for SGD with generic +extensive batch-size and with constant learning rates. + +
+
+ comment: 38 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Spot The Odd One Out: Regularized Complete Cycle Consistent Anomaly + Detector GAN + + +
+ This study presents an adversarial method for anomaly detection in real-world +applications, leveraging the power of generative adversarial neural networks +(GANs) through cycle consistency in reconstruction error. Previous methods +suffer from high variance in class-wise accuracy, which makes them inapplicable +to all types of anomalies. The proposed method, named RCALAD, +tries to solve this problem by introducing a novel discriminator to the +structure, which results in a more efficient training process. Additionally, +RCALAD employs a supplementary distribution in the input space to steer +reconstructions toward the normal data distribution, effectively separating +anomalous samples from their reconstructions and facilitating more accurate +anomaly detection. To further enhance the performance of the model, two novel +anomaly scores are introduced. The proposed model has been thoroughly evaluated +through extensive experiments on six different datasets, yielding results that +demonstrate its superiority over existing state-of-the-art models. The code is +readily available to the research community at +https://github.com/zahraDehghanian97/RCALAD. + +
+
+ comment: under revision of Applied Soft Computing Journal +
+
+
+
+
+ + ♻ ☆ Achieving High Accuracy with PINNs via Energy Natural Gradients + + +
+ We propose energy natural gradient descent, a natural gradient method with +respect to a Hessian-induced Riemannian metric as an optimization algorithm for +physics-informed neural networks (PINNs) and the deep Ritz method. As a main +motivation we show that the update direction in function space resulting from +the energy natural gradient corresponds to the Newton direction modulo an +orthogonal projection onto the model's tangent space. We demonstrate +experimentally that energy natural gradient descent yields highly accurate +solutions with errors several orders of magnitude smaller than what is obtained +when training PINNs with standard optimizers like gradient descent or Adam, +even when those are allowed significantly more computation time. + +
+
+ comment: Published version +
+
+
+
+
+ + ♻ ☆ A Bio-Inspired Chaos Sensor Model Based on the Perceptron Neural + Network: Machine Learning Concept and Application for Computational + Neuro-Science + + +
+ The study presents a bio-inspired chaos sensor model based on the perceptron +neural network for the estimation of entropy of spike train in neurodynamic +systems. After training, the sensor on perceptron, having 50 neurons in the +hidden layer and 1 neuron at the output, approximates the fuzzy entropy of a +short time series with high accuracy, with a determination coefficient of R2 ~ +0.9. The Hindmarsh-Rose spike model was used to generate time series of spike +intervals, and datasets for training and testing the perceptron. The selection +of the hyperparameters of the perceptron model and the estimation of the sensor +accuracy were performed using the K-block cross-validation method. Even for a +hidden layer with one neuron, the model approximates the fuzzy entropy with +good results and the metric R2 ~ 0.5-0.8. In a simplified model with one neuron +and equal weights in the first layer, the principle of approximation is based +on the linear transformation of the average value of the time series into the +entropy value. An example of using the chaos sensor on spike train of action +potential recordings from the L5 dorsal rootlet of rat is provided. The +bio-inspired chaos sensor model based on an ensemble of neurons is able to +dynamically track the chaotic behavior of a spike signal and transmit this +information to other parts of the neurodynamic model for further processing. +The study will be useful for specialists in the field of computational +neuroscience, and also to create humanoid and animal robots, and bio-robots +with limited resources. + +
+
+ comment: 28 pages, 15 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Bayesian Hyperbolic Multidimensional Scaling + + +
+ Multidimensional scaling (MDS) is a widely used approach to representing +high-dimensional, dependent data. MDS works by assigning each observation a +location on a low-dimensional geometric manifold, with distance on the manifold +representing similarity. We propose a Bayesian approach to multidimensional +scaling when the low-dimensional manifold is hyperbolic. Using hyperbolic space +facilitates representing tree-like structures common in many settings (e.g. +text or genetic data with hierarchical structure). A Bayesian approach provides +regularization that minimizes the impact of measurement error in the observed +data and assesses uncertainty. We also propose a case-control likelihood +approximation that allows for efficient sampling from the posterior +distribution in larger data settings, reducing computational complexity from +approximately $O(n^2)$ to $O(n)$. We evaluate the proposed method against +state-of-the-art alternatives using simulations, canonical reference datasets, +Indian village network data, and human gene expression data. + +
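+
+ For intuition, the sketch below computes distances in the Poincare ball model of hyperbolic space and a plain (non-Bayesian) MDS stress between embedded distances and observed dissimilarities. The paper instead places a prior on the latent locations and samples their posterior; this stress function is only a stand-in for the likelihood term.

```python
import numpy as np

def poincare_dist(u, v, eps=1e-9):
    """Distance between points inside the unit ball (Poincare model)."""
    uu, vv = np.sum(u * u, -1), np.sum(v * v, -1)
    duv = np.sum((u - v) ** 2, -1)
    x = 1.0 + 2.0 * duv / ((1.0 - uu) * (1.0 - vv) + eps)
    return np.arccosh(np.maximum(x, 1.0))

def stress(Z, D):
    """Squared mismatch between embedded hyperbolic distances and dissimilarities D."""
    n, s = len(Z), 0.0
    for i in range(n):
        for j in range(i + 1, n):
            s += (poincare_dist(Z[i], Z[j]) - D[i, j]) ** 2
    return s

rng = np.random.default_rng(0)
Z = 0.5 * rng.uniform(-1, 1, size=(6, 2))                 # locations inside the disk
D = rng.uniform(0.5, 2.0, size=(6, 6))
D = (D + D.T) / 2
np.fill_diagonal(D, 0.0)
print(stress(Z, D))
```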
+
+
+
+
+ + ♻ ☆ Positive Unlabeled Contrastive Learning + + +
+ Self-supervised pretraining on unlabeled data followed by supervised +fine-tuning on labeled data is a popular paradigm for learning from limited +labeled examples. We extend this paradigm to the classical positive unlabeled +(PU) setting, where the task is to learn a binary classifier given only a few +labeled positive samples, and (often) a large amount of unlabeled samples +(which could be positive or negative). + We first propose a simple extension of standard infoNCE family of contrastive +losses, to the PU setting; and show that this learns superior representations, +as compared to existing unsupervised and supervised approaches. We then develop +a simple methodology to pseudo-label the unlabeled samples using a new +PU-specific clustering scheme; these pseudo-labels can then be used to train +the final (positive vs. negative) classifier. Our method handily outperforms +state-of-the-art PU methods over several standard PU benchmark datasets, while +not requiring a-priori knowledge of any class prior (which is a common +assumption in other PU methods). We also provide a simple theoretical analysis +that motivates our methods. + +
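+
+ A sketch of how an InfoNCE-style loss might be adapted to positive-unlabeled batches: unlabeled samples keep the usual instance-wise positive (their other augmented view), while the few labeled positives additionally treat one another as positives, in the spirit of supervised contrastive learning. This is an illustrative reading of the idea, not the paper's exact objective.

```python
import torch
import torch.nn.functional as F

def pu_info_nce(z1, z2, is_labeled_pos, temperature=0.5):
    """z1, z2: two augmented views (B, d); is_labeled_pos: bool mask (B,)."""
    z = F.normalize(torch.cat([z1, z2], dim=0), dim=1)       # (2B, d)
    n = z.shape[0]
    sim = z @ z.T / temperature
    eye = torch.eye(n, dtype=torch.bool)
    sim = sim.masked_fill(eye, float('-inf'))                # exclude self-pairs
    labels = torch.cat([is_labeled_pos, is_labeled_pos])
    idx = torch.arange(n)
    pos = torch.zeros(n, n, dtype=torch.bool)
    pos[idx, (idx + n // 2) % n] = True                      # the other augmented view
    pos |= labels[:, None] & labels[None, :]                 # labeled-positive pairs
    pos &= ~eye
    log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
    return -log_prob[pos].mean()

B, d = 32, 16
z1 = torch.randn(B, d, requires_grad=True)
z2 = torch.randn(B, d, requires_grad=True)
labeled = torch.zeros(B, dtype=torch.bool)
labeled[:4] = True                                           # a few labeled positives
loss = pu_info_nce(z1, z2, labeled)
loss.backward()
```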
+
+
+
+
+ + ♻ ☆ Proportionally Representative Clustering ICML + + +
+ In recent years, there has been a surge in effort to formalize notions of +fairness in machine learning. We focus on clustering -- one of the fundamental +tasks in unsupervised machine learning. We propose a new axiom ``proportional +representation fairness'' (PRF) that is designed for clustering problems where +the selection of centroids reflects the distribution of data points and how +tightly they are clustered together. Our fairness concept is not satisfied by +existing fair clustering algorithms. We design efficient algorithms to achieve +PRF both for unconstrained and discrete clustering problems. Our algorithm for +the unconstrained setting is also the first known polynomial-time approximation +algorithm for the well-studied Proportional Fairness (PF) axiom (Chen, Fain, +Lyu, and Munagala, ICML, 2019). Our algorithm for the discrete setting also +matches the best known approximation factor for PF. + +
+
+ comment: Revised version includes a new author (Jeremy Vollen) and new + results: Our algorithm for the unconstrained setting is also the first known + polynomial-time approximation algorithm for the well-studied Proportional + Fairness (PF) axiom (Chen, Fain, Lyu, and Munagala, ICML, 2019). Our + algorithm for the discrete setting also matches the best known approximation + factor for PF +
+
+
+
+
+ + ♻ ☆ Explainable Representation Learning of Small Quantum States + + +
+ Unsupervised machine learning models build an internal representation of +their training data without the need for explicit human guidance or feature +engineering. This learned representation provides insights into which features +of the data are relevant for the task at hand. In the context of quantum +physics, training models to describe quantum states without human intervention +offers a promising approach to gaining insight into how machines represent +complex quantum states. The ability to interpret the learned representation may +offer a new perspective on non-trivial features of quantum systems and their +efficient representation. We train a generative model on two-qubit density +matrices generated by a parameterized quantum circuit. In a series of +computational experiments, we investigate the learned representation of the +model and its internal understanding of the data. We observe that the model +learns an interpretable representation which relates the quantum states to +their underlying entanglement characteristics. In particular, our results +demonstrate that the latent representation of the model is directly correlated +with the entanglement measure concurrence. The insights from this study +represent proof of concept towards interpretable machine learning of quantum +states. Our approach offers insight into how machines learn to represent +small-scale quantum systems autonomously. + +
+
+
+
+
+ + ♻ ☆ Budgeted Multi-Armed Bandits with Asymmetric Confidence Intervals + + +
+ We study the stochastic Budgeted Multi-Armed Bandit (MAB) problem, where a +player chooses from $K$ arms with unknown expected rewards and costs. The goal +is to maximize the total reward under a budget constraint. A player thus seeks +to choose the arm with the highest reward-cost ratio as often as possible. +Current state-of-the-art policies for this problem have several issues, which +we illustrate. To overcome them, we propose a new upper confidence bound (UCB) +sampling policy, $\omega$-UCB, that uses asymmetric confidence intervals. These +intervals scale with the distance between the sample mean and the bounds of a +random variable, yielding a more accurate and tight estimation of the +reward-cost ratio compared to our competitors. We show that our approach has +logarithmic regret and consistently outperforms existing policies in synthetic +and real settings. + +
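+
+ For context, a generic budgeted-UCB loop: play the arm with the best optimistic reward-cost ratio (UCB on reward over LCB on cost) until the budget runs out. The symmetric Hoeffding-style radius used here is a simplification; $\omega$-UCB replaces it with asymmetric intervals that scale with the distance between the sample mean and the variable's bounds.

```python
import numpy as np

def budgeted_ucb(true_r, true_c, budget, seed=0):
    """Generic budgeted UCB with Bernoulli rewards and costs (not omega-UCB itself)."""
    rng = np.random.default_rng(seed)
    K = len(true_r)
    n, r_sum, c_sum = np.zeros(K), np.zeros(K), np.zeros(K)
    spent, reward, t = 0.0, 0.0, 0
    while spent < budget:
        t += 1
        if t <= K:
            a = t - 1                                   # initialise: play each arm once
        else:
            rad = np.sqrt(2 * np.log(t) / n)            # symmetric confidence radius
            ucb_r = np.minimum(r_sum / n + rad, 1.0)
            lcb_c = np.maximum(c_sum / n - rad, 1e-3)
            a = int(np.argmax(ucb_r / lcb_c))            # optimistic reward-cost ratio
        r = float(rng.random() < true_r[a])
        c = float(rng.random() < true_c[a])
        n[a] += 1; r_sum[a] += r; c_sum[a] += c
        reward += r; spent += c
    return reward

print(budgeted_ucb(true_r=[0.2, 0.5, 0.7], true_c=[0.3, 0.6, 0.9], budget=200.0))
```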
+
+
+
+
+ + ♻ ☆ Deep Bradley-Terry Rating: Quantifying Properties from Comparisons + + +
+ Many properties in the real world can't be directly observed, making them +difficult to learn. To deal with this challenging problem, prior works have +primarily focused on estimating those properties by using graded human scores +as the target label in the training. Meanwhile, rating algorithms based on the +Bradley-Terry model are extensively studied to evaluate the competitiveness of +players based on their match history. In this paper, we introduce the Deep +Bradley-Terry Rating (DBTR), a novel machine learning framework designed to +quantify and evaluate properties of unknown items. Our method seamlessly +integrates the Bradley-Terry model into the neural network structure. Moreover, +we generalize this architecture further to asymmetric environments with +unfairness, a condition more commonly encountered in real-world settings. +Through experimental analysis, we demonstrate that DBTR successfully learns to +quantify and estimate desired properties. + +
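+
+ The coupling of the Bradley-Terry model with a neural network can be sketched directly: a network maps item features to a scalar rating, and the probability that item i beats item j is the sigmoid of the rating difference, trained by maximum likelihood on pairwise comparisons. The architecture and toy data below are illustrative, not the DBTR implementation.

```python
import torch
import torch.nn as nn

class RatingNet(nn.Module):
    """Maps an item's features to a scalar Bradley-Terry strength (rating)."""
    def __init__(self, d):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(d, 32), nn.ReLU(), nn.Linear(32, 1))

    def forward(self, x):
        return self.net(x).squeeze(-1)

def bt_loss(model, x_winner, x_loser):
    """Negative log-likelihood of P(winner beats loser) = sigmoid(r_w - r_l)."""
    margin = model(x_winner) - model(x_loser)
    return nn.functional.softplus(-margin).mean()      # -log sigmoid(margin)

# Toy comparisons: the hidden "quality" is the first feature.
torch.manual_seed(0)
d, n = 8, 2000
xa, xb = torch.randn(n, d), torch.randn(n, d)
a_wins = xa[:, 0] > xb[:, 0]
x_w = torch.where(a_wins[:, None], xa, xb)
x_l = torch.where(a_wins[:, None], xb, xa)

model = RatingNet(d)
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
for _ in range(200):
    loss = bt_loss(model, x_w, x_l)
    opt.zero_grad(); loss.backward(); opt.step()
```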
+
+
+
+
+ + ♻ ☆ Quality at the Tail + + +
+ Benchmarking and evaluating deep learning models and systems necessitate a +meticulous approach to ensure comprehensive assessment. In practical +applications, it is paramount to consider both the inference quality and the +inference time, particularly within critical contexts, where stringent +requirements demand the simultaneous satisfaction of both metrics. Neglecting +either aspect can result in severe and irreversible consequences, including +loss of human life and property damage. Unfortunately, many studies lack a +comprehensive consideration of these metrics, often conducted under ideal or +permissive conditions, thereby leading to incomplete or non-intuitive +evaluation methodologies. + This study reveals that deep learning inference quality exhibits +fluctuations, which further introduces complications and challenges to the +benchmarking and evaluation. To better characterize the phenomenon, the concept +of "tail quality" is introduced, which indicates the quality at the tail of +distributions. "Tail quality" can offer a more objective evaluation, overcoming +the limitations of conventional inference quality and inference time metrics in +capturing the quality fluctuation phenomenon. To capture the phenomenon, this +paper also proposes a pioneering evaluation framework for comprehensive +assessment and analysis of various factors affecting inference time and +quality. Leveraging this framework enables the anticipation of the potential +distribution of inference time and inference quality, thus capturing "tail +quality" before practically applying deep learning. The effectiveness of the +evaluation framework is validated through experiments conducted on deep +learning models for three different tasks across four systems. Furthermore, +employing this evaluation framework, the experiments conducted a preliminary +analysis of several factors influencing inference quality and inference time. + +
+
+ comment: 11 pages, 4 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ FlexFringe: Modeling Software Behavior by Learning Probabilistic + Automata + + +
+ We present the efficient implementations of probabilistic deterministic +finite automaton learning methods available in FlexFringe. These implement +well-known strategies for state-merging including several modifications to +improve their performance in practice. We show experimentally that these +algorithms obtain competitive results and significant improvements over a +default implementation. We also demonstrate how to use FlexFringe to learn +interpretable models from software logs and use these for anomaly detection. +Although less interpretable, we show that learning smaller more convoluted +models improves the performance of FlexFringe on anomaly detection, +outperforming an existing solution based on neural nets. + +
+
+
+
+
+ + ♻ ☆ Fair Densities via Boosting the Sufficient Statistics of Exponential + Families ICML2023 + + +
+ We introduce a boosting algorithm to pre-process data for fairness. Starting +from an initial fair but inaccurate distribution, our approach shifts towards +better data fitting while still ensuring a minimal fairness guarantee. To do +so, it learns the sufficient statistics of an exponential family with +boosting-compliant convergence. Importantly, we are able to theoretically prove +that the learned distribution will have a representation rate and statistical +rate data fairness guarantee. Unlike recent optimization-based pre-processing +methods, our approach can be easily adapted for continuous domain features. +Furthermore, when the weak learners are specified to be decision trees, the +sufficient statistics of the learned distribution can be examined to provide +clues on sources of (un)fairness. Empirical results are presented to demonstrate +the quality of the results on real-world data. + +
+
+ comment: Published in Proceedings of the 40th International Conference on + Machine Learning (ICML2023) +
+
+
+
+
+ + ♻ ☆ FUSQA: Fetal Ultrasound Segmentation Quality Assessment + + +
+ Deep learning models have been effective for various fetal ultrasound +segmentation tasks. However, generalization to new unseen data has raised +questions about their effectiveness for clinical adoption. Normally, a +transition to new unseen data requires time-consuming and costly quality +assurance processes to validate the segmentation performance post-transition. +Segmentation quality assessment efforts have focused on natural images, where +the problem has been typically formulated as a dice score regression task. In +this paper, we propose a simplified Fetal Ultrasound Segmentation Quality +Assessment (FUSQA) model to tackle the segmentation quality assessment when no +masks exist to compare with. We formulate the segmentation quality assessment +process as an automated classification task to distinguish between good and +poor-quality segmentation masks for more accurate gestational age estimation. +We validate the performance of our proposed approach on two datasets we collect +from two hospitals using different ultrasound machines. We compare different +architectures, with our best-performing architecture achieving over 90% +classification accuracy on distinguishing between good and poor-quality +segmentation masks from an unseen dataset. Additionally, there was only a +1.45-day difference between the gestational age reported by doctors and +estimated based on CRL measurements using well-segmented masks. On the other +hand, this difference increased and reached up to 7.73 days when we calculated +CRL from the poorly segmented masks. As a result, AI-based approaches can +potentially aid fetal ultrasound segmentation quality assessment and might +detect poor segmentation in real-time screening in the future. + +
+
+ comment: 13 pages, 3 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Semantic-aware Node Synthesis for Imbalanced Heterogeneous Information + Networks + + +
+ Heterogeneous graph neural networks (HGNNs) have exhibited exceptional +efficacy in modeling the complex heterogeneity in heterogeneous information +networks (HINs). The critical advantage of HGNNs is their ability to handle +diverse node and edge types in HINs by extracting and utilizing the abundant +semantic information for effective representation learning. However, as a +widespread phenomenon in many real-world scenarios, the class-imbalance +distribution in HINs creates a performance bottleneck for existing HGNNs. Apart +from the quantity imbalance of nodes, another more crucial and distinctive +challenge in HINs is semantic imbalance. Minority classes in HINs often lack +diverse and sufficient neighbor nodes, resulting in biased and incomplete +semantic information. This semantic imbalance further compounds the difficulty +of accurately classifying minority nodes, leading to the performance +degradation of HGNNs. To tackle the imbalance of minority classes and +supplement their inadequate semantics, we present the first method for the +semantic imbalance problem in imbalanced HINs named Semantic-aware Node +Synthesis (SNS). By assessing the influence on minority classes, SNS adaptively +selects the heterogeneous neighbor nodes and augments the network with +synthetic nodes while preserving the minority semantics. In addition, we +introduce two regularization approaches for HGNNs that constrain the +representation of synthetic nodes from both semantic and class perspectives to +effectively suppress the potential noises from synthetic nodes, facilitating +more expressive embeddings for classification. The comprehensive experimental +study demonstrates that SNS consistently outperforms existing methods by a +large margin in different benchmark datasets. + +
+
+
+
+
+ + ♻ ☆ A Time-aware tensor decomposition for tracking evolving patterns + + +
+ Time-evolving data sets can often be arranged as a higher-order tensor with +one of the modes being the time mode. While tensor factorizations have been +successfully used to capture the underlying patterns in such higher-order data +sets, the temporal aspect is often ignored, allowing for the reordering of time +points. In recent studies, temporal regularizers are incorporated in the time +mode to tackle this issue. Nevertheless, existing approaches still do not allow +underlying patterns to change in time (e.g., spatial changes in the brain, +contextual changes in topics). In this paper, we propose temporal PARAFAC2 +(tPARAFAC2): a PARAFAC2-based tensor factorization method with temporal +regularization to extract gradually evolving patterns from temporal data. +Through extensive experiments on synthetic data, we demonstrate that tPARAFAC2 +can capture the underlying evolving patterns accurately, performing better than +PARAFAC2 and coupled matrix factorization with temporal smoothness +regularization. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Echo from noise: synthetic ultrasound image generation using diffusion + models for real image segmentation + + +
+ We propose a novel pipeline for the generation of synthetic ultrasound images +via Denoising Diffusion Probabilistic Models (DDPMs) guided by cardiac semantic +label maps. We show that these synthetic images can serve as a viable +substitute for real data in the training of deep-learning models for ultrasound +image analysis tasks such as cardiac segmentation. To demonstrate the +effectiveness of this approach, we generated synthetic 2D echocardiograms and +trained a neural network for segmenting the left ventricle and left atrium. The +performance of the network trained on exclusively synthetic images was +evaluated on an unseen dataset of real images and yielded mean Dice scores of +$88.6 \pm 4.91$\%, $91.9 \pm 4.22$\% and $85.2 \pm 4.83$\% for left ventricular +endocardium, epicardium and left atrial segmentation, respectively. This +represents a relative increase of $9.2$\%, $3.3$\% and $13.9$\% in Dice scores +compared to the previous state-of-the-art. The proposed pipeline has potential +for application to a wide range of other tasks across various medical imaging +modalities. + +
+
+
+
+
+ + ♻ ☆ Multi-task Representation Learning with Stochastic Linear Bandits + + +
+ We study the problem of transfer-learning in the setting of stochastic linear +bandit tasks. We consider that a low dimensional linear representation is +shared across the tasks, and study the benefit of learning this representation +in the multi-task learning setting. Following recent results to design +stochastic bandit policies, we propose an efficient greedy policy based on +trace norm regularization. It implicitly learns a low dimensional +representation by encouraging the matrix formed by the task regression vectors +to be of low rank. Unlike previous work in the literature, our policy does not +need to know the rank of the underlying matrix. We derive an upper bound on the +multi-task regret of our policy, which is, up to logarithmic factors, of order +$\sqrt{NdT(T+d)r}$, where $T$ is the number of tasks, $r$ the rank, $d$ the +number of variables and $N$ the number of rounds per task. We show the benefit +of our strategy compared to the baseline $Td\sqrt{N}$ obtained by solving each +task independently. We also provide a lower bound to the multi-task regret. +Finally, we corroborate our theoretical findings with preliminary experiments +on synthetic data. + +
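+
+ The trace-norm regularization at the heart of the proposed policy can be illustrated in a batch regression setting: proximal gradient descent on the per-task squared losses with singular-value soft-thresholding (the proximal operator of the nuclear norm) drives the matrix of task regression vectors toward low rank. This sketch ignores the bandit and exploration aspects entirely and is not the paper's algorithm.

```python
import numpy as np

def svt(W, tau):
    """Singular-value soft-thresholding: proximal operator of the trace norm."""
    U, s, Vt = np.linalg.svd(W, full_matrices=False)
    return U @ np.diag(np.maximum(s - tau, 0.0)) @ Vt

def multitask_trace_norm(Xs, ys, lam=1.0, lr=0.01, iters=500):
    """Proximal gradient for  sum_t ||X_t w_t - y_t||^2 / n_t + lam * ||W||_* ."""
    d, T = Xs[0].shape[1], len(Xs)
    W = np.zeros((d, T))
    for _ in range(iters):
        G = np.zeros_like(W)
        for t, (X, y) in enumerate(zip(Xs, ys)):
            G[:, t] = 2 * X.T @ (X @ W[:, t] - y) / len(y)
        W = svt(W - lr * G, lr * lam)                  # gradient step, then prox
    return W

# Tasks whose regression vectors share a one-dimensional representation.
rng = np.random.default_rng(0)
u = rng.normal(size=5)
Xs, ys = [], []
for t in range(6):
    X = rng.normal(size=(40, 5))
    Xs.append(X)
    ys.append(X @ (0.3 * (t + 1) * u) + 0.05 * rng.normal(size=40))

W = multitask_trace_norm(Xs, ys)
print(np.round(np.linalg.svd(W, compute_uv=False), 3))  # singular values concentrate on one direction
```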
+
+
+
+
+ + ♻ ☆ Multiscale Attention via Wavelet Neural Operators for Vision + Transformers + + +
+ Transformers have achieved widespread success in computer vision. At their +heart, there is a Self-Attention (SA) mechanism, an inductive bias that +associates each token in the input with every other token through a weighted +basis. The standard SA mechanism has quadratic complexity with the sequence +length, which impedes its utility to long sequences appearing in high +resolution vision. Recently, inspired by operator learning for PDEs, Adaptive +Fourier Neural Operators (AFNO) were introduced for high resolution attention +based on global convolution that is efficiently implemented via FFT. However, +the AFNO global filtering cannot well represent small and moderate scale +structures that commonly appear in natural images. To leverage the +coarse-to-fine scale structures we introduce a Multiscale Wavelet Attention +(MWA) by leveraging wavelet neural operators which incurs linear complexity in +the sequence size. We replace the attention in ViT with MWA and our experiments +with CIFAR and Tiny-ImageNet classification demonstrate significant improvement +over alternative Fourier-based attentions such as AFNO and Global Filter +Network (GFN). + +
+
+
+
+
+ + ♻ ☆ Time-aware Graph Structure Learning via Sequence Prediction on Temporal + Graphs CIKM 2023 + + +
+ Temporal Graph Learning, which aims to model the time-evolving nature of +graphs, has gained increasing attention and achieved remarkable performance +recently. However, in reality, graph structures are often incomplete and noisy, +which hinders temporal graph networks (TGNs) from learning informative +representations. Graph contrastive learning uses data augmentation to generate +plausible variations of existing data and learn robust representations. +However, rule-based augmentation approaches may be suboptimal as they lack +learnability and fail to leverage rich information from downstream tasks. To +address these issues, we propose a Time-aware Graph Structure Learning (TGSL) +approach via sequence prediction on temporal graphs, which learns better graph +structures for downstream tasks by adding potential temporal edges. In +particular, it predicts a time-aware context embedding based on previously +observed interactions and uses Gumbel-Top-K sampling to select the candidate +edges closest to this context embedding. Additionally, several candidate +sampling strategies are proposed to ensure both efficiency and diversity. +Furthermore, we jointly learn the graph structure and TGNs in an end-to-end +manner and perform inference on the refined graph. Extensive experiments on +temporal link prediction benchmarks demonstrate that TGSL yields significant +gains for popular TGNs such as TGAT and GraphMixer, and it outperforms other +contrastive learning methods on temporal graphs. We release the code at +https://github.com/ViktorAxelsen/TGSL. + +
+
+ comment: Accepted by CIKM 2023. The code is available at + https://github.com/ViktorAxelsen/TGSL +
+
+
+
+
+ + ♻ ☆ Variational Gibbs Inference for Statistical Model Estimation from + Incomplete Data + + +
+ Statistical models are central to machine learning with broad applicability +across a range of downstream tasks. The models are controlled by free +parameters that are typically estimated from data by maximum-likelihood +estimation or approximations thereof. However, when faced with real-world data +sets many of the models run into a critical issue: they are formulated in terms +of fully-observed data, whereas in practice the data sets are plagued with +missing data. The theory of statistical model estimation from incomplete data +is conceptually similar to the estimation of latent-variable models, where +powerful tools such as variational inference (VI) exist. However, in contrast +to standard latent-variable models, parameter estimation with incomplete data +often requires estimating exponentially-many conditional distributions of the +missing variables, hence making standard VI methods intractable. We address +this gap by introducing variational Gibbs inference (VGI), a new +general-purpose method to estimate the parameters of statistical models from +incomplete data. We validate VGI on a set of synthetic and real-world +estimation tasks, estimating important machine learning models such as +variational autoencoders and normalising flows from incomplete data. The +proposed method, whilst general-purpose, achieves competitive or better +performance than existing model-specific estimation methods. + +
+
+ comment: Published at Journal of Machine Learning Research (JMLR) +
+
+
+
+
+ + ♻ ☆ R2C-GAN: Restore-to-Classify GANs for Blind X-Ray Restoration and + COVID-19 Classification + + +
+ Restoration of poor quality images with a blended set of artifacts plays a +vital role for a reliable diagnosis. Existing studies have focused on specific +restoration problems such as image deblurring, denoising, and exposure +correction where there is usually a strong assumption on the artifact type and +severity. As a pioneer study in blind X-ray restoration, we propose a joint +model for generic image restoration and classification: Restore-to-Classify +Generative Adversarial Networks (R2C-GANs). Such a jointly optimized model +keeps any disease intact after the restoration. Therefore, this will naturally +lead to a higher diagnosis performance thanks to the improved X-ray image +quality. To accomplish this crucial objective, we define the restoration task +as an Image-to-Image translation problem from poor quality having noisy, +blurry, or over/under-exposed images to high quality image domain. The proposed +R2C-GAN model is able to learn forward and inverse transforms between the two +domains using unpaired training samples. Simultaneously, the joint +classification preserves the disease label during restoration. Moreover, the +R2C-GANs are equipped with operational layers/neurons reducing the network +depth and further boosting both restoration and classification performances. +The proposed joint model is extensively evaluated over the QaTa-COV19 dataset +for Coronavirus Disease 2019 (COVID-19) classification. The proposed +restoration approach achieves over 90% F1-Score which is significantly higher +than the performance of any deep model. Moreover, in the qualitative analysis, +the restoration performance of R2C-GANs is approved by a group of medical +doctors. We share the software implementation at +https://github.com/meteahishali/R2C-GAN. + +
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning with Multitask Episodic Memory Based on + Task-Conditioned Hypernetwork + + +
+ Deep reinforcement learning algorithms are usually impeded by sampling +inefficiency, heavily depending on multiple interactions with the environment +to acquire accurate decision-making capabilities. In contrast, humans rely on +their hippocampus to retrieve relevant information from past experiences of +related tasks, which guides their decision-making when learning a new task, +rather than exclusively depending on environmental interactions. Nevertheless, +designing a hippocampus-like module for an agent to incorporate past +experiences into established reinforcement learning algorithms presents two +challenges. The first challenge involves selecting the most relevant past +experiences for the current task, and the second challenge is integrating such +experiences into the decision network. To address these challenges, we propose +a novel method that utilizes a retrieval network based on a task-conditioned +hypernetwork, which adapts the retrieval network's parameters depending on the +task. At the same time, a dynamic modification mechanism enhances the +collaborative efforts between the retrieval and decision networks. We evaluate +the proposed method on the MiniGrid environment. The experimental results +demonstrate that our proposed method significantly outperforms strong +baselines. + 
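As a rough illustration of the task-conditioned hypernetwork idea (not the paper's architecture), the sketch below generates the weights of a small retrieval head from a task embedding, so the similarity used to retrieve past experiences changes with the task. All layer sizes are made-up placeholders.

```python
import torch
import torch.nn as nn

class TaskConditionedHyperRetrieval(nn.Module):
    """A hypernetwork maps a task embedding to the weight matrix of a small
    bilinear retrieval head, so the retrieval function is adapted per task."""
    def __init__(self, task_dim=16, key_dim=32, query_dim=32):
        super().__init__()
        self.key_dim, self.query_dim = key_dim, query_dim
        self.hyper = nn.Linear(task_dim, key_dim * query_dim)  # generates W

    def forward(self, task_emb, query, memory_keys):
        # task_emb: (task_dim,), query: (query_dim,), memory_keys: (N, key_dim)
        W = self.hyper(task_emb).view(self.key_dim, self.query_dim)
        scores = memory_keys @ (W @ query)       # task-adapted similarity
        return torch.softmax(scores, dim=0)      # retrieval weights over memory
```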
+
+
+
+
+ + ♻ ☆ PromptStyler: Prompt-driven Style Generation for Source-free Domain + Generalization ICCV 2023 + + +
+ In a joint vision-language space, a text feature (e.g., from "a photo of a +dog") could effectively represent its relevant image features (e.g., from dog +photos). Also, a recent study has demonstrated the cross-modal transferability +phenomenon of this joint space. From these observations, we propose +PromptStyler which simulates various distribution shifts in the joint space by +synthesizing diverse styles via prompts without using any images to deal with +source-free domain generalization. The proposed method learns to generate a +variety of style features (from "a S* style of a") via learnable style word +vectors for pseudo-words S*. To ensure that learned styles do not distort +content information, we force style-content features (from "a S* style of a +[class]") to be located nearby their corresponding content features (from +"[class]") in the joint vision-language space. After learning style word +vectors, we train a linear classifier using synthesized style-content features. +PromptStyler achieves the state of the art on PACS, VLCS, OfficeHome and +DomainNet, even though it does not require any images for training. + +
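The constraint that style-content features stay close to their corresponding content features can be written as a simple cosine-based loss. The snippet below is a hedged sketch of that idea only, assuming the features have already been produced by a joint vision-language text encoder; it is not the paper's exact objective.

```python
import torch
import torch.nn.functional as F

def content_consistency_loss(style_content_feats, content_feats):
    """Keep features of 'a S* style of a [class]' close (in cosine distance)
    to the features of the plain '[class]' prompt in the joint space."""
    sc = F.normalize(style_content_feats, dim=-1)
    c = F.normalize(content_feats, dim=-1)
    return (1.0 - (sc * c).sum(dim=-1)).mean()
```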
+
+ comment: Accepted to ICCV 2023, Project Page: https://promptstyler.github.io/ +
+
+
+
+
+ + ♻ ☆ SynJax: Structured Probability Distributions for JAX + + +
+ The development of deep learning software libraries enabled significant +progress in the field by allowing users to focus on modeling, while letting the +library take care of the tedious and time-consuming task of optimizing +execution for modern hardware accelerators. However, this has benefited only +particular types of deep learning models, such as Transformers, whose +primitives map easily to vectorized computation. Models that explicitly +account for structured objects, such as trees and segmentations, did not +benefit equally because they require custom algorithms that are difficult to +implement in a vectorized form. + SynJax directly addresses this problem by providing an efficient vectorized +implementation of inference algorithms for structured distributions covering +alignment, tagging, segmentation, constituency trees and spanning trees. With +SynJax we can build large-scale differentiable models that explicitly model +structure in the data. The code is available at +https://github.com/deepmind/synjax. + 
+
+
+
+
+ + ♻ ☆ SGL-PT: A Strong Graph Learner with Graph Prompt Tuning + + +
+ Recently, much effort has been devoted to designing graph self-supervised methods +to obtain generalized pre-trained models, and adapting pre-trained models to +downstream tasks through fine-tuning. However, there exists an inherent gap +between pretext and downstream graph tasks, which prevents the ability of +pre-trained models from being fully exploited and can even lead to negative transfer. Meanwhile, +prompt tuning has seen emerging success in natural language processing by +aligning pre-training and fine-tuning with consistent training objectives. In +this paper, we identify the challenges for graph prompt tuning: the first is +the lack of a strong and universal pre-training task across sundry pre-training +methods in the graph domain; the second lies in the difficulty of +designing a consistent training objective for both pre-training and downstream +tasks. To overcome the above obstacles, we propose a novel framework named SGL-PT +which follows the learning strategy ``Pre-train, Prompt, and Predict''. +Specifically, we propose a strong and universal pre-training task, coined SGL, +that acquires the complementary merits of generative and contrastive +self-supervised graph learning. Aiming at the graph classification task, we +unify pre-training and fine-tuning by designing a novel verbalizer-free +prompting function, which reformulates the downstream task in a format similar +to the pretext task. Empirical results show that our method surpasses other +baselines under the unsupervised setting, and our prompt tuning method greatly +facilitates models on biological datasets compared with fine-tuning methods. + 
+
+
+
+
+ + ♻ ☆ μSplit: efficient image decomposition for microscopy data ICCV 2023 + + +
+ We present {\mu}Split, a dedicated approach for trained image decomposition +in the context of fluorescence microscopy images. We find that best results +using regular deep architectures are achieved when large image patches are used +during training, making memory consumption the limiting factor to further +improving performance. We therefore introduce lateral contextualization (LC), a +memory efficient way to train powerful networks and show that LC leads to +consistent and significant improvements on the task at hand. We integrate LC +with U-Nets, Hierarchical AEs, and Hierarchical VAEs, for which we formulate a +modified ELBO loss. Additionally, LC enables training deeper hierarchical +models than otherwise possible and, interestingly, helps to reduce tiling +artefacts that are inherently impossible to avoid when using tiled VAE +predictions. We apply {\mu}Split to five decomposition tasks, one on a +synthetic dataset, four others derived from real microscopy data. LC achieves +SOTA results (average improvements to the best baseline of 2.36 dB PSNR), while +simultaneously requiring considerably less GPU memory. + +
+
+ comment: Published at ICCV 2023. 10 pages, 7 figures, 9 pages supplement, 8 + supplementary figures +
+
+
+
+
+ + ♻ ☆ Catastrophic overfitting can be induced with discriminative non-robust + features + + +
+ Adversarial training (AT) is the de facto method for building robust neural +networks, but it can be computationally expensive. To mitigate this, fast +single-step attacks can be used, but this may lead to catastrophic overfitting +(CO). This phenomenon appears when networks gain non-trivial robustness during +the first stages of AT, but then reach a breaking point where they become +vulnerable in just a few iterations. The mechanisms that lead to this failure +mode are still poorly understood. In this work, we study the onset of CO in +single-step AT methods through controlled modifications of typical datasets of +natural images. In particular, we show that CO can be induced at much smaller +$\epsilon$ values than previously observed, simply by injecting images with +seemingly innocuous features. These features aid non-robust classification but +are not enough to achieve robustness on their own. Through extensive +experiments, we analyze this novel phenomenon and discover that the presence of +these easy features induces a learning shortcut that leads to CO. Our findings +provide new insights into the mechanisms of CO and improve our understanding of +the dynamics of AT. The code to reproduce our experiments can be found at +https://github.com/gortizji/co_features. + 
+
+ comment: Published in Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ♻ ☆ #InsTag: Instruction Tagging for Analyzing Supervised Fine-tuning of + Large Language Models + + +
+ Foundation language models obtain the instruction-following ability through +supervised fine-tuning (SFT). Diversity and complexity are considered critical +factors of a successful SFT dataset, while their definitions remain obscure and +lack quantitative analyses. In this work, we propose InsTag, an open-set +fine-grained tagger, to tag samples within SFT datasets based on semantics and +intentions and define instruction diversity and complexity regarding tags. We +obtain 6.6K tags to describe comprehensive user queries. Then we analyze +popular open-sourced SFT datasets and find that the model ability grows with +more diverse and complex data. Based on this observation, we propose a data +selector based on InsTag to select 6K diverse and complex samples from +open-source datasets and fine-tune models on InsTag-selected data. The +resulting models, TagLM, outperform open-source models based on considerably +larger SFT data evaluated by MT-Bench, echoing the importance of query +diversity and complexity. We open-source InsTag in +https://github.com/OFA-Sys/InsTag. + +
+
+
+
+
+ + ♻ ☆ Clustering and Structural Robustness in Causal Diagrams + + +
+ Graphs are commonly used to represent and visualize causal relations. For a +small number of variables, this approach provides a succinct and clear view of +the scenario at hand. As the number of variables under study increases, the +graphical approach may become impractical, and the clarity of the +representation is lost. Clustering of variables is a natural way to reduce the +size of the causal diagram, but it may erroneously change the essential +properties of the causal relations if implemented arbitrarily. We define a +specific type of cluster, called transit cluster, that is guaranteed to +preserve the identifiability properties of causal effects under certain +conditions. We provide a sound and complete algorithm for finding all transit +clusters in a given graph and demonstrate how clustering can simplify the +identification of causal effects. We also study the inverse problem, where one +starts with a clustered graph and looks for extended graphs where the +identifiability properties of causal effects remain unchanged. We show that +this kind of structural robustness is closely related to transit clusters. + +
+
+ comment: This is the version published in JMLR +
+
+
+
+
+ + ♻ ☆ Fairness through Aleatoric Uncertainty + + +
+ We propose a simple yet effective solution to tackle the often-competing +goals of fairness and utility in classification tasks. While fairness ensures +that the model's predictions are unbiased and do not discriminate against any +particular group or individual, utility focuses on maximizing the model's +predictive performance. This work introduces the idea of leveraging aleatoric +uncertainty (e.g., data ambiguity) to improve the fairness-utility trade-off. +Our central hypothesis is that aleatoric uncertainty is a key factor for +algorithmic fairness and samples with low aleatoric uncertainty are modeled +more accurately and fairly than those with high aleatoric uncertainty. We then +propose a principled model to improve fairness when aleatoric uncertainty is +high and improve utility elsewhere. Our approach first intervenes in the data +distribution to better decouple aleatoric uncertainty and epistemic +uncertainty. It then introduces a fairness-utility bi-objective loss defined +based on the estimated aleatoric uncertainty. Our approach is theoretically +guaranteed to improve the fairness-utility trade-off. Experimental results on +both tabular and image datasets show that the proposed approach outperforms +state-of-the-art methods w.r.t. the fairness-utility trade-off and w.r.t. both +group and individual fairness metrics. This work presents a fresh perspective +on the trade-off between utility and algorithmic fairness and opens a key +avenue for the potential of using prediction uncertainty in fair machine +learning. + +
+
+
+
+
+ + ♻ ☆ AudioFormer: Audio Transformer learns audio feature representations from + discrete acoustic codes + + +
+ We propose a method named AudioFormer, which learns audio feature +representations through the acquisition of discrete acoustic codes and +subsequently fine-tunes them for audio classification tasks. Initially, we +introduce a novel perspective by considering the audio classification task as a +form of natural language understanding (NLU). Leveraging an existing neural +audio codec model, we generate discrete acoustic codes and utilize them to train +a masked language model (MLM), thereby obtaining audio feature representations. +Furthermore, we pioneer the integration of a Multi-Positive sample Contrastive +(MPC) learning approach. This method enables the learning of joint +representations among multiple discrete acoustic codes within the same audio +input. In our experiments, we treat discrete acoustic codes as textual data and +train a masked language model using a cloze-like methodology, ultimately +deriving high-quality audio representations. Notably, the MPC learning technique +effectively captures collaborative representations among distinct positive +samples. Our research outcomes demonstrate that AudioFormer attains +significantly improved performance compared to prevailing monomodal audio +classification models across multiple datasets, and even outperforms +audio-visual multimodal classification models on select datasets. +Specifically, our approach achieves remarkable results on the AudioSet (2M, 20K) +and FSD50K datasets, with performance scores of 53.9, 45.1, and +65.6, respectively. We have openly shared both the code and models: +https://github.com/LZH-0225/AudioFormer.git. + 
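A multi-positive contrastive objective of the kind described (several positives per anchor drawn from the same audio input) is commonly written as an InfoNCE loss averaged over all positives. The following is a generic sketch under that assumption, not the authors' exact MPC formulation.

```python
import torch
import torch.nn.functional as F

def multi_positive_contrastive(z, pos_mask, tau=0.07):
    """z: (N, d) embeddings of discrete-code views.
    pos_mask[i, j] = True if j (j != i) is a positive of anchor i, e.g. codes
    from the same audio clip. Averages the InfoNCE log-likelihood over all
    positives of each anchor."""
    z = F.normalize(z, dim=-1)
    logits = z @ z.t() / tau
    logits.fill_diagonal_(float('-inf'))               # exclude self-similarity
    log_prob = logits - torch.logsumexp(logits, dim=1, keepdim=True)
    pos_mask = pos_mask.float()
    per_anchor = (log_prob * pos_mask).sum(1) / pos_mask.sum(1).clamp(min=1)
    return -per_anchor.mean()
```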
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Label-efficient Time Series Representation Learning: A Review + + +
+ The scarcity of labeled data is one of the main challenges of applying deep +learning models on time series data in the real world. Therefore, several +approaches, e.g., transfer learning, self-supervised learning, and +semi-supervised learning, have been recently developed to promote the learning +capability of deep learning models from the limited time series labels. In this +survey, for the first time, we provide a novel taxonomy to categorize existing +approaches that address the scarcity of labeled data problem in time series +data based on their dependency on external data sources. Moreover, we present a +review of the recent advances in each approach and conclude the limitations of +the current works and provide future directions that could yield better +progress in the field. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ MiAMix: Enhancing Image Classification through a Multi-stage Augmented + Mixed Sample Data Augmentation Method + + +
+ Despite substantial progress in the field of deep learning, overfitting +persists as a critical challenge, and data augmentation has emerged as a +particularly promising approach due to its capacity to enhance model +generalization in various computer vision tasks. While various strategies have +been proposed, Mixed Sample Data Augmentation (MSDA) has shown great potential +for enhancing model performance and generalization. We introduce a novel mixup +method called MiAMix, which stands for Multi-stage Augmented Mixup. MiAMix +integrates image augmentation into the mixup framework, utilizes multiple +diversified mixing methods concurrently, and improves the mixing method by +randomly selecting mixing mask augmentation methods. While recent methods rely on +saliency information, MiAMix is also designed for computational efficiency, +reducing additional overhead and offering easy integration into existing +training pipelines. We comprehensively evaluate MiAMix using four image +benchmarks and pit it against current state-of-the-art mixed sample data +augmentation techniques to demonstrate that MiAMix improves performance without +heavy computational overhead. + 
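The general idea of sampling among several mixing schemes can be illustrated with a toy two-scheme example (a global blend versus a CutMix-style box mask, with a Beta-sampled ratio). This is only an illustration of the pattern, not MiAMix itself, and all hyperparameters are placeholders.

```python
import random
import numpy as np

def random_box_mask(h, w, lam):
    """CutMix-style rectangular mask covering roughly (1 - lam) of the image."""
    cut = np.sqrt(1.0 - lam)
    ch, cw = int(h * cut), int(w * cut)
    cy, cx = random.randrange(h), random.randrange(w)
    mask = np.ones((h, w), dtype=np.float32)
    y1, y2 = max(cy - ch // 2, 0), min(cy + ch // 2, h)
    x1, x2 = max(cx - cw // 2, 0), min(cx + cw // 2, w)
    mask[y1:y2, x1:x2] = 0.0
    return mask

def mixed_sample(x1, y1, x2, y2, alpha=1.0):
    """Pick a mixing scheme at random (global blend or box mask) and mix both
    the images (H, W, C arrays) and the one-hot labels accordingly."""
    lam = np.random.beta(alpha, alpha)
    if random.random() < 0.5:                          # plain mixup blend
        x = lam * x1 + (1 - lam) * x2
    else:                                              # masked (CutMix-like) mix
        m = random_box_mask(x1.shape[0], x1.shape[1], lam)[..., None]
        x = m * x1 + (1 - m) * x2
        lam = m.mean()                                 # actual area kept from x1
    return x, lam * y1 + (1 - lam) * y2
```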
+
+
+
+
+ + ♻ ☆ Diffusion Model in Causal Inference with Unmeasured Confounders + + +
+ We study how to extend the use of the diffusion model to answer the causal +question from the observational data under the existence of unmeasured +confounders. In Pearl's framework of using a Directed Acyclic Graph (DAG) to +capture the causal intervention, a Diffusion-based Causal Model (DCM) was +proposed incorporating the diffusion model to answer the causal questions more +accurately, assuming that all of the confounders are observed. However, +unmeasured confounders in practice exist, which hinders DCM from being +applicable. To alleviate this limitation of DCM, we propose an extended model +called Backdoor Criterion based DCM (BDCM), whose idea is rooted in the +Backdoor criterion to find the variables in DAG to be included in the decoding +process of the diffusion model so that we can extend DCM to the case with +unmeasured confounders. Synthetic data experiment demonstrates that our +proposed model captures the counterfactual distribution more precisely than DCM +under the unmeasured confounders. + +
+
+ comment: 6 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ FedALA: Adaptive Local Aggregation for Personalized Federated Learning AAAI 2023 + + +
+ A key challenge in federated learning (FL) is the statistical heterogeneity +that impairs the generalization of the global model on each client. To address +this, we propose a method, Federated Learning with Adaptive Local Aggregation +(FedALA), which captures the desired information in the global model for client +models in personalized FL. The key component of FedALA is an Adaptive Local +Aggregation (ALA) module, which can adaptively aggregate the downloaded global +model and local model towards the local objective on each client to initialize +the local model before training in each iteration. To evaluate the +effectiveness of FedALA, we conduct extensive experiments with five benchmark +datasets in the computer vision and natural language processing domains. FedALA +outperforms eleven state-of-the-art baselines by up to 3.27% in test accuracy. +Furthermore, we also apply the ALA module to other federated learning methods and +achieve up to 24.19% improvement in test accuracy. + 
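The core of an adaptive local aggregation step can be pictured as an element-wise, learnable interpolation between the downloaded global model and the local model. The sketch below shows only that interpolation (training of the element-wise weights on local data is omitted) and is an illustration in the spirit of ALA rather than the released implementation.

```python
import torch

def adaptive_local_aggregation(local, global_, weights):
    """Element-wise aggregation: each parameter is a learned convex combination
    of the local and the downloaded global model.
    `local`, `global_`, and `weights` are dicts of tensors with identical keys;
    the weights are assumed to be trained on local data and clipped to [0, 1]."""
    merged = {}
    for name in local:
        w = weights[name].clamp(0.0, 1.0)
        merged[name] = local[name] + w * (global_[name] - local[name])
    return merged
```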
+
+ comment: Accepted by AAAI 2023 +
+
+
+
+
+ + ♻ ☆ Multi-Scale Hybrid Vision Transformer for Learning Gastric Histology: + AI-Based Decision Support System for Gastric Cancer Treatment + + +
+ Gastric endoscopic screening is an effective way to decide appropriate +gastric cancer (GC) treatment at an early stage, reducing the GC-associated +mortality rate. Although artificial intelligence (AI) has shown great +promise in assisting pathologists to screen digitized whole slide images, +existing AI systems are limited in fine-grained cancer subclassifications and +have little usability in planning cancer treatment. We propose a practical AI +system that enables five subclassifications of GC pathology, which can be +directly matched to general GC treatment guidance. The AI system is designed to +efficiently differentiate multiple classes of GC through a multi-scale +self-attention mechanism using 2-stage hybrid Vision Transformer (ViT) +networks, mimicking the way human pathologists understand histology. The +AI system demonstrates reliable diagnostic performance by achieving a +class-average sensitivity above 0.85 on a total of 1,212 slides from a +multicentric cohort. Furthermore, AI-assisted pathologists show a 12% improvement +in diagnostic sensitivity and an 18% reduction in screening +time compared to pathologists without AI assistance. Our results demonstrate that AI-assisted +gastric endoscopic screening has great potential for providing presumptive +pathologic opinions and guiding appropriate gastric cancer treatment in +practical clinical settings. + 
+
+
+
+
+ + ♻ ☆ Discovering Dynamic Causal Space for DAG Structure Learning KDD 2023 + + +
+ Discovering causal structure from purely observational data (i.e., causal +discovery), aiming to identify causal relationships among variables, is a +fundamental task in machine learning. The recent invention of differentiable +score-based DAG learners is a crucial enabler, which reframes the combinatorial +optimization problem into a differentiable optimization with a DAG constraint +over directed graph space. Despite their great success, these cutting-edge DAG +learners incorporate DAG-ness independent score functions to evaluate the +directed graph candidates, lacking in considering graph structure. As a result, +measuring the data fitness alone regardless of DAG-ness inevitably leads to +discovering suboptimal DAGs and model vulnerabilities. Towards this end, we +propose a dynamic causal space for DAG structure learning, coined CASPER, that +integrates the graph structure into the score function as a new measure in the +causal space to faithfully reflect the causal distance between estimated and +ground truth DAG. CASPER revises the learning process as well as enhances the +DAG structure learning via adaptive attention to DAG-ness. Grounded by +empirical visualization, CASPER, as a space, satisfies a series of desired +properties, such as structure awareness and noise robustness. Extensive +experiments on both synthetic and real-world datasets clearly validate the +superiority of our CASPER over the state-of-the-art causal discovery methods in +terms of accuracy and robustness. + +
+
+ comment: Accepted by KDD 2023. Our codes are available at + https://github.com/liuff19/CASPER +
+
+
+
+
+ + ♻ ☆ Non-stationary Online Learning with Memory and Non-stochastic Control + + +
+ We study the problem of Online Convex Optimization (OCO) with memory, which +allows loss functions to depend on past decisions and thus captures temporal +effects of learning problems. In this paper, we introduce dynamic policy regret +as the performance measure to design algorithms robust to non-stationary +environments, which competes algorithms' decisions with a sequence of changing +comparators. We propose a novel algorithm for OCO with memory that provably +enjoys an optimal dynamic policy regret in terms of time horizon, +non-stationarity measure, and memory length. The key technical challenge is how +to control the switching cost, the cumulative movements of player's decisions, +which is neatly addressed by a novel switching-cost-aware online ensemble +approach equipped with a new meta-base decomposition of dynamic policy regret +and a careful design of meta-learner and base-learner that explicitly +regularizes the switching cost. The results are further applied to tackle +non-stationarity in online non-stochastic control (Agarwal et al., 2019), i.e., +controlling a linear dynamical system with adversarial disturbance and convex +cost functions. We derive a novel gradient-based controller with dynamic policy +regret guarantees, which is the first controller provably competitive to a +sequence of changing policies for online non-stochastic control. + +
+
+
+
+
+ + ♻ ☆ GCformer: An Efficient Framework for Accurate and Scalable Long-Term + Multivariate Time Series Forecasting + + +
+ Transformer-based models have emerged as promising tools for time series +forecasting. + However, these models cannot make accurate predictions for long input time +series. On the one hand, they fail to capture global dependencies within time +series data. On the other hand, long input sequences usually lead to a large +model size and high time complexity. + To address these limitations, we present GCformer, which combines a +structured global convolutional branch for processing long input sequences with +a local Transformer-based branch for capturing short, recent signals. A +cohesive framework for a global convolution kernel has been introduced, +utilizing three distinct parameterization methods. The selected structured +convolutional kernel in the global branch has been specifically crafted with +sublinear complexity, thereby allowing for the efficient and effective +processing of lengthy and noisy input signals. Empirical studies on six +benchmark datasets demonstrate that GCformer outperforms state-of-the-art +methods, reducing MSE in multivariate time series benchmarks by 4.38% and +model parameters by 61.92%. In particular, the global convolutional branch can +serve as a plug-in block to enhance the performance of other models, with an +average improvement of 31.93% across various recently published +Transformer-based models. Our code is publicly available at +https://github.com/zyj-111/GCformer. + 
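A global convolutional branch typically relies on the standard trick of computing a length-L circular convolution in O(L log L) via the FFT. Below is a generic PyTorch sketch of that operation, not GCformer's structured kernel parameterizations.

```python
import torch

def global_conv_fft(x, kernel):
    """Circular global convolution over the time axis via FFT.
    x: (batch, length, channels), kernel: (length, channels)."""
    Xf = torch.fft.rfft(x, dim=1)
    Kf = torch.fft.rfft(kernel, dim=0).unsqueeze(0)   # broadcast over the batch
    return torch.fft.irfft(Xf * Kf, n=x.size(1), dim=1)
```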
+
+
+
+
+ + ♻ ☆ Anomaly Detection in Automated Fibre Placement: Learning with Data + Limitations + + +
+ Conventional defect detection systems in Automated Fibre Placement (AFP) +typically rely on end-to-end supervised learning, necessitating a substantial +number of labelled defective samples for effective training. However, the +scarcity of such labelled data poses a challenge. To overcome this limitation, +we present a comprehensive framework for defect detection and localization in +Automated Fibre Placement. Our approach combines unsupervised deep learning and +classical computer vision algorithms, eliminating the need for labelled data or +manufacturing defect samples. It efficiently detects various surface issues +while requiring fewer images of composite parts for training. Our framework +employs an innovative sample extraction method leveraging AFP's inherent +symmetry to expand the dataset. By inputting a depth map of the fibre layup +surface, we extract local samples aligned with each composite strip (tow). +These samples are processed through an autoencoder, trained on normal samples +for precise reconstructions, highlighting anomalies through reconstruction +errors. Aggregated values form an anomaly map for insightful visualization. The +framework employs blob detection on this map to locate manufacturing defects. +The experimental findings reveal that despite training the autoencoder with a +limited number of images, our proposed method exhibits satisfactory detection +accuracy and accurately identifies defect locations. Our framework demonstrates +comparable performance to existing methods, while also offering the advantage +of detecting all types of anomalies without relying on an extensive labelled +dataset of defects. + +
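The aggregation of per-patch autoencoder reconstruction errors into an anomaly map can be sketched in a few lines. This is an illustrative reimplementation of the generic idea (train the autoencoder on normal samples, score patches by reconstruction error, accumulate errors into a 2D map), with the autoencoder itself assumed given; it is not the paper's pipeline.

```python
import numpy as np

def anomaly_map(patches, coords, autoencoder, image_shape):
    """Score each local patch by its reconstruction error under an autoencoder
    trained on normal samples, and aggregate the errors into a 2D anomaly map.
    `autoencoder` is any callable patch -> reconstruction (assumed trained);
    `coords` gives the top-left (y, x) position of each patch."""
    amap = np.zeros(image_shape, dtype=np.float32)
    count = np.zeros(image_shape, dtype=np.float32)
    for patch, (y, x) in zip(patches, coords):
        err = float(np.mean((autoencoder(patch) - patch) ** 2))
        h, w = patch.shape[:2]
        amap[y:y + h, x:x + w] += err
        count[y:y + h, x:x + w] += 1
    return amap / np.maximum(count, 1)   # average error where patches overlap
```

Blob detection (e.g., thresholding and connected components) would then be run on the returned map to localize defects.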
+
+
+
+
+ + ♻ ☆ Virtual Human Generative Model: Masked Modeling Approach for Learning + Human Characteristics + + +
+ Identifying the relationship between healthcare attributes, lifestyles, and +personality is vital for understanding and improving physical and mental +conditions. Machine learning approaches are promising for modeling their +relationships and offering actionable suggestions. In this paper, we propose +Virtual Human Generative Model (VHGM), a machine learning model for estimating +attributes about healthcare, lifestyles, and personalities. VHGM is a deep +generative model trained with masked modeling to learn the joint distribution +of attributes conditioned on known ones. Using heterogeneous tabular datasets, +VHGM learns more than 1,800 attributes efficiently. We numerically evaluate the +performance of VHGM and its training techniques. As a proof-of-concept of VHGM, +we present several applications demonstrating user scenarios, such as virtual +measurements of healthcare attributes and hypothesis verifications of +lifestyles. + +
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Decentralized Federated Learning: Fundamentals, State of the Art, + Frameworks, Trends, and Challenges + + +
+ In recent years, Federated Learning (FL) has gained relevance in training +collaborative models without sharing sensitive data. Since its birth, +Centralized FL (CFL) has been the most common approach in the literature, where +a central entity creates a global model. However, a centralized approach leads +to increased latency due to bottlenecks, heightened vulnerability to system +failures, and trustworthiness concerns affecting the entity responsible for the +global model creation. Decentralized Federated Learning (DFL) emerged to +address these concerns by promoting decentralized model aggregation and +minimizing reliance on centralized architectures. However, despite the work +done in DFL, the literature has not (i) studied the main aspects +differentiating DFL and CFL; (ii) analyzed DFL frameworks to create and +evaluate new solutions; and (iii) reviewed application scenarios using DFL. +Thus, this article identifies and analyzes the main fundamentals of DFL in +terms of federation architectures, topologies, communication mechanisms, +security approaches, and key performance indicators. Additionally, the paper at +hand explores existing mechanisms to optimize critical DFL fundamentals. Then, +the most relevant features of the current DFL frameworks are reviewed and +compared. After that, it analyzes the most used DFL application scenarios, +identifying solutions based on the fundamentals and frameworks previously +defined. Finally, the evolution of existing DFL solutions is studied to provide +a list of trends, lessons learned, and open challenges. + +
+
+
+
+
+ + ♻ ☆ SAILOR: Structural Augmentation Based Tail Node Representation Learning CIKM 2023 + + +
+ Graph Neural Networks (GNNs) have achieved state-of-the-art performance in +representation learning for graphs recently. However, the effectiveness of +GNNs, which capitalize on the key operation of message propagation, highly +depends on the quality of the topology structure. Most of the graphs in +real-world scenarios follow a long-tailed distribution on their node degrees, +that is, a vast majority of the nodes in the graph are tail nodes with only a +few connected edges. GNNs produce inferior node representations for tail nodes +since they lack structural information. In the pursuit of promoting the +expressiveness of GNNs for tail nodes, we explore how the deficiency of +structural information deteriorates the performance of tail nodes and propose a +general Structural Augmentation based taIL nOde Representation learning +framework, dubbed as SAILOR, which can jointly learn to augment the graph +structure and extract more informative representations for tail nodes. +Extensive experiments on public benchmark datasets demonstrate that SAILOR can +significantly improve the tail node representations and outperform the +state-of-the-art baselines. + +
+
+ comment: Accepted by CIKM 2023; Code is available at + https://github.com/Jie-Re/SAILOR +
+
+
+
+
+ + ♻ ☆ InfiniCity: Infinite-Scale City Synthesis + + +
+ Toward infinite-scale 3D city synthesis, we propose a novel framework, +InfiniCity, which constructs and renders an unconstrainedly large and +3D-grounded environment from random noises. InfiniCity decomposes the seemingly +impractical task into three feasible modules, taking advantage of both 2D and +3D data. First, an infinite-pixel image synthesis module generates +arbitrary-scale 2D maps from the bird's-eye view. Next, an octree-based voxel +completion module lifts the generated 2D map to 3D octrees. Finally, a +voxel-based neural rendering module texturizes the voxels and renders 2D +images. InfiniCity can thus synthesize arbitrary-scale and traversable 3D city +environments, and allow flexible and interactive editing from users. We +quantitatively and qualitatively demonstrate the efficacy of the proposed +framework. Project page: https://hubert0527.github.io/infinicity/ + +
+
+
+
+
+ + ♻ ☆ Disentanglement via Latent Quantization + + +
+ In disentangled representation learning, a model is asked to tease apart a +dataset's underlying sources of variation and represent them independently of +one another. Since the model is provided with no ground truth information about +these sources, inductive biases take a paramount role in enabling +disentanglement. In this work, we construct an inductive bias towards encoding +to and decoding from an organized latent space. Concretely, we do this by (i) +quantizing the latent space into discrete code vectors with a separate +learnable scalar codebook per dimension and (ii) applying strong model +regularization via an unusually high weight decay. Intuitively, the latent +space design forces the encoder to combinatorially construct codes from a small +number of distinct scalar values, which in turn enables the decoder to assign a +consistent meaning to each value. Regularization then serves to drive the model +towards this parsimonious strategy. We demonstrate the broad applicability of +this approach by adding it to both basic data-reconstructing (vanilla +autoencoder) and latent-reconstructing (InfoGAN) generative models. For +reliable evaluation, we also propose InfoMEC, a new set of metrics for +disentanglement that is cohesively grounded in information theory and fixes +well-established shortcomings in previous metrics. Together with +regularization, latent quantization dramatically improves the modularity and +explicitness of learned representations on a representative suite of benchmark +datasets. In particular, our quantized-latent autoencoder (QLAE) consistently +outperforms strong methods from prior work in these key disentanglement +properties without compromising data reconstruction. + +
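The first ingredient, quantizing each latent dimension against its own small scalar codebook with a straight-through gradient, can be sketched as follows. Codebook sizes and the straight-through choice are assumptions made for illustration, not the paper's exact module.

```python
import torch
import torch.nn as nn

class PerDimensionQuantizer(nn.Module):
    """Quantize each latent dimension against its own learnable scalar codebook,
    using a straight-through estimator so gradients still reach the encoder."""
    def __init__(self, latent_dim=8, codes_per_dim=10):
        super().__init__()
        self.codebooks = nn.Parameter(torch.randn(latent_dim, codes_per_dim))

    def forward(self, z):                        # z: (batch, latent_dim)
        dist = (z.unsqueeze(-1) - self.codebooks.unsqueeze(0)) ** 2
        idx = dist.argmin(dim=-1)                # nearest code per dimension
        zq = torch.gather(self.codebooks.expand(z.size(0), -1, -1), 2,
                          idx.unsqueeze(-1)).squeeze(-1)
        return z + (zq - z).detach(), idx        # straight-through quantized latent
```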
+
+ comment: 25 pages, 15 figures, code available at + https://github.com/kylehkhsu/disentangle +
+
+
+
+
+ + ♻ ☆ Diffusion Models for Graphs Benefit From Discrete State Spaces NeurIPS 2022 + + +
+ Denoising diffusion probabilistic models and score-matching models have +proven to be very powerful for generative tasks. While these approaches have +also been applied to the generation of discrete graphs, they have, so far, +relied on continuous Gaussian perturbations. Instead, in this work, we suggest +using discrete noise for the forward Markov process. This ensures that in every +intermediate step the graph remains discrete. Compared to the previous +approach, our experimental results on four datasets and multiple architectures +show that using a discrete noising process results in higher quality generated +samples indicated with an average MMDs reduced by a factor of 1.5. Furthermore, +the number of denoising steps is reduced from 1000 to 32 steps, leading to a 30 +times faster sampling procedure. + +
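A discrete forward process for graphs can be as simple as independently flipping edges with a per-step probability, which keeps every intermediate graph binary. The snippet below is a minimal sketch of that idea (a symmetric Bernoulli flip of a dense adjacency matrix), not the noise schedule or transition matrices used in the paper.

```python
import torch

def discrete_forward_step(adj, beta_t):
    """One step of a discrete forward process: flip each (undirected) edge or
    non-edge independently with probability beta_t, so the intermediate graph
    stays binary. `adj` is a dense {0, 1} adjacency matrix."""
    flip = (torch.rand_like(adj.float()) < beta_t).float()
    flip = torch.triu(flip, diagonal=1)
    flip = flip + flip.t()                 # keep the graph symmetric, no self-loops
    return (adj.float() + flip) % 2        # XOR with the flip mask
```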
+
+ comment: Presented at the First Learning on Graphs Conference (LoG 2022) and + the NeurIPS 2022 New Frontiers in Graph Learning Workshop (NeurIPS + GLFrontiers 2022) +
+
+
+
+
+ + ♻ ☆ Time-Synchronized Full System State Estimation Considering Practical + Implementation Challenges + + +
+ As phasor measurement units (PMUs) are usually placed on the highest voltage +buses, many lower voltage levels of the bulk power system are not observed by +them. This lack of visibility makes time-synchronized state estimation of the +full system a challenging problem. We propose a Deep Neural network-based State +Estimator (DeNSE) to overcome this problem. The DeNSE employs a Bayesian +framework to indirectly combine inferences drawn from slow timescale but +widespread supervisory control and data acquisition (SCADA) data with fast +timescale but local PMU data to attain sub-second situational awareness of the +entire system. The practical utility of the proposed approach is demonstrated +by considering topology changes, non-Gaussian measurement noise, and bad data +detection and correction. The results obtained using the IEEE 118-bus system +show the superiority of the DeNSE over a purely SCADA state estimator, a +SCADA-PMU hybrid state estimator, and a PMU-only linear state estimator from a +techno-economic viability perspective. Lastly, the scalability of the DeNSE is +proven by performing state estimation on a large and realistic 2000-bus +Synthetic Texas system. + +
+
+
+
+
+ + ♻ ☆ PeRP: Personalized Residual Policies For Congestion Mitigation Through + Co-operative Advisory Systems SC 2023 + + +
+ Intelligent driving systems can be used to mitigate congestion through simple +actions, thus improving many socioeconomic factors such as commute time and gas +costs. However, these systems assume precise control over autonomous vehicle +fleets, and are hence limited in practice as they fail to account for +uncertainty in human behavior. Piecewise Constant (PC) Policies address these +issues by structurally modeling the likeness of human driving to reduce traffic +congestion in dense scenarios to provide action advice to be followed by human +drivers. However, PC policies assume that all drivers behave similarly. To this +end, we develop a co-operative advisory system based on PC policies with a +novel driver trait conditioned Personalized Residual Policy, PeRP. PeRP advises +drivers to behave in ways that mitigate traffic congestion. We first infer the +driver's intrinsic traits on how they follow instructions in an unsupervised +manner with a variational autoencoder. Then, a policy conditioned on the +inferred trait adapts the action of the PC policy to provide the driver with a +personalized recommendation. Our system is trained in simulation with novel +driver modeling of instruction adherence. We show that our approach +successfully mitigates congestion while adapting to different driver behaviors, +with 4 to 22% improvement in average speed over baselines. + +
+
+ comment: Accepted to ITSC 2023. Additional material and code is available at + the project webpage: https://sites.google.com/illinois.edu/perp +
+
+
+
+
+ + ♻ ☆ Federated Learning with Server Learning: Enhancing Performance for + Non-IID Data + + +
+ Federated Learning (FL) has emerged as a means of distributed learning using +local data stored at clients with a coordinating server. Recent studies showed +that FL can suffer from poor performance and slower convergence when training +data at clients are not independent and identically distributed. Here we +consider a new complementary approach to mitigating this performance +degradation by allowing the server to perform auxiliary learning from a small +dataset. Our analysis and experiments show that this new approach can achieve +significant improvements in both model accuracy and convergence time even when +the server dataset is small and its distribution differs from that of the +aggregated data from all clients. + +
+
+ comment: 22 pages, 11 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ DuETT: Dual Event Time Transformer for Electronic Health Records + + +
+ Electronic health records (EHRs) recorded in hospital settings typically +contain a wide range of numeric time series data that is characterized by high +sparsity and irregular observations. Effective modelling for such data must +exploit its time series nature, the semantic relationship between different +types of observations, and information in the sparsity structure of the data. +Self-supervised Transformers have shown outstanding performance in a variety of +structured tasks in NLP and computer vision. But multivariate time series data +contains structured relationships over two dimensions: time and recorded event +type, and straightforward applications of Transformers to time series data do +not leverage this distinct structure. The quadratic scaling of self-attention +layers can also significantly limit the input sequence length without +appropriate input engineering. We introduce the DuETT architecture, an +extension of Transformers designed to attend over both time and event type +dimensions, yielding robust representations from EHR data. DuETT uses an +aggregated input where sparse time series are transformed into a regular +sequence with fixed length; this lowers the computational complexity relative +to previous EHR Transformer models and, more importantly, enables the use of +larger and deeper neural networks. When trained with self-supervised prediction +tasks, that provide rich and informative signals for model pre-training, our +model outperforms state-of-the-art deep learning models on multiple downstream +tasks from the MIMIC-IV and PhysioNet-2012 EHR datasets. + +
+
+ comment: Accepted at MLHC 2023, camera-ready version +
+
+
+
+
+ + ♻ ☆ ERM++: An Improved Baseline for Domain Generalization + + +
+ Multi-source Domain Generalization (DG) measures a classifier's ability to +generalize to new distributions of data it was not trained on, given several +training domains. While several multi-source DG methods have been proposed, +they incur additional complexity during training by using domain labels. Recent +work has shown that a well-tuned Empirical Risk Minimization (ERM) training +procedure, that is simply minimizing the empirical risk on the source domains, +can outperform most existing DG methods. We identify several key candidate +techniques to further improve ERM performance, such as better utilization of +training data, model parameter selection, and weight-space regularization. We +call the resulting method ERM++, and show it significantly improves the +performance of DG on five multi-source datasets by over 5% compared to standard +ERM, and beats state-of-the-art despite being less computationally expensive. +Additionally, we demonstrate the efficacy of ERM++ on the WILDS-FMOW dataset, a +challenging DG benchmark. We hope that ERM++ becomes a strong baseline for +future DG research. Code is released at +https://github.com/piotr-teterwak/erm_plusplus. + +
+
+ comment: An improved baseline for Domain Generalization +
+
+
+
+
+ + ♻ ☆ Is My Prediction Arbitrary? The Confounding Effects of Variance in Fair + Classification Benchmarks + + +
+ Variance in predictions across different trained models is a significant, +under-explored source of error in fair classification. In practice, the +variance on some data examples is so large that decisions can be effectively +arbitrary. To investigate this problem, we take an experimental approach and +make four overarching contributions: We 1) Define a metric called +self-consistency, derived from variance, which we use as a proxy for measuring +and reducing arbitrariness; 2) Develop an ensembling algorithm that abstains +from classification when a prediction would be arbitrary; 3) Conduct the +largest to-date empirical study of the role of variance (vis-a-vis +self-consistency and arbitrariness) in fair classification; and, 4) Release a +toolkit that makes the US Home Mortgage Disclosure Act (HMDA) datasets easily +usable for future research. Altogether, our experiments reveal shocking +insights about the reliability of conclusions on benchmark datasets. Most +fairness classification benchmarks are close-to-fair when taking into account +the amount of arbitrariness present in predictions -- before we even try to +apply common fairness interventions. This finding calls into question the +practical utility of common algorithmic fairness methods, and in turn suggests +that we should fundamentally reconsider how we choose to measure fairness in +machine learning. + +
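A variance-derived self-consistency score and the corresponding abstaining ensemble can be sketched directly from the description above. The pairwise-agreement definition and the 0.8 abstention threshold below are illustrative guesses, not the paper's exact metric or toolkit.

```python
import numpy as np

def self_consistency(votes):
    """votes: (num_models, num_examples) integer class predictions from
    independently trained models. Returns, per example, the fraction of model
    pairs that agree, used here as a proxy for the absence of arbitrariness."""
    m = votes.shape[0]
    agree = np.zeros(votes.shape[1])
    for i in range(m):
        for j in range(i + 1, m):
            agree += (votes[i] == votes[j])
    return agree / (m * (m - 1) / 2)

def predict_or_abstain(votes, threshold=0.8):
    """Majority vote, but abstain (return -1) when self-consistency is low."""
    sc = self_consistency(votes)
    majority = np.apply_along_axis(lambda c: np.bincount(c).argmax(), 0, votes)
    return np.where(sc >= threshold, majority, -1)
```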
+
+
+
+
+ + ♻ ☆ LCE: An Augmented Combination of Bagging and Boosting in Python + + +
+ lcensemble is a high-performing, scalable and user-friendly Python package +for the general tasks of classification and regression. The package implements +Local Cascade Ensemble (LCE), a machine learning method that further enhances +the prediction performance of the current state-of-the-art methods Random +Forest and XGBoost. LCE combines their strengths and adopts a complementary +diversification approach to obtain a better generalizing predictor. The package +is compatible with scikit-learn, therefore it can interact with scikit-learn +pipelines and model selection tools. It is distributed under the Apache 2.0 +license, and its source code is available at +https://github.com/LocalCascadeEnsemble/LCE. + +
+
+
+
+
+ + ♻ ☆ SMGRL: Scalable Multi-resolution Graph Representation Learning + + +
+ Graph convolutional networks (GCNs) allow us to learn topologically-aware +node embeddings, which can be useful for classification or link prediction. +However, they are unable to capture long-range dependencies between nodes +without adding additional layers -- which in turn leads to over-smoothing and +increased time and space complexity. Further, the complex dependencies between +nodes make mini-batching challenging, limiting their applicability to large +graphs. We propose a Scalable Multi-resolution Graph Representation Learning +(SMGRL) framework that enables us to learn multi-resolution node embeddings +efficiently. Our framework is model-agnostic and can be applied to any existing +GCN model. We dramatically reduce training costs by training only on a +reduced-dimension coarsening of the original graph, then exploit +self-similarity to apply the resulting algorithm at multiple resolutions. The +resulting multi-resolution embeddings can be aggregated to yield high-quality +node embeddings that capture both long- and short-range dependencies. Our +experiments show that this leads to improved classification accuracy, without +incurring high computational costs. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ Pseudo Supervised Metrics: Evaluating Unsupervised Image to Image + Translation Models In Unsupervised Cross-Domain Classification Frameworks + + +
+ The ability to classify images accurately and efficiently is dependent on +having access to large labeled datasets and testing on data from the same +domain that the model is trained on. Classification becomes more challenging +when dealing with new data from a different domain, where collecting a large +labeled dataset and training a new classifier from scratch is time-consuming, +expensive, and sometimes infeasible or impossible. Cross-domain classification +frameworks were developed to handle this data domain shift problem by utilizing +unsupervised image-to-image (UI2I) translation models to translate an input +image from the unlabeled domain to the labeled domain. The problem with these +unsupervised models lies in their unsupervised nature. For lack of annotations, +it is not possible to use the traditional supervised metrics to evaluate these +translation models to pick the best-saved checkpoint model. In this paper, we +introduce a new method called Pseudo Supervised Metrics that was designed +specifically to support cross-domain classification applications contrary to +other typically used metrics such as the FID which was designed to evaluate the +model in terms of the quality of the generated image from a human-eye +perspective. We show that our metric not only outperforms unsupervised metrics +such as the FID, but is also highly correlated with the true supervised +metrics, robust, and explainable. Furthermore, we demonstrate that it can be +used as a standard metric for future research in this field by applying it to a +critical real-world problem (the boiling crisis problem). + +
+
+ comment: arXiv admin note: text overlap with arXiv:2212.09107 +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Dynamic Low-Rank Instance Adaptation for Universal Neural Image + Compression ACM MM 2023 + + +
+ The latest advancements in neural image compression show great potential in +surpassing the rate-distortion performance of conventional standard codecs. +Nevertheless, there exists an indelible domain gap between the datasets +utilized for training (i.e., natural images) and those utilized for inference +(e.g., artistic images). Our proposal involves a low-rank adaptation approach +aimed at addressing the rate-distortion drop observed in out-of-domain +datasets. Specifically, we perform low-rank matrix decomposition to update +certain adaptation parameters of the client's decoder. These updated +parameters, along with image latents, are encoded into a bitstream and +transmitted to the decoder in practical scenarios. Due to the low-rank +constraint imposed on the adaptation parameters, the resulting bit rate +overhead is small. Furthermore, the bit rate allocation of low-rank adaptation +is \emph{non-trivial}, considering the diverse inputs require varying +adaptation bitstreams. We thus introduce a dynamic gating network on top of the +low-rank adaptation method, in order to decide which decoder layer should +employ adaptation. The dynamic adaptation network is optimized end-to-end using +rate-distortion loss. Our proposed method exhibits universality across diverse +image datasets. Extensive results demonstrate that this paradigm significantly +mitigates the domain gap, surpassing non-adaptive methods with an average +BD-rate improvement of approximately $19\%$ across out-of-domain images. +Furthermore, it outperforms the most advanced instance adaptive methods by +roughly $5\%$ BD-rate. Ablation studies confirm our method's ability to +universally enhance various image compression architectures. + +
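The per-instance low-rank update of a decoder layer resembles a LoRA-style decomposition that can be gated on or off per layer. The sketch below illustrates that pattern for a linear layer (frozen base weight plus a rank-r update, with a scalar gate standing in for the dynamic gating network); it is not the paper's actual codec code, and all sizes are placeholders.

```python
import torch
import torch.nn as nn

class LowRankAdaptedLayer(nn.Module):
    """A frozen decoder layer plus a per-instance low-rank update A @ B.
    Only A and B would be overfitted to the input and signalled in the
    bitstream; `gate` stands in for a (hypothetical) per-layer gating decision."""
    def __init__(self, base: nn.Linear, rank=4):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)                       # keep the base frozen
        self.A = nn.Parameter(torch.zeros(base.out_features, rank))
        self.B = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)

    def forward(self, x, gate: float = 1.0):
        # Low-rank residual starts at zero because A is initialized to zero.
        return self.base(x) + gate * (x @ self.B.t() @ self.A.t())
```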
+
+ comment: Accepted by ACM MM 2023, 13 pages, 12 figures +
+
+
+
+
+ + ☆ EMID: An Emotional Aligned Dataset in Audio-Visual Modality + + +
+ In this paper, we propose Emotionally paired Music and Image Dataset (EMID), +a novel dataset designed for the emotional matching of music and images, to +facilitate auditory-visual cross-modal tasks such as generation and retrieval. +Unlike existing approaches that primarily focus on semantic correlations or +roughly divided emotional relations, EMID emphasizes the significance of +emotional consistency between music and images using an advanced 13-dimension +emotional model. By incorporating emotional alignment into the dataset, it aims +to establish pairs that closely align with human perceptual understanding, +thereby raising the performance of auditory-visual cross-modal tasks. We also +design a supplemental module named EMI-Adapter to optimize existing cross-modal +alignment methods. To validate the effectiveness of the EMID, we conduct a +psychological experiment, which has demonstrated that considering the emotional +relationship between the two modalities effectively improves the accuracy of +matching in abstract perspective. This research lays the foundation for future +cross-modal research in domains such as psychotherapy and contributes to +advancing the understanding and utilization of emotions in cross-modal +alignment. The EMID dataset is available at https://github.com/ecnu-aigc/EMID. + +
+
+
+
+
+ + ☆ SGDiff: A Style Guided Diffusion Model for Fashion Synthesis ACM MM'23 + + +
+ This paper reports on the development of \textbf{a novel style guided +diffusion model (SGDiff)} which overcomes certain weaknesses inherent in +existing models for image synthesis. The proposed SGDiff combines image +modality with a pretrained text-to-image diffusion model to facilitate creative +fashion image synthesis. It addresses the limitations of text-to-image +diffusion models by incorporating supplementary style guidance, substantially +reducing training costs, and overcoming the difficulties of controlling +synthesized styles with text-only inputs. This paper also introduces a new +dataset -- SG-Fashion, specifically designed for fashion image synthesis +applications, offering high-resolution images and an extensive range of garment +categories. By means of comprehensive ablation study, we examine the +application of classifier-free guidance to a variety of conditions and validate +the effectiveness of the proposed model for generating fashion images of the +desired categories, product attributes, and styles. The contributions of this +paper include a novel classifier-free guidance method for multi-modal feature +fusion, a comprehensive dataset for fashion image synthesis application, a +thorough investigation on conditioned text-to-image synthesis, and valuable +insights for future research in the text-to-image synthesis domain. The code +and dataset are available at: \url{https://github.com/taited/SGDiff}. + +
+
+ comment: Accepted by ACM MM'23 +
+
+
+
+
+ + ☆ AKVSR: Audio Knowledge Empowered Visual Speech Recognition by + Compressing Audio Knowledge of a Pretrained Model + + +
+ Visual Speech Recognition (VSR) is the task of predicting spoken words from +silent lip movements. VSR is regarded as a challenging task because of the +insufficient information on lip movements. In this paper, we propose an Audio +Knowledge empowered Visual Speech Recognition framework (AKVSR) to complement +the insufficient speech information of visual modality by using audio modality. +Different from the previous methods, the proposed AKVSR 1) utilizes rich audio +knowledge encoded by a large-scale pretrained audio model, 2) saves the +linguistic information of audio knowledge in compact audio memory by discarding +the non-linguistic information from the audio through quantization, and 3) +includes Audio Bridging Module which can find the best-matched audio features +from the compact audio memory, which makes our training possible without audio +inputs, once after the compact audio memory is composed. We validate the +effectiveness of the proposed method through extensive experiments, and achieve +new state-of-the-art performances on the widely-used datasets, LRS2 and LRS3. + +
+
+
+
+
+ + ☆ Understanding User Behavior in Volumetric Video Watching: Dataset, + Analysis and Prediction + + +
+ Volumetric video has emerged as an attractive new video paradigm in recent years +since it provides an immersive and interactive 3D viewing experience with six +degrees of freedom (DoF). Unlike traditional 2D or panoramic videos, volumetric +videos require dense point clouds, voxels, meshes, or huge neural models to +depict volumetric scenes, which results in a prohibitively high bandwidth +burden for video delivery. Users' behavior analysis, especially the viewport +and gaze analysis, then plays a significant role in prioritizing the content +streaming within users' viewport and degrading the remaining content to +maximize user QoE with limited bandwidth. Although understanding user behavior +is crucial, to the best of our knowledge, there are no available 3D +volumetric video viewing datasets containing fine-grained user interactivity +features, not to mention further analysis and behavior prediction. In this +paper, we release, for the first time, a volumetric video viewing behavior +dataset with large scale, multiple dimensions, and diverse conditions. We +conduct an in-depth analysis to understand user behaviors when viewing +volumetric videos. Interesting findings on user viewport, gaze, and motion +preference related to different videos and users are revealed. We finally +design a transformer-based viewport prediction model that fuses the features of +both gaze and motion, which is able to achieve high accuracy under various +conditions. Our prediction model is expected to further benefit volumetric +video streaming optimization. Our dataset, along with the corresponding +visualization tools, is accessible at +https://cuhksz-inml.github.io/user-behavior-in-vv-watching/ + 
+
+
+
+
+ + ☆ Introducing a New Evaluation Criteria for EMD-Base Steganography Method + + +
+ Steganography is a technique for hiding the presence of secret communication. +It can be used when one of the communication elements is under the control of +an adversary. The main measure for evaluating steganography methods at a given +capacity is security. Therefore, at a given capacity, reducing the amount of +change in the cover media yields higher embedding efficiency and thus greater +security for a steganography method. Usually, security and capacity are in +conflict with each other: increasing one leads to decreasing the other. A +single criterion that represents security and capacity at the same time would +be useful for comparing steganography methods. EMD and related methods are a +group of steganography techniques that optimize the amount of change resulting +from embedding (security). The present paper aims to provide an evaluation +criterion for this group of steganography methods. In this study, after a +general review and comparison of EMD-based steganography techniques, we present +a method for comparing them precisely from the perspective of embedding +efficiency. First, a formula is presented to determine the embedding +efficiency, which captures the effect of one or more changes on one or more +pixels. The results demonstrate that the proposed embedding efficiency formula +reflects the performance of the methods better than the existing criteria when +several changes are made to a single pixel. In the second step, we obtain an +upper bound that determines the best efficiency for each given capacity. +Finally, based on the introduced bound, another evaluation criterion for a +better comparison of the methods is presented. + 
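+ A minimal illustration of the baseline quantity being refined here, under the
+assumption of the classical EMD scheme (one (2n+1)-ary digit embedded per group
+of n pixels, at most one pixel changed): embedding efficiency as bits embedded
+per expected pixel change. The paper's own per-change formula and upper bound
+are not reproduced.
+    import math
+
+    def emd_embedding_efficiency(n: int) -> float:
+        # Classical EMD: a group of n pixels carries one digit in base (2n+1),
+        # and at most one pixel is changed by +/-1. A change is needed with
+        # probability 2n/(2n+1), so that is the expected number of changes.
+        capacity_bits = math.log2(2 * n + 1)
+        expected_changes = (2 * n) / (2 * n + 1)
+        return capacity_bits / expected_changes  # bits per changed pixel
+
+    for n in (1, 2, 4):
+        print(n, round(emd_embedding_efficiency(n), 3))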
+
+
+
+
+ + ♻ ☆ SuS-X: Training-Free Name-Only Transfer of Vision-Language Models ICCV2023 + + +
+ Contrastive Language-Image Pre-training (CLIP) has emerged as a simple yet +effective way to train large-scale vision-language models. CLIP demonstrates +impressive zero-shot classification and retrieval on diverse downstream tasks. +However, to leverage its full potential, fine-tuning still appears to be +necessary. Fine-tuning the entire CLIP model can be resource-intensive and +unstable. Moreover, recent methods that aim to circumvent this need for +fine-tuning still require access to images from the target distribution. In +this paper, we pursue a different approach and explore the regime of +training-free "name-only transfer" in which the only knowledge we possess about +the downstream task comprises the names of downstream target categories. We +propose a novel method, SuS-X, consisting of two key building blocks -- SuS and +TIP-X, that requires neither intensive fine-tuning nor costly labelled data. +SuS-X achieves state-of-the-art zero-shot classification results on 19 +benchmark datasets. We further show the utility of TIP-X in the training-free +few-shot setting, where we again achieve state-of-the-art results over strong +training-free baselines. Code is available at +https://github.com/vishaal27/SuS-X. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ Text-to-Audio based Event Detection Towards Intelligent Vehicle Road + Cooperation + + +
+ In this paper, we target the text-to-audio grounding task, namely, +grounding the segments of the sound event described by a natural language query +in the untrimmed audio. This is a newly proposed but challenging audio-language +task, since it requires not only precisely localizing all the on- and offsets +of the desired segments in the audio, but also performing comprehensive acoustic +and linguistic understanding and reasoning about the multimodal interactions between +the audio and query. To tackle this task, existing methods often +holistically treat the query as a single unit via a global query representation. +We argue that this approach suffers from several limitations. Motivated by the +above considerations, we propose a novel Cross-modal Graph Interaction (CGI) +model, which comprehensively models the relations between the +words in a query through a novel language graph. To capture the fine-grained +interactions between the audio and query, a cross-modal attention module is +introduced to assign higher weights to the keywords with more important +semantics and generate the snippet-specific query representations. Furthermore, +we design a cross-gating module to emphasize the crucial parts and weaken the +irrelevant ones in the audio and query. We extensively evaluate the proposed +CGI model on the public Audiogrounding dataset with significant improvements +over several state-of-the-art methods. The ablation study demonstrates the +consistent effectiveness of different modules in our model. + 
+
+ comment: 9 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 45 + +
+
+
+ + ☆ Platypus: Quick, Cheap, and Powerful Refinement of LLMs + + +
+ We present $\textbf{Platypus}$, a family of fine-tuned and merged Large +Language Models (LLMs) that achieves the strongest performance and currently +stands at first place in HuggingFace's Open LLM Leaderboard as of the release +date of this work. In this work we describe (1) our curated dataset +$\textbf{Open-Platypus}$, which is a subset of other open datasets and which +$\textit{we release to the public}$; (2) our process of fine-tuning and merging +LoRA modules in order to conserve the strong prior of pretrained LLMs, while +bringing specific domain knowledge to the surface; and (3) our efforts in checking +for test data leaks and contamination in the training data, which can inform +future research. Specifically, the Platypus family achieves strong performance +in quantitative LLM metrics across model sizes, topping the global Open LLM +leaderboard while using just a fraction of the fine-tuning data and overall +compute that are required for other state-of-the-art fine-tuned LLMs. In +particular, a 13B Platypus model can be trained on $\textit{a single}$ A100 GPU +using 25k questions in 5 hours. This is a testament to the quality of our +Open-Platypus dataset, and opens opportunities for more improvements in the +field. Project page: https://platypus-llm.github.io + 
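+ A minimal sketch of the arithmetic behind merging a LoRA module into a frozen
+base weight, i.e. the standard low-rank update W + (alpha/r) * B A; this is
+illustrative only and is not the Platypus training or merging code.
+    import numpy as np
+
+    def merge_lora(W, A, B, alpha, r):
+        # W: (d_out, d_in) frozen base weight; B: (d_out, r); A: (r, d_in).
+        # The scaled low-rank product is folded back into the base weight.
+        return W + (alpha / r) * (B @ A)
+
+    rng = np.random.default_rng(0)
+    W = rng.normal(size=(8, 16))
+    B = rng.normal(size=(8, 4))
+    A = rng.normal(size=(4, 16))
+    print(merge_lora(W, A, B, alpha=16.0, r=4).shape)  # (8, 16)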
+
+
+
+
+ + ☆ LLM Self Defense: By Self Examination, LLMs Know They Are Being Tricked + + +
+ Large language models (LLMs) have skyrocketed in popularity in recent years +due to their ability to generate high-quality text in response to human +prompting. However, these models have been shown to have the potential to +generate harmful content in response to user prompting (e.g., giving users +instructions on how to commit crimes). There has been a focus in the literature +on mitigating these risks, through methods like aligning models with human +values through reinforcement learning. However, it has been shown that even +aligned language models are susceptible to adversarial attacks that bypass +their restrictions on generating harmful text. We propose a simple approach to +defending against these attacks by having a large language model filter its own +responses. Our current results show that even if a model is not fine-tuned to +be aligned with human values, it is possible to stop it from presenting harmful +content to users by validating the content using a language model. + +
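+ A minimal sketch of the self-filtering idea, assuming a hypothetical
+`generate` callable that wraps whatever LLM is in use; the exact screening
+prompt used by the authors is not reproduced here.
+    from typing import Callable
+
+    def self_defense_filter(generate: Callable[[str], str], user_prompt: str) -> str:
+        # First pass: produce a candidate response as usual.
+        candidate = generate(user_prompt)
+        # Second pass: ask the same model to judge its own output.
+        verdict = generate(
+            "Does the following text contain harmful content? "
+            "Answer 'yes' or 'no'.\n\n" + candidate
+        )
+        if verdict.strip().lower().startswith("yes"):
+            return "I cannot help with that request."
+        return candidate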
+
+
+
+
+ + ☆ Neural Authorship Attribution: Stylometric Analysis on Large Language + Models + + +
+ Large language models (LLMs) such as GPT-4, PaLM, and Llama have +significantly propelled the generation of AI-crafted text. With rising concerns +about their potential misuse, there is a pressing need for AI-generated-text +forensics. Neural authorship attribution is a forensic effort, seeking to trace +AI-generated text back to its originating LLM. The LLM landscape can be divided +into two primary categories: proprietary and open-source. In this work, we +delve into these emerging categories of LLMs, focusing on the nuances of neural +authorship attribution. To enrich our understanding, we carry out an empirical +analysis of LLM writing signatures, highlighting the contrasts between +proprietary and open-source models, and scrutinizing variations within each +group. By integrating stylometric features across lexical, syntactic, and +structural aspects of language, we explore their potential to yield +interpretable results and augment pre-trained language model-based classifiers +utilized in neural authorship attribution. Our findings, based on a range of +state-of-the-art LLMs, provide empirical insights into neural authorship +attribution, paving the way for future investigations aimed at mitigating the +threats posed by AI-generated misinformation. + +
+
+
+
+
+ + ☆ The Devil is in the Errors: Leveraging Large Language Models for + Fine-grained Machine Translation Evaluation + + +
+ Automatic evaluation of machine translation (MT) is a critical tool driving +the rapid iterative development of MT systems. While considerable progress has +been made on estimating a single scalar quality score, current metrics lack the +informativeness of more detailed schemes that annotate individual errors, such +as Multidimensional Quality Metrics (MQM). In this paper, we help fill this gap +by proposing AutoMQM, a prompting technique which leverages the reasoning and +in-context learning capabilities of large language models (LLMs) and asks them +to identify and categorize errors in translations. We start by evaluating +recent LLMs, such as PaLM and PaLM-2, through simple score prediction +prompting, and we study the impact of labeled data through in-context learning +and finetuning. We then evaluate AutoMQM with PaLM-2 models, and we find that +it improves performance compared to just prompting for scores (with +particularly large gains for larger models) while providing interpretability +through error spans that align with human annotations. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Comparison between parameter-efficient techniques and full fine-tuning: + A case study on multilingual news article classification + + +
+ Adapters and Low-Rank Adaptation (LoRA) are parameter-efficient fine-tuning +techniques designed to make the training of language models more efficient. +Previous results demonstrated that these methods can even improve performance +on some classification tasks. This paper complements the existing research by +investigating how these techniques influence the classification performance and +computation costs compared to full fine-tuning when applied to multilingual +text classification tasks (genre, framing, and persuasion techniques detection; +with different input lengths, number of predicted classes and classification +difficulty), some of which have limited training data. In addition, we conduct +in-depth analyses of their efficacy across different training scenarios +(training on the original multilingual data; on the translations into English; +and on a subset of English-only data) and different languages. Our findings +provide valuable insights into the applicability of the parameter-efficient +fine-tuning techniques, particularly to complex multilingual and multilabel +classification tasks. + +
+
+
+
+
+ + ☆ Dialogue for Prompting: a Policy-Gradient-Based Discrete Prompt + Optimization for Few-shot Learning + + +
+ The prompt-based paradigm for pre-trained language models (PLMs) has succeeded +substantially in few-shot natural language processing (NLP) tasks. However, +prior discrete prompt optimization methods require expert knowledge to design +the base prompt set and identify high-quality prompts, which is costly, +inefficient, and subjective. Meanwhile, existing continuous prompt optimization +methods improve the performance by learning the ideal prompts through the +gradient information of PLMs, but their high computational cost and low +readability and generalizability are often concerning. To address the research +gap, we propose a Dialogue-comprised Policy-gradient-based Discrete Prompt +Optimization ($DP_2O$) method. We first design a multi-round dialogue alignment +strategy, based on GPT-4, to generate a readable prompt set. Furthermore, we +propose an efficient prompt screening metric to identify high-quality prompts +with linear complexity. Finally, we construct a reinforcement learning (RL) +framework based on policy gradients to match the prompts to inputs optimally. +By training a policy network with only 0.67% of the PLM parameter size on the +tasks in the few-shot setting, $DP_2O$ outperforms the state-of-the-art (SOTA) +method by 1.52% in accuracy on average on four open-source datasets. Moreover, +subsequent experiments also demonstrate that $DP_2O$ has good universality, +robustness, and generalization ability. + 
+
+
+
+
+ + ☆ EasyEdit: An Easy-to-use Knowledge Editing Framework for Large Language + Models + + +
+ Large Language Models (LLMs) usually suffer from knowledge cutoff or fallacy +issues, which means they are unaware of unseen events or generate text with +incorrect facts owing to outdated/noisy data. To this end, many knowledge +editing approaches for LLMs have emerged -- aiming to subtly inject/edit +updated knowledge or adjust undesired behavior while minimizing the impact on +unrelated inputs. Nevertheless, due to significant differences among various +knowledge editing methods and the variations in task setups, there is no +standard implementation framework available for the community, which hinders +practitioners from applying knowledge editing to applications. To address these +issues, we propose EasyEdit, an easy-to-use knowledge editing framework for +LLMs. It supports various cutting-edge knowledge editing approaches and can be +readily applied to many well-known LLMs such as T5, GPT-J, LlaMA, etc. +Empirically, we report the knowledge editing results on LlaMA-2 with EasyEdit, +demonstrating that knowledge editing surpasses traditional fine-tuning in terms +of reliability and generalization. We have released the source code on GitHub +at https://github.com/zjunlp/EasyEdit, along with Google Colab tutorials and +comprehensive documentation for beginners to get started. Besides, we present +an online system for real-time knowledge editing, and a demo video at +http://knowlm.zjukg.cn/easyedit.mp4. + 
+
+ comment: The project website is https://github.com/zjunlp/EasyEdit +
+
+
+
+
+ + ☆ ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate + + +
+ Text evaluation has historically posed significant challenges, often +demanding substantial labor and time cost. With the emergence of large language +models (LLMs), researchers have explored LLMs' potential as alternatives for +human evaluation. While these single-agent-based approaches show promise, +experimental results suggest that further advancements are needed to bridge the +gap between their current effectiveness and human-level evaluation quality. +Recognizing that best practices of human evaluation processes often involve +multiple human annotators collaborating in the evaluation, we resort to a +multi-agent debate framework, moving beyond single-agent prompting strategies. +The multi-agent-based approach enables a group of LLMs to synergize with an +array of intelligent counterparts, harnessing their distinct capabilities and +expertise to enhance efficiency and effectiveness in handling intricate tasks. +In this paper, we construct a multi-agent referee team called ChatEval to +autonomously discuss and evaluate the quality of generated responses from +different models on open-ended questions and traditional natural language +generation (NLG) tasks. Our analysis shows that ChatEval transcends mere +textual scoring, offering a human-mimicking evaluation process for reliable +assessments. Our code is available at https://github.com/chanchimin/ChatEval. + +
+
+
+
+
+ + ☆ Incorporating Annotator Uncertainty into Representations of Discourse + Relations + + +
+ Annotation of discourse relations is a known difficult task, especially for +non-expert annotators. In this paper, we investigate novice annotators' +uncertainty on the annotation of discourse relations on spoken conversational +data. We find that dialogue context (single turn, pair of turns within speaker, +and pair of turns across speakers) is a significant predictor of confidence +scores. We compute distributed representations of discourse relations from +co-occurrence statistics that incorporate information about confidence scores +and dialogue context. We perform a hierarchical clustering analysis using these +representations and show that weighting discourse relation representations with +information about confidence and dialogue context coherently models our +annotators' uncertainty about discourse relation labels. + +
+
+
+
+
+ + ☆ OctoPack: Instruction Tuning Code Large Language Models + + +
+ Finetuning large language models (LLMs) on instructions leads to vast +performance improvements on natural language tasks. We apply instruction tuning +using code, leveraging the natural structure of Git commits, which pair code +changes with human instructions. We compile CommitPack: 4 terabytes of Git +commits across 350 programming languages. We benchmark CommitPack against other +natural and synthetic code instructions (xP3x, Self-Instruct, OASST) on the 16B +parameter StarCoder model, and achieve state-of-the-art performance among +models not trained on OpenAI outputs, on the HumanEval Python benchmark (46.2% +pass@1). We further introduce HumanEvalPack, expanding the HumanEval benchmark +to a total of 3 coding tasks (Code Repair, Code Explanation, Code Synthesis) +across 6 languages (Python, JavaScript, Java, Go, C++, Rust). Our models, +OctoCoder and OctoGeeX, achieve the best performance across HumanEvalPack among +all permissive models, demonstrating CommitPack's benefits in generalizing to a +wider set of languages and natural coding tasks. Code, models and data are +freely available at https://github.com/bigcode-project/octopack. + +
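+ A minimal sketch of how a Git commit can be recast as an instruction-tuning
+example, pairing the commit message with the code before and after the change;
+the field names are illustrative and do not reflect the released CommitPack
+schema.
+    def commit_to_example(message: str, old_code: str, new_code: str) -> dict:
+        # The commit message is the instruction, the pre-change code the input,
+        # and the post-change code the target output.
+        return {"instruction": message.strip(), "input": old_code, "output": new_code}
+
+    example = commit_to_example(
+        "Fix off-by-one error in pagination",
+        "start = page * size + 1",
+        "start = page * size",
+    )
+    print(example["instruction"])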
+
+ comment: 57 pages (9 main), 39 figures, 16 tables +
+
+
+
+
+ + ☆ Natural Language is All a Graph Needs + + +
+ The emergence of large-scale pre-trained language models, such as ChatGPT, +has revolutionized various research fields in artificial intelligence. +Transformer-based large language models (LLMs) have gradually replaced CNNs +and RNNs to unify the fields of computer vision and natural language processing. +Compared with data that exist relatively independently, such as images, +videos, or texts, graphs contain rich structural and +relational information. Meanwhile, natural language, as one of the most +expressive mediums, excels in describing complex structures. However, existing +work on incorporating graph learning problems into the generative language +modeling framework remains very limited. As the importance of language models +continues to grow, it becomes essential to explore whether LLMs can also +replace GNNs as the foundational model for graphs. In this paper, we propose +InstructGLM (Instruction-finetuned Graph Language Model), systematically design +highly scalable prompts based on natural language instructions, and use natural +language to describe the geometric structure and node features of the graph for +instruction-tuning an LLM to perform learning and inference on graphs in a +generative manner. Our method exceeds all competitive GNN baselines on +ogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of +our method and sheds light on generative language models replacing GNNs as the +foundation model for graph machine learning. + 
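+ A minimal sketch of the general idea of describing a node's local structure and
+features in natural language so that an LLM can be instruction-tuned for node
+classification; the template is illustrative, not the prompts used in the paper.
+    def describe_node(node: str, features: str, neighbors: dict) -> str:
+        # neighbors maps each adjacent node id to a short feature description.
+        lines = [f"Node {node} has the following features: {features}."]
+        for nb, nb_feat in neighbors.items():
+            lines.append(f"Node {node} is connected to node {nb}, which has: {nb_feat}.")
+        lines.append(f"Question: which category does node {node} belong to?")
+        return "\n".join(lines)
+
+    print(describe_node("17", "an arXiv title and abstract",
+                        {"4": "a related cs.LG paper", "9": "a related cs.CL paper"}))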
+
+ comment: 21 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ Mind your Language (Model): Fact-Checking LLMs and their Role in NLP + Research and Practice + + +
+ Much of the recent discourse within the NLP research community has been +centered around Large Language Models (LLMs), their functionality and potential +-- yet not only do we not have a working definition of LLMs, but much of this +discourse relies on claims and assumptions that are worth re-examining. This +position paper contributes a definition of LLMs, explicates some of the +assumptions made regarding their functionality, and outlines the existing +evidence for and against them. We conclude with suggestions for research +directions and their framing in future work. + +
+
+
+
+
+ + ☆ Large Language Models for Information Retrieval: A Survey + + +
+ As a primary means of information acquisition, information retrieval (IR) +systems, such as search engines, have integrated themselves into our daily +lives. These systems also serve as components of dialogue, question-answering, +and recommender systems. The trajectory of IR has evolved dynamically from its +origins in term-based methods to its integration with advanced neural models. +While the neural models excel at capturing complex contextual signals and +semantic nuances, thereby reshaping the IR landscape, they still face +challenges such as data scarcity, interpretability, and the generation of +contextually plausible yet potentially inaccurate responses. This evolution +requires a combination of both traditional methods (such as term-based sparse +retrieval methods with rapid response) and modern neural architectures (such as +language models with powerful language understanding capacity). Meanwhile, the +emergence of large language models (LLMs), typified by ChatGPT and GPT-4, has +revolutionized natural language processing due to their remarkable language +understanding, generation, generalization, and reasoning abilities. +Consequently, recent research has sought to leverage LLMs to improve IR +systems. Given the rapid evolution of this research trajectory, it is necessary +to consolidate existing methodologies and provide nuanced insights through a +comprehensive overview. In this survey, we delve into the confluence of LLMs +and IR systems, including crucial aspects such as query rewriters, retrievers, +rerankers, and readers. Additionally, we explore promising directions within +this expanding field. + +
+
+
+
+
+ + ☆ Temporal Sentence Grounding in Streaming Videos ACM MM 2023 + + +
+ This paper aims to tackle a novel task - Temporal Sentence Grounding in +Streaming Videos (TSGSV). The goal of TSGSV is to evaluate the relevance +between a video stream and a given sentence query. Unlike regular videos, +streaming videos are acquired continuously from a particular source, and are +always desired to be processed on-the-fly in many applications such as +surveillance and live-stream analysis. Thus, TSGSV is challenging since it +requires the model to infer without future frames and process long historical +frames effectively, which is untouched in the early methods. To specifically +address the above challenges, we propose two novel methods: (1) a TwinNet +structure that enables the model to learn about upcoming events; and (2) a +language-guided feature compressor that eliminates redundant visual frames and +reinforces the frames that are relevant to the query. We conduct extensive +experiments using ActivityNet Captions, TACoS, and MAD datasets. The results +demonstrate the superiority of our proposed methods. A systematic ablation +study also confirms their effectiveness. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ Aesthetics of Sanskrit Poetry from the Perspective of Computational + Linguistics: A Case Study Analysis on Siksastaka + + +
+ Sanskrit poetry has played a significant role in shaping the literary and +cultural landscape of the Indian subcontinent for centuries. However, not much +attention has been devoted to uncovering the hidden beauty of Sanskrit poetry +in computational linguistics. This article explores the intersection of +Sanskrit poetry and computational linguistics by proposing a roadmap of an +interpretable framework to analyze and classify the qualities and +characteristics of fine Sanskrit poetry. We discuss the rich tradition of +Sanskrit poetry and the significance of computational linguistics in +automatically identifying the characteristics of fine poetry. The proposed +framework involves a human-in-the-loop approach that combines deterministic +aspects delegated to machines and deep semantics left to human experts. We +provide a deep analysis of Siksastaka, a Sanskrit poem, from the perspective of +6 prominent kavyashastra schools, to illustrate the proposed framework. +Additionally, we provide compound, dependency, anvaya (prose order linearised +form), meter, rasa (mood), alankar (figure of speech), and riti (writing style) +annotations for Siksastaka and a web application to illustrate the poem's +analysis and annotations. Our key contributions include the proposed framework, +the analysis of Siksastaka, the annotations and the web application for future +research. Link for interactive analysis: +https://sanskritshala.github.io/shikshastakam/ + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ #InsTag: Instruction Tagging for Diversity and Complexity Analysis + + +
+ Foundation language models obtain the instruction-following ability through +supervised fine-tuning (SFT). Diversity and complexity are considered critical +factors of a successful SFT dataset, while their definitions remain obscure and +lack quantitative analyses. In this work, we propose InsTag, an open-set +fine-grained tagger, to tag samples within SFT datasets based on semantics and +intentions and define instruction diversity and complexity regarding tags. We +obtain 6.6K tags to describe comprehensive user queries. Then we analyze +popular open-sourced SFT datasets and find that the model ability grows with +more diverse and complex data. Based on this observation, we propose a data +selector based on InsTag to select 6K diverse and complex samples from +open-source datasets and fine-tune models on InsTag-selected data. The +resulting models, TagLM, outperform open-source models based on considerably +larger SFT data evaluated by MT-Bench, echoing the importance of query +diversity and complexity. We open-source InsTag in +https://github.com/OFA-Sys/InsTag. + +
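+ A minimal sketch of a tag-based selector in the spirit described above: greedily
+keep samples that introduce unseen tags (diversity), preferring samples with more
+tags (a complexity proxy); this is a simplification, not the InsTag selector.
+    def select_samples(tagged, budget):
+        # tagged: list of (sample_id, set_of_tags) pairs.
+        chosen, seen = [], set()
+        for sid, tags in sorted(tagged, key=lambda x: -len(x[1])):
+            if len(chosen) >= budget:
+                break
+            if tags - seen:  # contributes at least one tag not yet covered
+                chosen.append(sid)
+                seen |= tags
+        return chosen
+
+    data = [("a", {"math", "proof"}), ("b", {"math"}), ("c", {"coding", "regex"})]
+    print(select_samples(data, budget=2))  # ['a', 'c']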
+
+
+
+
+ + ☆ Can Knowledge Graphs Simplify Text? CIKM 2023 + + +
+ Knowledge Graph (KG)-to-Text Generation has seen recent improvements in +generating fluent and informative sentences which describe a given KG. As KGs +are widespread across multiple domains and contain important entity-relation +information, and as text simplification aims to reduce the complexity of a text +while preserving the meaning of the original text, we propose KGSimple, a novel +approach to unsupervised text simplification which infuses KG-established +techniques in order to construct a simplified KG path and generate a concise +text which preserves the original input's meaning. Through an iterative and +sampling KG-first approach, our model is capable of simplifying text when +starting from a KG by learning to keep important information while harnessing +KG-to-text generation to output fluent and descriptive sentences. We evaluate +various settings of the KGSimple model on currently-available KG-to-text +datasets, demonstrating its effectiveness compared to unsupervised text +simplification models which start with a given complex text. Our code is +available on GitHub. + +
+
+ comment: Accepted as a Main Conference Long Paper at CIKM 2023 +
+
+
+
+
+ + ☆ EcomGPT: Instruction-tuning Large Language Model with Chain-of-Task + Tasks for E-commerce + + +
+ Recently, instruction-following Large Language Models (LLMs), represented by +ChatGPT, have exhibited exceptional performance in general Natural Language +Processing (NLP) tasks. However, the unique characteristics of E-commerce data +pose significant challenges to general LLMs. An LLM tailored specifically for +E-commerce scenarios, possessing robust cross-dataset/task generalization +capabilities, is a pressing necessity. To solve this issue, in this work, we +propose the first e-commerce instruction dataset, EcomInstruct, with a total of +2.5 million instruction examples. EcomInstruct scales up the data size and task +diversity by constructing atomic tasks with E-commerce basic data types, such +as product information and user reviews. Atomic tasks are defined as intermediate +tasks implicitly involved in solving a final task, which we also call +Chain-of-Task tasks. We developed EcomGPT with different parameter scales by +training the backbone model BLOOMZ with EcomInstruct. Benefiting from the +fundamental semantic understanding capabilities acquired from the Chain-of-Task +tasks, EcomGPT exhibits excellent zero-shot generalization capabilities. +Extensive experiments and human evaluations demonstrate that EcomGPT +outperforms ChatGPT in terms of cross-dataset/task generalization on E-commerce +tasks. + 
+
+ comment: Initial version of EcomGPT +
+
+
+
+
+ + ☆ Thresh: A Unified, Customizable and Deployable Platform for Fine-Grained + Text Evaluation + + +
+ Fine-grained, span-level human evaluation has emerged as a reliable and +robust method for evaluating text generation tasks such as summarization, +simplification, machine translation and news generation, and the derived +annotations have been useful for training automatic metrics and improving +language models. However, existing annotation tools implemented for these +evaluation frameworks lack the adaptability to be extended to different domains +or languages, or modify annotation settings according to user needs. And the +absence of a unified annotated data format inhibits the research in multi-task +learning. In this paper, we introduce Thresh, a unified, customizable and +deployable platform for fine-grained evaluation. By simply creating a YAML +configuration file, users can build and test an annotation interface for any +framework within minutes -- all in one web browser window. To facilitate +collaboration and sharing, Thresh provides a community hub that hosts a +collection of fine-grained frameworks and corresponding annotations made and +collected by the community, covering a wide range of NLP tasks. For deployment, +Thresh offers multiple options for any scale of annotation projects from small +manual inspections to large crowdsourcing ones. Additionally, we introduce a +Python library to streamline the entire process from typology design and +deployment to annotation processing. Thresh is publicly accessible at +https://thresh.tools. + +
+
+
+
+
+ + ☆ Approximating Human-Like Few-shot Learning with GPT-based Compression + + +
+ In this work, we conceptualize the learning process as information +compression. We seek to equip generative pre-trained models with human-like +learning capabilities that enable data compression during inference. We present +a novel approach that utilizes the Generative Pre-trained Transformer (GPT) to +approximate Kolmogorov complexity, with the aim of estimating the optimal +Information Distance for few-shot learning. We first propose using GPT as a +prior for lossless text compression, achieving a noteworthy compression ratio. +An experiment with the LLAMA2-7B backbone achieves a compression ratio of 15.5 on +enwik9. We justify the pre-training objective of GPT models by demonstrating +its equivalence to the compression length, and, consequently, its ability to +approximate the information distance for texts. Leveraging the approximated +information distance, our method allows the direct application of GPT models in +quantitative text similarity measurements. Experimental results show that our +method overall achieves superior performance compared to embedding and prompt +baselines on challenging NLP tasks, including semantic similarity, zero- and +one-shot text classification, and zero-shot text ranking. + 
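+ A minimal sketch of compression-based similarity with gzip standing in for the
+GPT-derived compression length; the paper instead uses the model's code length
+(negative log-likelihood) to approximate the information distance.
+    import gzip
+
+    def clen(s: str) -> int:
+        return len(gzip.compress(s.encode("utf-8")))
+
+    def ncd(a: str, b: str) -> float:
+        # Normalized Compression Distance: smaller means more similar.
+        return (clen(a + b) - min(clen(a), clen(b))) / max(clen(a), clen(b))
+
+    query = "the cat sat on the mat"
+    docs = ["a cat is sitting on a mat", "stock prices fell sharply today"]
+    print(min(docs, key=lambda d: ncd(query, d)))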
+
+
+
+
+ + ☆ CausalLM is not optimal for in-context learning + + +
+ Recent empirical evidence indicates that transformer-based in-context +learning performs better when using a prefix language model (prefixLM), in +which in-context samples can all attend to each other, compared to causal +language models (causalLM), which use auto-regressive attention that prohibits +in-context samples from attending to future samples. While this result is intuitive, +it is not understood from a theoretical perspective. In this paper we take a +theoretical approach and analyze the convergence behavior of prefixLM and +causalLM under a certain parameter construction. Our analysis shows that both +LM types converge to their stationary points at a linear rate, but that while +prefixLM converges to the optimal solution of linear regression, causalLM +convergence dynamics follows that of an online gradient descent algorithm, +which is not guaranteed to be optimal even as the number of samples grows +infinitely. We supplement our theoretical claims with empirical experiments +over synthetic and real tasks and using various types of transformers. Our +experiments verify that causalLM consistently underperforms prefixLM in all +settings. + 
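+ A minimal sketch of the two attention patterns being contrasted: a causal mask,
+where every position attends only to earlier positions, and a prefix-LM mask,
+where the in-context prefix attends bidirectionally while the rest stays causal.
+    import numpy as np
+
+    def causal_mask(T: int) -> np.ndarray:
+        # True where attention is allowed: position i sees positions <= i.
+        return np.tril(np.ones((T, T), dtype=bool))
+
+    def prefix_mask(T: int, prefix_len: int) -> np.ndarray:
+        # Prefix tokens (the in-context samples) attend to each other freely.
+        m = causal_mask(T)
+        m[:prefix_len, :prefix_len] = True
+        return m
+
+    print(causal_mask(4).astype(int))
+    print(prefix_mask(4, prefix_len=2).astype(int))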
+
+
+
+
+ + ☆ GIT-Mol: A Multi-modal Large Language Model for Molecular Science with + Graph, Image, and Text + + +
+ Large language models have made significant strides in natural language +processing, paving the way for innovative applications including molecular +representation and generation. However, most existing single-modality +approaches cannot capture the abundant and complex information in molecular +data. Here, we introduce GIT-Mol, a multi-modal large language model that +integrates the structure Graph, Image, and Text information, including the +Simplified Molecular Input Line Entry System (SMILES) and molecular captions. +To facilitate the integration of multi-modal molecular data, we propose +GIT-Former, a novel architecture capable of mapping all modalities into a +unified latent space. Our study develops an innovative any-to-language +molecular translation strategy and achieves a 10%-15% improvement in molecular +captioning, a 5%-10% accuracy increase in property prediction, and a 20% boost +in molecule generation validity compared to baseline or single-modality models. + +
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ Generative Interpretation + + +
+ We introduce generative interpretation, a new approach to estimating +contractual meaning using large language models. As AI triumphalism is the +order of the day, we proceed by way of grounded case studies, each illustrating +the capabilities of these novel tools in distinct ways. Taking well-known +contracts opinions, and sourcing the actual agreements that they adjudicated, +we show that AI models can help factfinders ascertain ordinary meaning in +context, quantify ambiguity, and fill gaps in parties' agreements. We also +illustrate how models can calculate the probative value of individual pieces of +extrinsic evidence. After offering best practices for the use of these models +given their limitations, we consider their implications for judicial practice +and contract theory. Using LLMs permits courts to estimate what the parties +intended cheaply and accurately, and as such generative interpretation +unsettles the current interpretative stalemate. Their use responds to +efficiency-minded textualists and justice-oriented contextualists, who argue +about whether parties will prefer cost and certainty or accuracy and fairness. +Parties--and courts--would prefer a middle path, in which adjudicators strive +to predict what the contract really meant, admitting just enough context to +approximate reality while avoiding unguided and biased assimilation of +evidence. As generative interpretation offers this possibility, we argue it can +become the new workhorse of contractual interpretation. + +
+
+
+
+
+ + ☆ SpeechX: Neural Codec Language Model as a Versatile Speech Transformer + + +
+ Recent advancements in generative speech models based on audio-text prompts +have enabled remarkable innovations like high-quality zero-shot text-to-speech. +However, existing models still face limitations in handling diverse audio-text +speech generation tasks involving transforming input speech and processing +audio captured in adverse acoustic conditions. This paper introduces SpeechX, a +versatile speech generation model capable of zero-shot TTS and various speech +transformation tasks, dealing with both clean and noisy signals. SpeechX +combines neural codec language modeling with multi-task learning using +task-dependent prompting, enabling unified and extensible modeling and +providing a consistent way for leveraging textual input in speech enhancement +and transformation tasks. Experimental results show SpeechX's efficacy in +various tasks, including zero-shot TTS, noise suppression, target speaker +extraction, speech removal, and speech editing with or without background +noise, achieving comparable or superior performance to specialized models +across tasks. See https://aka.ms/speechx for demo samples. + +
+
+ comment: See https://aka.ms/speechx for demo samples +
+
+
+
+
+ + ☆ SOTASTREAM: A Streaming Approach to Machine Translation Training + + +
+ Many machine translation toolkits make use of a data preparation step wherein +raw data is transformed into a tensor format that can be used directly by the +trainer. This preparation step is increasingly at odds with modern research and +development practices because this process produces a static, unchangeable +version of the training data, making common training-time needs difficult +(e.g., subword sampling), time-consuming (preprocessing with large data can +take days), expensive (e.g., disk space), and cumbersome (managing experiment +combinatorics). We propose an alternative approach that separates the +generation of data from the consumption of that data. In this approach, there +is no separate pre-processing step; data generation produces an infinite stream +of permutations of the raw training data, which the trainer tensorizes and +batches as it is consumed. Additionally, this data stream can be manipulated by +a set of user-definable operators that provide on-the-fly modifications, such +as data normalization, augmentation or filtering. We release an open-source +toolkit, SOTASTREAM, that implements this approach: +https://github.com/marian-nmt/sotastream. We show that it cuts training time, +adds flexibility, reduces experiment management complexity, and reduces disk +space, all without affecting the accuracy of the trained models. + +
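+ A minimal sketch of the separation described above: an endless stream of
+permutations of the raw data, consumed lazily and modified on the fly by
+user-defined operators; this mirrors the idea only and is not the SOTASTREAM API.
+    import random
+    from itertools import islice
+
+    def infinite_stream(corpus, seed=0):
+        rng = random.Random(seed)
+        while True:  # endless pass over freshly permuted raw data
+            epoch = corpus[:]
+            rng.shuffle(epoch)
+            yield from epoch
+
+    def lowercase_op(stream):
+        for src, tgt in stream:  # an on-the-fly modification operator
+            yield src.lower(), tgt.lower()
+
+    corpus = [("Guten Tag", "Good day"), ("Danke", "Thanks")]
+    for pair in islice(lowercase_op(infinite_stream(corpus)), 3):
+        print(pair)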
+
+
+
+
+ + ☆ O-1: Self-training with Oracle and 1-best Hypothesis + + +
+ We introduce O-1, a new self-training objective to reduce training bias and +unify training and evaluation metrics for speech recognition. O-1 is a faster +variant of Expected Minimum Bayes Risk (EMBR) that boosts the oracle +hypothesis and can accommodate both supervised and unsupervised data. We +demonstrate the effectiveness of our approach in terms of recognition on +publicly available SpeechStew datasets and a large-scale, in-house data set. On +SpeechStew, the O-1 objective closes the gap between the actual and oracle +performance by 80\% relative compared to EMBR, which bridges the gap by 43\% +relative. O-1 achieves 13\% to 25\% relative improvement over EMBR on the +various datasets that SpeechStew comprises, and a 12\% relative gap +reduction with respect to the oracle WER over EMBR training on the in-house +dataset. Overall, O-1 results in a 9\% relative improvement in WER over EMBR, +thereby speaking to the scalability of the proposed objective for large-scale +datasets. + 
+
+
+
+
+ + ☆ Playing with Words: Comparing the Vocabulary and Lexical Richness of + ChatGPT and Humans + + +
+ The introduction of Artificial Intelligence (AI) generative language models +such as GPT (Generative Pre-trained Transformer) and tools such as ChatGPT has +triggered a revolution that can transform how text is generated. This has many +implications, for example, as AI-generated text becomes a significant fraction +of the text in many disciplines, would this have an effect on the language +capabilities of readers and also on the training of newer AI tools? Would it +affect the evolution of languages? Focusing on one specific aspect of the +language: words; will the use of tools such as ChatGPT increase or reduce the +vocabulary used or the lexical richness (understood as the number of different +words used in a written or oral production) when writing a given text? This has +implications for words, as those not included in AI-generated content will tend +to be less and less popular and may eventually be lost. In this work, we +perform an initial comparison of the vocabulary and lexical richness of ChatGPT +and humans when performing the same tasks. In more detail, two datasets +containing the answers to different types of questions answered by ChatGPT and +humans are used, and the analysis shows that ChatGPT tends to use fewer +distinct words and lower lexical richness than humans. These results are very +preliminary and additional datasets and ChatGPT configurations have to be +evaluated to extract more general conclusions. Therefore, further research is +needed to understand how the use of ChatGPT and more broadly generative AI +tools will affect the vocabulary and lexical richness in different types of +text and languages. + +
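+ A minimal sketch of the kind of comparison described: counting distinct words
+and a simple type-token ratio for two texts; the datasets and richness measures
+used in the paper are not reproduced here.
+    import re
+
+    def lexical_richness(text: str):
+        tokens = re.findall(r"[a-z']+", text.lower())
+        types = set(tokens)
+        ttr = len(types) / len(tokens) if tokens else 0.0
+        return len(types), round(ttr, 3)
+
+    human_answer = "The quick brown fox jumps over the lazy dog near the river bank"
+    model_answer = "The answer is that the answer depends on the answer you want"
+    print(lexical_richness(human_answer), lexical_richness(model_answer))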
+
+
+
+
+ + ☆ Development and Evaluation of Three Chatbots for Postpartum Mood and + Anxiety Disorders + + +
+ In collaboration with Postpartum Support International (PSI), a non-profit +organization dedicated to supporting caregivers with postpartum mood and +anxiety disorders, we developed three chatbots to provide context-specific +empathetic support to postpartum caregivers, leveraging both rule-based and +generative models. We present and evaluate the performance of our chatbots +using both machine-based metrics and human-based questionnaires. Overall, our +rule-based model achieves the best performance, with outputs that are close to +ground truth reference and contain the highest levels of empathy. Human users +prefer the rule-based chatbot over the generative chatbot for its +context-specific and human-like replies. Our generative chatbot also produced +empathetic responses and was described by human users as engaging. However, +limitations in the training dataset often result in confusing or nonsensical +responses. We conclude by discussing practical benefits of rule-based vs. +generative models for supporting individuals with mental health challenges. In +light of the recent surge of ChatGPT and BARD, we also discuss the +possibilities and pitfalls of large language models for digital mental +healthcare. + +
+
+
+
+
+ + ☆ Text Injection for Capitalization and Turn-Taking Prediction in Speech + Models + + +
+ Text injection for automatic speech recognition (ASR), wherein unpaired +text-only data is used to supplement paired audio-text data, has shown +promising improvements for word error rate. This study examines the use of text +injection for auxiliary tasks, which are the non-ASR tasks often performed by +an E2E model. In this work, we use joint end-to-end and internal language model +training (JEIT) as our text injection algorithm to train an ASR model which +performs two auxiliary tasks. The first is capitalization, which is a +de-normalization task. The second is turn-taking prediction, which attempts to +identify whether a user has completed their conversation turn in a digital +assistant interaction. We show results demonstrating that our text injection +method boosts capitalization performance for long-tail data, and improves +turn-taking detection recall. + +
+
+
+
+
+ + ☆ Using Text Injection to Improve Recognition of Personal Identifiers in + Speech + + +
+ Accurate recognition of specific categories, such as persons' names, dates, or +other identifiers, is critical in many Automatic Speech Recognition (ASR) +applications. As these categories represent personal information, ethical use +of this data including collection, transcription, training and evaluation +demands special care. One way of ensuring the security and privacy of +individuals is to redact or eliminate Personally Identifiable Information (PII) +from collection altogether. However, this results in ASR models that tend to +have lower recognition accuracy of these categories. We use text injection to +improve the recognition of PII categories by including fake textual substitutes +of PII categories in the training data. We +demonstrate substantial improvement to Recall of Names and Dates in medical +notes while improving overall WER. For alphanumeric digit sequences, we show +improvements to Character Error Rate and Sentence Accuracy. + 
+
+ comment: Accepted to Interspeech 2023 +
+
+
+
+
+ + ☆ Human-centered NLP Fact-checking: Co-Designing with Fact-checkers using + Matchmaking for AI + + +
+ A key challenge in professional fact-checking is its limited scalability in +relation to the magnitude of false information. While many Natural Language +Processing (NLP) tools have been proposed to enhance fact-checking efficiency +and scalability, both academic research and fact-checking organizations report +limited adoption of such tooling due to insufficient alignment with +fact-checker practices, values, and needs. To address this gap, we investigate +a co-design method, Matchmaking for AI, which facilitates fact-checkers, +designers, and NLP researchers to collaboratively discover what fact-checker +needs should be addressed by technology and how. Our co-design sessions with 22 +professional fact-checkers yielded a set of 11 novel design ideas. They assist +in information searching, processing, and writing tasks for efficient and +personalized fact-checking; help fact-checkers proactively prepare for future +misinformation; monitor their potential biases; and support internal +organization collaboration. Our work offers implications for human-centered +fact-checking research and practice and AI co-design research. + +
+
+
+
+
+ + ☆ Improving Audio-Visual Speech Recognition by Lip-Subword Correlation + Based Visual Pre-training and Cross-Modal Fusion Encoder ICME2023 + + +
+ In recent research, only a slight performance improvement has been observed when moving from automatic +speech recognition systems to audio-visual speech recognition systems in the +end-to-end framework with low-quality videos. Mismatched convergence rates and +specialized input representations of the audio and visual modalities are +considered to cause the problem. In this paper, we propose two novel techniques +to improve audio-visual speech recognition (AVSR) under a pre-training and +fine-tuning training framework. First, we explore the correlation between lip +shapes and syllable-level subword units in Mandarin to establish good +frame-level syllable boundaries from lip shapes. This enables accurate +alignment of video and audio streams during visual model pre-training and +cross-modal fusion. Next, we propose an audio-guided cross-modal fusion encoder +(CMFE) neural network to utilize main training parameters for multiple +cross-modal attention layers to make full use of modality complementarity. +Experiments on the MISP2021-AVSR dataset show the effectiveness of the two +proposed techniques. Together, using only a relatively small amount of training +data, the final system achieves better performance than state-of-the-art +systems with more complex front-ends and back-ends. + 
+
+ comment: 6 pages, 2 figures, published in ICME2023 +
+
+
+
+
+ + ☆ Automated Testing and Improvement of Named Entity Recognition Systems + + +
+ Named entity recognition (NER) systems have seen rapid progress in recent +years due to the development of deep neural networks. These systems are widely +used in various natural language processing applications, such as information +extraction, question answering, and sentiment analysis. However, the complexity +and intractability of deep neural networks can make NER systems unreliable in +certain circumstances, resulting in incorrect predictions. For example, NER +systems may misidentify female names as chemicals or fail to recognize the +names of minority groups, leading to user dissatisfaction. To tackle this +problem, we introduce TIN, a novel, widely applicable approach for +automatically testing and repairing various NER systems. The key idea for +automated testing is that the NER predictions of the same named entities under +similar contexts should be identical. The core idea for automated repairing is +that similar named entities should have the same NER prediction under the same +context. We use TIN to test two SOTA NER models and two commercial NER APIs, +i.e., Azure NER and AWS NER. We manually verify 784 of the suspicious issues +reported by TIN and find that 702 are erroneous issues, leading to high +precision (85.0%-93.4%) across four categories of NER errors: omission, +over-labeling, incorrect category, and range error. For automated repairing, +TIN achieves a high error reduction rate (26.8%-50.6%) over the four systems +under test, which successfully repairs 1,056 out of the 1,877 reported NER +errors. + +
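+ A minimal sketch of the metamorphic idea behind the testing step: the same named
+entity placed in similar contexts should receive the same prediction, so any
+context whose label disagrees with the majority is flagged as suspicious; `ner`
+is a hypothetical callable returning an entity-to-label mapping.
+    def consistency_issues(ner, entity, context_templates):
+        labels = {c: ner(c.format(entity)).get(entity) for c in context_templates}
+        majority = max(set(labels.values()), key=list(labels.values()).count)
+        return [c for c, lab in labels.items() if lab != majority]
+
+    templates = ["{} visited Paris.", "{} gave a talk.", "{} signed the contract."]
+    # suspicious = consistency_issues(my_ner_system, "Alice Zhang", templates)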
+
+ comment: Accepted by ESEC/FSE'23 +
+
+
+
+
+ + ♻ ☆ NECE: Narrative Event Chain Extraction Toolkit + + +
+ To understand a narrative, it is essential to comprehend the temporal event +flows, especially those associated with main characters; however, this can be +challenging with lengthy and unstructured narrative texts. To address this, we +introduce NECE, an open-access, document-level toolkit that automatically +extracts and aligns narrative events in the temporal order of their occurrence. +Through extensive evaluations, we show the high quality of the NECE toolkit and +demonstrate its downstream application in analyzing narrative bias regarding +gender. We also openly discuss the shortcomings of the current approach, and the +potential of leveraging generative models in future work. Lastly, the NECE +toolkit includes both a Python library and a user-friendly web interface, which +offer equal access to professional and lay audiences alike to visualize +event chains, obtain narrative flows, or study narrative bias. + 
+
+
+
+
+ + ♻ ☆ Self-Alignment with Instruction Backtranslation + + +
+ We present a scalable method to build a high quality instruction following +language model by automatically labelling human-written text with corresponding +instructions. Our approach, named instruction backtranslation, starts with a +language model finetuned on a small amount of seed data, and a given web +corpus. The seed model is used to construct training examples by generating +instruction prompts for web documents (self-augmentation), and then selecting +high quality examples from among these candidates (self-curation). This data is +then used to finetune a stronger model. Finetuning LLaMa on two iterations of +our approach yields a model that outperforms all other LLaMa-based models on +the Alpaca leaderboard not relying on distillation data, demonstrating highly +effective self-alignment. + +
+
+
+
+
+ + ♻ ☆ Contrastive Learning for API Aspect Analysis + + +
+ We present a novel approach - CLAA - for API aspect detection in API reviews +that utilizes transformer models trained with a supervised contrastive loss +objective function. We evaluate CLAA using performance and impact analysis. For +performance analysis, we utilized a benchmark dataset on developer discussions +collected from Stack Overflow and compared the results to those obtained using +state-of-the-art transformer models. Our experiments show that contrastive +learning can significantly improve the performance of transformer models in +detecting aspects such as Performance, Security, Usability, and Documentation. +For impact analysis, we performed an empirical study and a developer study. On 200 randomly +selected and manually labeled online reviews, CLAA achieved 92% accuracy +while the SOTA baseline achieved 81.5%. According to our developer study +involving 10 participants, the use of 'Stack Overflow + CLAA' resulted in +increased accuracy and confidence during API selection. Replication package: +https://github.com/disa-lab/Contrastive-Learning-API-Aspect-ASE2023 + 
+
+ comment: Accepted in the 38th IEEE/ACM International Conference on Automated + Software Engineering (ASE2023) +
+
+
+
+
+ + ♻ ☆ Temporal Modeling Matters: A Novel Temporal Emotional Modeling Approach + for Speech Emotion Recognition ICASSP 2023 + + +
+ Speech emotion recognition (SER) plays a vital role in improving the +interactions between humans and machines by inferring human emotion and +affective states from speech signals. Whereas recent works primarily focus on +mining spatiotemporal information from hand-crafted features, we explore how to +model the temporal patterns of speech emotions from dynamic temporal scales. +Towards that goal, we introduce a novel temporal emotional modeling approach +for SER, termed Temporal-aware bI-direction Multi-scale Network (TIM-Net), +which learns multi-scale contextual affective representations from various time +scales. Specifically, TIM-Net first employs temporal-aware blocks to learn +temporal affective representation, then integrates complementary information +from the past and the future to enrich contextual representations, and finally, +fuses multiple time scale features for better adaptation to the emotional +variation. Extensive experimental results on six benchmark SER datasets +demonstrate the superior performance of TIM-Net, gaining 2.34% and 2.61% +improvements of the average UAR and WAR over the second-best on each corpus. +The source code is available at https://github.com/Jiaxin-Ye/TIM-Net_SER. + +
+
+ comment: ICASSP 2023 +
+
+
+
+
+ + ♻ ☆ Does Correction Remain A Problem For Large Language Models? + + +
+ As large language models, such as GPT, continue to advance the capabilities +of natural language processing (NLP), the question arises: does the problem of +correction still persist? This paper investigates the role of correction in the +context of large language models by conducting two experiments. The first +experiment focuses on correction as a standalone task, employing few-shot +learning techniques with GPT-like models for error correction. The second +experiment explores the notion of correction as a preparatory task for other +NLP tasks, examining whether large language models can tolerate and perform +adequately on texts containing certain levels of noise or errors. By addressing +these experiments, we aim to shed light on the significance of correction in +the era of large language models and its implications for various NLP +applications. + +
+
+
+
+
+ + ♻ ☆ ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on + Class-level Code Generation + + +
+ In this work, we make the first attempt to evaluate LLMs in a more +challenging code generation scenario, i.e. class-level code generation. We +first manually construct the first class-level code generation benchmark +ClassEval of 100 class-level Python code generation tasks with approximately +500 person-hours. Based on it, we then perform the first study of 11 +state-of-the-art LLMs on class-level code generation. Based on our results, we +have the following main findings. First, we find that all existing LLMs show +much worse performance on class-level code generation compared to standalone +method-level code generation benchmarks like HumanEval; and the method-level +coding ability cannot equivalently reflect the class-level coding ability among +LLMs. Second, we find that GPT-4 and GPT-3.5 still exhibit dominant superiority +over other LLMs on class-level code generation, and the second-tier models +include Instruct-Starcoder, Instruct-Codegen, and Wizardcoder with very +similar performance. Third, we find that generating the entire class all at +once (i.e. holistic generation strategy) is the best generation strategy only +for GPT-4 and GPT-3.5, while method-by-method generation (i.e. incremental and +compositional) strategies work better for the other models, which have limited ability +to understand long instructions and utilize the intermediate information. +Lastly, we find that the models have limited ability to generate method-dependent code +and discuss the frequent error types in generated classes. Our benchmark is +available at https://github.com/FudanSELab/ClassEval. + 
+
+
+
+
+ + ♻ ☆ SEAM: An Integrated Activation-Coupled Model of Sentence Processing and + Eye Movements in Reading + + +
+ Models of eye-movement control during reading, developed largely within +psychology, usually focus on visual, attentional, lexical, and motor processes +but neglect post-lexical language processing; by contrast, models of sentence +comprehension processes, developed largely within psycholinguistics, generally +focus only on post-lexical language processes. We present a model that combines +these two research threads, by integrating eye-movement control and sentence +processing. Developing such an integrated model is extremely challenging and +computationally demanding, but such an integration is an important step toward +complete mathematical models of natural language comprehension in reading. We +combine the SWIFT model of eye-movement control (Seelig et al., 2020, +doi:10.1016/j.jmp.2019.102313) with key components of the Lewis and Vasishth +sentence processing model (Lewis & Vasishth, 2005, +doi:10.1207/s15516709cog0000_25). This integration becomes possible, for the +first time, due in part to recent advances in successful parameter +identification in dynamical models, which allows us to investigate profile +log-likelihoods for individual model parameters. We present a fully implemented +proof-of-concept model demonstrating how such an integrated model can be +achieved; our approach includes Bayesian model inference with Markov Chain +Monte Carlo (MCMC) sampling as a key computational tool. The integrated model, +SEAM, can successfully reproduce eye movement patterns that arise due to +similarity-based interference in reading. To our knowledge, this is the +first-ever integration of a complete process model of eye-movement control with +linguistic dependency completion processes in sentence comprehension. In future +work, this proof of concept model will need to be evaluated using a +comprehensive set of benchmark data. + +
+
+
+
+
+ + ♻ ☆ Skills-in-Context Prompting: Unlocking Compositionality in Large + Language Models + + +
+ We consider the problem of eliciting compositional generalization +capabilities in large language models (LLMs) with a novel type of prompting +strategy. Compositional generalization empowers the LLMs to solve problems that +are harder than the ones they have seen (i.e., easy-to-hard generalization), +which is a critical reasoning capability of human-like intelligence. However, +even the current state-of-the-art LLMs still struggle with this form of +reasoning. To bridge this gap, we propose skills-in-context (SKiC) prompting, +which instructs LLMs how to compose basic skills to resolve more complex +problems. We find that it is crucial to demonstrate both the skills and the +compositional examples within the same prompting context. With as few as two +exemplars, our SKiC prompting initiates strong synergies between skills and +their composition capabilities. Notably, it empowers LLMs to solve unseen +problems that require innovative skill compositions, achieving near-perfect +generalization on a broad range of challenging compositionality tasks. +Intriguingly, SKiC prompting unlocks the latent potential of LLMs, enabling +them to leverage pre-existing internal skills acquired during earlier +pre-training stages, even when these skills are not explicitly presented in the +prompting context. This results in the capability of LLMs to solve unseen +complex problems by activating and composing internal competencies. With such +prominent features, SKiC prompting is able to achieve state-of-the-art +performance on challenging mathematical reasoning benchmarks (e.g., MATH). +
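A minimal sketch of how a skills-in-context prompt could be assembled, with the skills, compositional exemplars, and the new problem placed in the same context. The skill texts and exemplars below are invented placeholders, not the paper's prompts.

```python
# Assemble a SKiC-style prompt: basic skills + compositional exemplars + problem.

SKILLS = {
    "add_digits": "Skill: to add two numbers, add them digit by digit with carries.",
    "compare": "Skill: to compare two numbers, compare their lengths, then digits.",
}

EXEMPLARS = [
    "Problem: Is 123 + 456 greater than 500?\n"
    "Solution: use add_digits -> 579; use compare(579, 500) -> yes.",
]

def build_skic_prompt(problem: str) -> str:
    parts = list(SKILLS.values()) + EXEMPLARS + [f"Problem: {problem}\nSolution:"]
    return "\n\n".join(parts)

print(build_skic_prompt("Is 278 + 341 greater than 600?"))
```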
+
+
+
+
+ + ♻ ☆ On the Trustworthiness Landscape of State-of-the-art Generative Models: + A Comprehensive Survey + + +
+ Diffusion models and large language models have emerged as leading-edge +generative models and have sparked a revolutionary impact on various aspects of +human life. However, the practical implementation of these models has also +exposed inherent risks, highlighting their dual nature and raising concerns +regarding their trustworthiness. Despite the abundance of literature on this +subject, a comprehensive survey specifically delving into the intersection of +large-scale generative models and their trustworthiness remains largely absent. +To bridge this gap, this paper investigates both the long-standing and emerging +threats associated with these models across four fundamental dimensions: +privacy, security, fairness, and responsibility. In this way, we construct an +extensive map outlining the trustworthiness of these models, while also +providing practical recommendations and identifying future directions. These +efforts are crucial for promoting the trustworthy deployment of these models, +ultimately benefiting society as a whole. +
+
+ comment: draft version +
+
+
+
+
+ + ♻ ☆ Improving Zero-Shot Text Matching for Financial Auditing with Large + Language Models + + +
+ Auditing financial documents is a very tedious and time-consuming process. As +of today, it can already be simplified by employing AI-based solutions to +recommend relevant text passages from a report for each legal requirement of +rigorous accounting standards. However, these methods need to be fine-tuned +regularly, and they require abundant annotated data, which is often lacking in +industrial environments. Hence, we present ZeroShotALI, a novel recommender +system that leverages a state-of-the-art large language model (LLM) in +conjunction with a domain-specifically optimized transformer-based +text-matching solution. We find that a two-step approach of first retrieving a +number of best matching document sections per legal requirement with a custom +BERT-based model and second filtering these selections using an LLM yields +significant performance improvements over existing approaches. + +
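A schematic of the two-step retrieve-then-filter idea described above, assuming a generic `embed` encoder and an `llm_filter` stub in place of the paper's custom BERT model and LLM prompts; both names are hypothetical.

```python
# Step 1: embed the legal requirement and all report sections, keep the top-k
# nearest sections. Step 2: ask an LLM to filter the candidates.

import numpy as np

def embed(texts):
    """Placeholder: return one embedding per text (e.g., from a BERT encoder)."""
    rng = np.random.default_rng(0)
    return rng.normal(size=(len(texts), 384))

def top_k_sections(requirement, sections, k=5):
    vecs = embed([requirement] + sections)
    q, s = vecs[0], vecs[1:]
    sims = s @ q / (np.linalg.norm(s, axis=1) * np.linalg.norm(q) + 1e-9)
    order = np.argsort(-sims)[:k]
    return [sections[i] for i in order]

def llm_filter(requirement, candidates):
    """Placeholder: prompt an LLM to keep only sections satisfying the requirement."""
    return candidates  # no-op stand-in

requirement = "Disclose the useful life assumed for intangible assets."
sections = ["Section A ...", "Section B ...", "Section C ..."]
print(llm_filter(requirement, top_k_sections(requirement, sections, k=2)))
```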
+
+ comment: Accepted at DocEng 2023, 4 pages, 1 figure, 2 tables +
+
+
+
+
+ + ♻ ☆ Zhongjing: Enhancing the Chinese Medical Capabilities of Large Language + Model through Expert Feedback and Real-world Multi-turn Dialogue + + +
+ Recent advances in Large Language Models (LLMs) have achieved remarkable +breakthroughs in understanding and responding to user intents. However, their +performance lag behind general use cases in some expertise domains, such as +Chinese medicine. Existing efforts to incorporate Chinese medicine into LLMs +rely on Supervised Fine-Tuning (SFT) with single-turn and distilled dialogue +data. These models lack the ability for doctor-like proactive inquiry and +multi-turn comprehension and cannot always align responses with safety and +professionalism experts. In this work, we introduce Zhongjing, the first +Chinese medical LLaMA-based LLM that implements an entire training pipeline +from pre-training to reinforcement learning with human feedback (RLHF). +Additionally, we introduce a Chinese multi-turn medical dialogue dataset of +70,000 authentic doctor-patient dialogues, CMtMedQA, which significantly +enhances the model's capability for complex dialogue and proactive inquiry +initiation. We define a refined annotation rule and evaluation criteria given +the biomedical domain's unique characteristics. Results show that our model +outperforms baselines in various capacities and matches the performance of +ChatGPT in a few abilities, despite having 50x training data with previous best +model and 100x parameters with ChatGPT. RLHF further improves the model's +instruction-following ability and safety.We also release our code, datasets and +model for further research. + +
+
+
+
+
+ + ♻ ☆ LeafAI: query generator for clinical cohort discovery rivaling a human + programmer + + +
+ Objective: Identifying study-eligible patients within clinical databases is a +critical step in clinical research. However, accurate query design typically +requires extensive technical and biomedical expertise. We sought to create a +system capable of generating data model-agnostic queries while also providing +novel logical reasoning capabilities for complex clinical trial eligibility +criteria. + Materials and Methods: The task of query creation from eligibility criteria +requires solving several text-processing problems, including named entity +recognition and relation extraction, sequence-to-sequence transformation, +normalization, and reasoning. We incorporated hybrid deep learning and +rule-based modules for these, as well as a knowledge base of the Unified +Medical Language System (UMLS) and linked ontologies. To enable data-model +agnostic query creation, we introduce a novel method for tagging database +schema elements using UMLS concepts. To evaluate our system, called LeafAI, we +compared the capability of LeafAI to a human database programmer to identify +patients who had been enrolled in 8 clinical trials conducted at our +institution. We measured performance by the number of actual enrolled patients +matched by generated queries. + Results: LeafAI matched a mean 43% of enrolled patients with 27,225 eligible +across 8 clinical trials, compared to 27% matched and 14,587 eligible in +queries by a human database programmer. The human programmer spent 26 total +hours crafting queries compared to several minutes by LeafAI. + Conclusions: Our work contributes a state-of-the-art data model-agnostic +query generation system capable of conditional reasoning using a knowledge +base. We demonstrate that LeafAI can rival an experienced human programmer in +finding patients eligible for clinical trials. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 135 + +
+
+
+ + ☆ Jurassic World Remake: Bringing Ancient Fossils Back to Life via + Zero-Shot Long Image-to-Image Translation + + +
+ With a strong understanding of the target domain from natural language, we +produce promising results in translating across large domain gaps and bringing +skeletons back to life. In this work, we use text-guided latent diffusion +models for zero-shot image-to-image translation (I2I) across large domain gaps +(longI2I), where large amounts of new visual features and new geometry need to +be generated to enter the target domain. Being able to perform translations +across large domain gaps has a wide variety of real-world applications in +criminology, astrology, environmental conservation, and paleontology. In this +work, we introduce a new task Skull2Animal for translating between skulls and +living animals. On this task, we find that unguided Generative Adversarial +Networks (GANs) are not capable of translating across large domain gaps. +Instead of these traditional I2I methods, we explore the use of guided +diffusion and image editing models and provide a new benchmark model, +Revive-2I, capable of performing zero-shot I2I via text-prompting latent +diffusion models. We find that guidance is necessary for longI2I because, to +bridge the large domain gap, prior knowledge about the target domain is needed. +In addition, we find that prompting provides the best and most scalable +information about the target domain as classifier-guided diffusion models +require retraining for specific use cases and lack stronger constraints on the +target domain because of the wide variety of images they are trained on. + +
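For readers unfamiliar with text-prompted image-to-image translation, a generic latent-diffusion img2img call is sketched below using the public diffusers library. This is not the authors' Revive-2I code; the model name, strength, prompt, and file paths are placeholder choices, and argument names may differ across diffusers versions.

```python
# Generic text-guided image-to-image sketch: the source image is partially
# noised in latent space and then denoised under the text prompt.

import torch
from diffusers import StableDiffusionImg2ImgPipeline
from PIL import Image

pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

skull = Image.open("skull.png").convert("RGB").resize((512, 512))
out = pipe(
    prompt="a photo of a living wolf, natural habitat",
    image=skull,              # source image to be partially noised, then denoised
    strength=0.75,            # how far towards pure noise the source is pushed
    guidance_scale=7.5,
).images[0]
out.save("animal.png")
```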
+
+ comment: 9 pages, 10 figures, ACM Multimedia 2023 +
+
+
+
+
+ + ☆ Dual Associated Encoder for Face Restoration + + +
+ Restoring facial details from low-quality (LQ) images has remained a +challenging problem due to its ill-posedness induced by various degradations in +the wild. The existing codebook prior mitigates the ill-posedness by leveraging +an autoencoder and learned codebook of high-quality (HQ) features, achieving +remarkable quality. However, existing approaches in this paradigm frequently +depend on a single encoder pre-trained on HQ data for restoring HQ images, +disregarding the domain gap between LQ and HQ images. As a result, the encoding +of LQ inputs may be insufficient, resulting in suboptimal performance. To +tackle this problem, we propose a novel dual-branch framework named DAEFR. Our +method introduces an auxiliary LQ branch that extracts crucial information from +the LQ inputs. Additionally, we incorporate association training to promote +effective synergy between the two branches, enhancing code prediction and +output quality. We evaluate the effectiveness of DAEFR on both synthetic and +real-world datasets, demonstrating its superior performance in restoring facial +details. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Group Pose: A Simple Baseline for End-to-End Multi-person Pose + Estimation ICCV 2023 + + +
+ In this paper, we study the problem of end-to-end multi-person pose +estimation. State-of-the-art solutions adopt the DETR-like framework, and +mainly develop the complex decoder, e.g., regarding pose estimation as keypoint +box detection and combining with human detection in ED-Pose, hierarchically +predicting with pose decoder and joint (keypoint) decoder in PETR. We present a +simple yet effective transformer approach, named Group Pose. We simply regard +$K$-keypoint pose estimation as predicting a set of $N\times K$ keypoint +positions, each from a keypoint query, as well as representing each pose with +an instance query for scoring $N$ pose predictions. Motivated by the intuition +that the interaction, among across-instance queries of different types, is not +directly helpful, we make a simple modification to decoder self-attention. We +replace single self-attention over all the $N\times(K+1)$ queries with two +subsequent group self-attentions: (i) $N$ within-instance self-attention, with +each over $K$ keypoint queries and one instance query, and (ii) $(K+1)$ +same-type across-instance self-attention, each over $N$ queries of the same +type. The resulting decoder removes the interaction among across-instance +type-different queries, easing the optimization and thus improving the +performance. Experimental results on MS COCO and CrowdPose show that our +approach without human box supervision is superior to previous methods with +complex decoders, and even is slightly better than ED-Pose that uses human box +supervision. $\href{https://github.com/Michel-liu/GroupPose-Paddle}{\rm +Paddle}$ and $\href{https://github.com/Michel-liu/GroupPose}{\rm PyTorch}$ code +are available. + +
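The grouped self-attention can be made concrete with a short sketch: instead of one self-attention over all $N\times(K+1)$ queries, attention is applied within each instance group and then across instances of the same query type. Dimensions and module choices are illustrative assumptions, not the authors' implementation.

```python
# Grouped decoder self-attention: (i) within-instance attention over K keypoint
# queries plus 1 instance query, then (ii) same-type across-instance attention
# over the N instances.

import torch
import torch.nn as nn

class GroupSelfAttention(nn.Module):
    def __init__(self, dim=256, heads=8):
        super().__init__()
        self.within = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.across = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, q):                                # q: (B, N, K+1, D)
        B, N, Kp1, D = q.shape
        x = q.reshape(B * N, Kp1, D)                     # within-instance groups
        x, _ = self.within(x, x, x)
        x = x.reshape(B, N, Kp1, D).transpose(1, 2)      # (B, K+1, N, D)
        y = x.reshape(B * Kp1, N, D)                     # same-type across-instance groups
        y, _ = self.across(y, y, y)
        return y.reshape(B, Kp1, N, D).transpose(1, 2)   # back to (B, N, K+1, D)

queries = torch.randn(2, 10, 18, 256)                    # e.g. N=10 poses, K=17 keypoints
print(GroupSelfAttention()(queries).shape)
```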
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ A Unified Masked Autoencoder with Patchified Skeletons for Motion + Synthesis + + +
+ The synthesis of human motion has traditionally been addressed through +task-dependent models that focus on specific challenges, such as predicting +future motions or filling in intermediate poses conditioned on known key-poses. +In this paper, we present a novel task-independent model called UNIMASK-M, +which can effectively address these challenges using a unified architecture. +Our model obtains comparable or better performance than the state-of-the-art in +each field. Inspired by Vision Transformers (ViTs), our UNIMASK-M model +decomposes a human pose into body parts to leverage the spatio-temporal +relationships existing in human motion. Moreover, we reformulate various +pose-conditioned motion synthesis tasks as a reconstruction problem with +different masking patterns given as input. By explicitly informing our model +about the masked joints, our UNIMASK-M becomes more robust to occlusions. +Experimental results show that our model successfully forecasts human motion on +the Human3.6M dataset. Moreover, it achieves state-of-the-art results in motion +inbetweening on the LaFAN1 dataset, particularly in long transition periods. +More information can be found on the project website +https://sites.google.com/view/estevevallsmascaro/publications/unimask-m. + +
+
+
+
+
+ + ☆ Accurate Eye Tracking from Dense 3D Surface Reconstructions using + Single-Shot Deflectometry + + +
+ Eye-tracking plays a crucial role in the development of virtual reality +devices, neuroscience research, and psychology. Despite its significance in +numerous applications, achieving an accurate, robust, and fast eye-tracking +solution remains a considerable challenge for current state-of-the-art methods. +While existing reflection-based techniques (e.g., "glint tracking") are +considered the most accurate, their performance is limited by their reliance on +sparse 3D surface data acquired solely from the cornea surface. In this paper, +we rethink how specular reflections can be used for eye tracking: We +propose a novel method for accurate and fast evaluation of the gaze direction +that exploits teachings from single-shot phase-measuring-deflectometry (PMD). +In contrast to state-of-the-art reflection-based methods, our method acquires +dense 3D surface information of both cornea and sclera within a single +camera frame (single-shot). Improvements in acquired reflection surface +points ("glints") of factors $>3300 \times$ are easily achievable. We show the +feasibility of our approach with experimentally evaluated gaze errors of only +$\leq 0.25^\circ$, demonstrating a significant improvement over the current +state-of-the-art. +
+
+
+
+
+ + ☆ A Robust Approach Towards Distinguishing Natural and Computer Generated + Images using Multi-Colorspace fused and Enriched Vision Transformer + + +
+ Works in the literature that classify natural and computer-generated images are +mostly designed as binary tasks, either considering natural images versus +computer graphics images only, or natural images versus GAN-generated images +only, but not natural images versus both classes of generated images. Moreover, +even though this forensic task of distinguishing natural and +computer-generated images benefits from modern convolutional neural +networks and transformer-based architectures that can reach remarkable +classification accuracies, these models are seen to fail on images that have +undergone post-processing operations usually performed to deceive +forensic algorithms, such as JPEG compression, Gaussian noise, etc. This work +proposes a robust approach to distinguishing natural and computer-generated +images, including both computer graphics and GAN-generated images, +using a fusion of two vision transformers where each of the transformer +networks operates in a different color space, one in RGB and the other in YCbCr. +The proposed approach achieves a high performance gain when compared +to a set of baselines, and also achieves higher robustness and generalizability +than the baselines. When visualized, the features of the proposed model show +higher class separability than the input image features and +the baseline features. This work also studies the attention map visualizations +of the networks of the fused model and observes that the proposed methodology +can capture more image information relevant to the forensic task of classifying +natural and generated images. +
+
+
+
+
+ + ☆ EasyEdit: An Easy-to-use Knowledge Editing Framework for Large Language + Models + + +
+ Large Language Models (LLMs) usually suffer from knowledge cutoff or fallacy +issues, which means they are unaware of unseen events or generate text with +incorrect facts owing to outdated or noisy data. To this end, many knowledge +editing approaches for LLMs have emerged -- aiming to subtly inject/edit +updated knowledge or adjust undesired behavior while minimizing the impact on +unrelated inputs. Nevertheless, due to significant differences among various +knowledge editing methods and the variations in task setups, there is no +standard implementation framework available for the community, which hinders +practitioners from applying knowledge editing to applications. To address these +issues, we propose EasyEdit, an easy-to-use knowledge editing framework for +LLMs. It supports various cutting-edge knowledge editing approaches and can be +readily applied to many well-known LLMs such as T5, GPT-J, LLaMA, etc. +Empirically, we report the knowledge editing results on LLaMA-2 with EasyEdit, +demonstrating that knowledge editing surpasses traditional fine-tuning in terms +of reliability and generalization. We have released the source code on GitHub +at https://github.com/zjunlp/EasyEdit, along with Google Colab tutorials and +comprehensive documentation for beginners to get started. Besides, we present +an online system for real-time knowledge editing, and a demo video at +http://knowlm.zjukg.cn/easyedit.mp4. +
+
+ comment: The project website is https://github.com/zjunlp/EasyEdit +
+
+
+
+
+ + ☆ Diving with Penguins: Detecting Penguins and their Prey in Animal-borne + Underwater Videos via Deep Learning + + +
+ African penguins (Spheniscus demersus) are an endangered species. Little is +known regarding their underwater hunting strategies and associated predation +success rates, yet this is essential for guiding conservation. Modern +bio-logging technology has the potential to provide valuable insights, but +manually analysing large amounts of data from animal-borne video recorders +(AVRs) is time-consuming. In this paper, we publish an animal-borne underwater +video dataset of penguins and introduce a ready-to-deploy deep learning system +capable of robustly detecting penguins (mAP50@98.0%) and also instances of fish +(mAP50@73.3%). We note that the detectors benefit explicitly from air-bubble +learning to improve accuracy. Extending this detector towards a dual-stream +behaviour recognition network, we also provide the first results for +identifying predation behaviour in penguin underwater videos. Whilst results +are promising, further work is required for useful applicability of predation +behaviour detection in field scenarios. In summary, we provide a highly +reliable underwater penguin detector, a fish detector, and a valuable first +attempt towards an automated visual detection of complex behaviours in a marine +predator. We publish the networks, the DivingWithPenguins video dataset, +annotations, splits, and weights for full reproducibility and immediate +usability by practitioners. + +
+
+ comment: 5 pages, 5 figures, 4 Tables, "3rd International Workshop on Camera + traps, AI, and Ecology (CamTrapAI)" +
+
+
+
+
+ + ☆ Efficient Real-time Smoke Filtration with 3D LiDAR for Search and Rescue + with Autonomous Heterogeneous Robotic Systems + + +
+ Search and Rescue (SAR) missions in harsh and unstructured Sub-Terranean +(Sub-T) environments in the presence of aerosol particles have recently become +the main focus in the field of robotics. Aerosol particles such as smoke and +dust directly affect the performance of any mobile robotic platform due to +their reliance on their onboard perception systems for autonomous navigation +and localization in Global Navigation Satellite System (GNSS)-denied +environments. Although obstacle avoidance and object detection algorithms are +robust to the presence of noise to some degree, their performance directly +relies on the quality of captured data by onboard sensors such as Light +Detection And Ranging (LiDAR) and camera. Thus, this paper proposes a novel +modular agnostic filtration pipeline based on intensity and spatial information +such as local point density for removal of detected smoke particles from Point +Cloud (PCL) prior to its utilization for collision detection. Furthermore, the +efficacy of the proposed framework in the presence of smoke during multiple +frontier exploration missions is investigated while the experimental results +are presented to facilitate comparison with other methodologies and their +computational impact. This provides valuable insight to the research community +for better utilization of filtration schemes based on available computation +resources while considering the safe autonomous navigation of mobile robots. + +
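A rough sketch of intensity- and local-density-based point filtering of the kind described above, with made-up thresholds and radius; the actual pipeline is modular and tuned to the robot's sensors.

```python
# Remove likely smoke returns from a LiDAR point cloud before collision checking:
# smoke points tend to have low intensity and low local point density.

import numpy as np
from scipy.spatial import cKDTree

def filter_smoke(points, intensity, radius=0.3, min_neighbors=8, min_intensity=0.15):
    """points: (N, 3) array, intensity: (N,) array in [0, 1]. Returns kept points."""
    tree = cKDTree(points)
    neighbor_counts = np.array([len(idx) - 1 for idx in
                                tree.query_ball_point(points, r=radius)])
    keep = (intensity >= min_intensity) | (neighbor_counts >= min_neighbors)
    return points[keep]

rng = np.random.default_rng(1)
pts = rng.uniform(-5, 5, size=(1000, 3))
inten = rng.uniform(0, 1, size=1000)
print(filter_smoke(pts, inten).shape)
```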
+
+ comment: Accepted in the 49th Annual Conference of the IEEE Industrial + Electronics Society [IECON2023] +
+
+
+
+
+ + ☆ Large-kernel Attention for Efficient and Robust Brain Lesion + Segmentation + + +
+ Vision transformers are effective deep learning models for vision tasks, +including medical image segmentation. However, they lack efficiency and +translational invariance, unlike convolutional neural networks (CNNs). To model +long-range interactions in 3D brain lesion segmentation, we propose an +all-convolutional transformer block variant of the U-Net architecture. We +demonstrate that our model offers the best balance among three factors: +performance competitive with the state-of-the-art; the parameter efficiency of a +CNN; and the favourable inductive biases of a transformer. Our public +implementation is available at https://github.com/liamchalcroft/MDUNet. +
+
+
+
+
+ + ☆ AAFACE: Attribute-aware Attentional Network for Face Recognition ICIP 2023 + + +
+ In this paper, we present a new multi-branch neural network that +simultaneously performs soft biometric (SB) prediction as an auxiliary modality +and face recognition (FR) as the main task. Our proposed network named AAFace +utilizes SB attributes to enhance the discriminative ability of FR +representation. To achieve this goal, we propose an attribute-aware attentional +integration (AAI) module to perform weighted integration of FR with SB feature +maps. Our proposed AAI module is not only fully context-aware but also capable +of learning complex relationships between input features by means of the +sequential multi-scale channel and spatial sub-modules. Experimental results +verify the superiority of our proposed network compared with the +state-of-the-art (SoTA) SB prediction and FR methods. + +
+
+ comment: Accepted to $30^{th}$ IEEE International Conference on Image + Processing (ICIP 2023) as an oral presentation +
+
+
+
+
+ + ☆ UniWorld: Autonomous Driving Pre-training via World Models + + +
+ In this paper, we draw inspiration from Alberto Elfes' pioneering work in +1989, where he introduced the concept of the occupancy grid as a World Model for +robots. We imbue the robot with a spatial-temporal world model, termed +UniWorld, to perceive its surroundings and predict the future behavior of other +participants. UniWorld involves initially predicting 4D geometric occupancy as +the World Model in the foundational stage and subsequently fine-tuning on +downstream tasks. UniWorld can estimate missing information concerning the +world state and predict plausible future states of the world. Besides, +UniWorld's pre-training process is label-free, enabling the utilization of +massive amounts of image-LiDAR pairs to build a Foundational Model. The proposed +unified pre-training framework demonstrates promising results in key tasks such +as motion prediction, multi-camera 3D object detection, and surrounding +semantic scene completion. When compared to monocular pre-training methods on +the nuScenes dataset, UniWorld shows a significant improvement of about 1.5% in +IoU for motion prediction, 2.0% in mAP and 2.0% in NDS for multi-camera 3D +object detection, as well as a 3% increase in mIoU for surrounding semantic +scene completion. By adopting our unified pre-training method, a 25% reduction +in 3D training annotation costs can be achieved, offering significant practical +value for the implementation of real-world autonomous driving. Codes are +publicly available at https://github.com/chaytonmin/UniWorld. +
+
+ comment: 8 pages, 5 figures. arXiv admin note: substantial text overlap with + arXiv:2305.18829 +
+
+
+
+
+ + ☆ RestoreFormer++: Towards Real-World Blind Face Restoration from + Undegraded Key-Value Pairs + + +
+ Blind face restoration aims at recovering high-quality face images from those +with unknown degradations. Current algorithms mainly introduce priors to +complement high-quality details and achieve impressive progress. However, most +of these algorithms ignore abundant contextual information in the face and its +interplay with the priors, leading to sub-optimal performance. Moreover, they +pay less attention to the gap between the synthetic and real-world scenarios, +limiting the robustness and generalization to real-world applications. In this +work, we propose RestoreFormer++, which on the one hand introduces +fully-spatial attention mechanisms to model the contextual information and the +interplay with the priors, and on the other hand, explores an extending +degrading model to help generate more realistic degraded face images to +alleviate the synthetic-to-real-world gap. Compared with current algorithms, +RestoreFormer++ has several crucial benefits. First, instead of using a +multi-head self-attention mechanism like the traditional visual transformer, we +introduce multi-head cross-attention over multi-scale features to fully explore +spatial interactions between corrupted information and high-quality priors. In +this way, it can facilitate RestoreFormer++ to restore face images with higher +realness and fidelity. Second, in contrast to the recognition-oriented +dictionary, we learn a reconstruction-oriented dictionary as priors, which +contains more diverse high-quality facial details and better accords with the +restoration target. Third, we introduce an extending degrading model that +contains more realistic degraded scenarios for training data synthesizing, and +thus helps to enhance the robustness and generalization of our RestoreFormer++ +model. Extensive experiments show that RestoreFormer++ outperforms +state-of-the-art algorithms on both synthetic and real-world datasets. + +
+
+ comment: Submitted to TPAMI. An extension of RestoreFormer +
+
+
+
+
+ + ☆ DS-Depth: Dynamic and Static Depth Estimation via a Fusion Cost Volume + + +
+ Self-supervised monocular depth estimation methods typically rely on the +reprojection error to capture geometric relationships between successive frames +in static environments. However, this assumption does not hold for dynamic +objects in the scene, leading to errors during the view synthesis stage, such +as feature mismatch and occlusion, which can significantly reduce the accuracy +of the generated depth maps. To address this problem, we propose a novel +dynamic cost volume that exploits residual optical flow to describe moving +objects, improving incorrectly occluded regions in static cost volumes used in +previous work. Nevertheless, the dynamic cost volume inevitably generates extra +occlusions and noise, so we alleviate this by designing a fusion module that +makes static and dynamic cost volumes compensate for each other. In other +words, occlusion from the static volume is refined by the dynamic volume, and +incorrect information from the dynamic volume is eliminated by the static +volume. Furthermore, we propose a pyramid distillation loss to reduce +photometric error inaccuracy at low resolutions and an adaptive photometric +error loss to alleviate the flow direction of the large gradient in the +occlusion regions. We conducted extensive experiments on the KITTI and +Cityscapes datasets, and the results demonstrate that our model outperforms +previously published baselines for self-supervised monocular depth estimation. +
+
+
+
+
+ + ☆ Distance Matters For Improving Performance Estimation Under Covariate + Shift ICCV + + +
+ Performance estimation under covariate shift is a crucial component of safe +AI model deployment, especially for sensitive use-cases. Recently, several +solutions were proposed to tackle this problem, most leveraging model +predictions or softmax confidence to derive accuracy estimates. However, under +dataset shifts, confidence scores may become ill-calibrated if samples are too +far from the training distribution. In this work, we show that taking into +account distances of test samples to their expected training distribution can +significantly improve performance estimation under covariate shift. Precisely, +we introduce a "distance-check" to flag samples that lie too far from the +expected distribution, to avoid relying on their untrustworthy model outputs in +the accuracy estimation step. We demonstrate the effectiveness of this method +on 13 image classification tasks, across a wide-range of natural and synthetic +distribution shifts and hundreds of models, with a median relative MAE +improvement of 27% over the best baseline across all tasks, and SOTA +performance on 10 out of 13 tasks. Our code is publicly available at +https://github.com/melanibe/distance_matters_performance_estimation. + +
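A toy version of such a distance-check is shown below: test samples whose feature-space distance to the training set exceeds a threshold derived from the training data are flagged as untrusted and can be excluded from the confidence-based accuracy estimate. The percentile rule and k-NN distance are assumptions for illustration, not the paper's exact recipe.

```python
# Flag test samples that lie too far from the training feature distribution.

import numpy as np

def distance_check(train_feats, test_feats, k=5, percentile=95):
    def knn_dist(query, reference):
        d = np.linalg.norm(query[:, None, :] - reference[None, :, :], axis=-1)
        return np.sort(d, axis=1)[:, :k].mean(axis=1)

    train_d = knn_dist(train_feats, train_feats)   # includes the zero self-distance
    threshold = np.percentile(train_d, percentile)
    test_d = knn_dist(test_feats, train_feats)
    return test_d <= threshold                     # True = trusted for estimation

rng = np.random.default_rng(0)
trusted = distance_check(rng.normal(size=(200, 16)),
                         rng.normal(2.0, 1.0, size=(50, 16)))
print(trusted.mean())
```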
+
+ comment: Accepted to ICCV Workshop on Uncertainty Quantification for Computer + Vision 2023 +
+
+
+
+
+ + ☆ Automated Ensemble-Based Segmentation of Adult Brain Tumors: A Novel + Approach Using the BraTS AFRICA Challenge Data + + +
+ Brain tumors, particularly glioblastoma, continue to challenge medical +diagnostics and treatments globally. This paper explores the application of +deep learning to multi-modality magnetic resonance imaging (MRI) data for +enhanced brain tumor segmentation precision in the Sub-Saharan Africa patient +population. We introduce an ensemble method that comprises eleven unique +variations based on three core architectures: UNet3D, ONet3D, SphereNet3D and +modified loss functions. The study emphasizes the need for both age- and +population-based segmentation models, to fully account for the complexities in +the brain. Our findings reveal that the ensemble approach, combining different +architectures, outperforms single models, leading to improved evaluation +metrics. Specifically, the results exhibit Dice scores of 0.82, 0.82, and 0.87 +for enhancing tumor, tumor core, and whole tumor labels respectively. These +results underline the potential of tailored deep learning techniques in +precisely segmenting brain tumors and lay groundwork for future work to +fine-tune models and assess performance across different brain regions. + +
+
+ comment: 3 figs and 3 tables +
+
+
+
+
+ + ☆ Automated Ensemble-Based Segmentation of Pediatric Brain Tumors: A Novel + Approach Using the CBTN-CONNECT-ASNR-MICCAI BraTS-PEDs 2023 Challenge Data + + +
+ Brain tumors remain a critical global health challenge, necessitating +advancements in diagnostic techniques and treatment methodologies. In response +to the growing need for age-specific segmentation models, particularly for +pediatric patients, this study explores the deployment of deep learning +techniques using magnetic resonance imaging (MRI) modalities. By introducing a +novel ensemble approach using ONet and modified versions of UNet, coupled with +innovative loss functions, this study achieves a precise segmentation model for +the BraTS-PEDs 2023 Challenge. Data augmentation, including both single and +composite transformations, ensures model robustness and accuracy across +different scanning protocols. The ensemble strategy, integrating the ONet and +UNet models, shows greater effectiveness in capturing specific features and +modeling diverse aspects of the MRI images which result in lesion_wise dice +scores of 0.52, 0.72 and 0.78 for enhancing tumor, tumor core and whole tumor +labels respectively. Visual comparisons further confirm the superiority of the +ensemble method in accurate tumor region coverage. The results indicate that +this advanced ensemble approach, building upon the unique strengths of +individual models, offers promising prospects for enhanced diagnostic accuracy +and effective treatment planning for brain tumors in pediatric brains. + +
+
+ comment: 3 Figs, 3 Tables +
+
+
+
+
+ + ☆ Unified Data-Free Compression: Pruning and Quantization without + Fine-Tuning ICCV2023 + + +
+ Structured pruning and quantization are promising approaches for reducing the +inference time and memory footprint of neural networks. However, most existing +methods require the original training dataset to fine-tune the model. This not +only brings heavy resource consumption but also is not possible for +applications with sensitive or proprietary data due to privacy and security +concerns. Therefore, a few data-free methods have been proposed to address this +problem, but they perform data-free pruning and quantization separately, which +does not explore the complementarity of pruning and quantization. In this +paper, we propose a novel framework named Unified Data-Free Compression (UDFC), +which performs pruning and quantization simultaneously without any data or +fine-tuning process. Specifically, UDFC starts with the assumption that the +partial information of a damaged (e.g., pruned or quantized) channel can be +preserved by a linear combination of other channels, and then derives the +reconstruction form from the assumption to restore the information loss due to +compression. Finally, we formulate the reconstruction error between the +original network and its compressed network, and theoretically deduce the +closed-form solution. We evaluate UDFC on the large-scale image +classification task and obtain significant improvements over various network +architectures and compression methods. For example, we achieve a 20.54% +accuracy improvement on the ImageNet dataset compared to the SOTA method with 30% +pruning ratio and 6-bit quantization on ResNet-34. +
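The core assumption can be illustrated numerically: approximate a pruned channel as a linear combination of the remaining channels, here with ordinary least squares on random weights; the paper's closed-form reconstruction of the full compression error is not reproduced in this sketch.

```python
# Approximate a pruned output channel by a linear combination of kept channels.

import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(64, 3 * 3 * 32))       # 64 output channels, flattened kernels
pruned = 10                                 # index of the channel to remove
kept = [i for i in range(W.shape[0]) if i != pruned]

# Solve  W[kept].T @ alpha  ≈  W[pruned]  for the mixing coefficients alpha.
alpha, *_ = np.linalg.lstsq(W[kept].T, W[pruned], rcond=None)
reconstruction = W[kept].T @ alpha
rel_error = np.linalg.norm(W[pruned] - reconstruction) / np.linalg.norm(W[pruned])
print(f"relative reconstruction error: {rel_error:.3f}")
```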
+
+ comment: ICCV2023 +
+
+
+
+
+ + ☆ FOLT: Fast Multiple Object Tracking from UAV-captured Videos Based on + Optical Flow + + +
+ Multiple object tracking (MOT) has been successfully investigated in computer +vision. + However, MOT for the videos captured by unmanned aerial vehicles (UAV) is +still challenging due to small object size, blurred object appearance, and very +large and/or irregular motion in both ground objects and UAV platforms. + In this paper, we propose FOLT to mitigate these problems and reach fast and +accurate MOT in UAV view. + Aiming at speed-accuracy trade-off, FOLT adopts a modern detector and +light-weight optical flow extractor to extract object detection features and +motion features at a minimum cost. + Given the extracted flow, the flow-guided feature augmentation is designed to +augment the object detection feature based on its optical flow, which improves +the detection of small objects. + Then the flow-guided motion prediction is also proposed to predict the +object's position in the next frame, which improves the tracking performance of +objects with very large displacements between adjacent frames. + Finally, the tracker matches the detected objects and predicted objects using +a spatially matching scheme to generate tracks for every object. + Experiments on Visdrone and UAVDT datasets show that our proposed model can +successfully track small objects with large and irregular motion and outperform +existing state-of-the-art methods in UAV-MOT tasks. + +
+
+
+
+
+ + ☆ Towards Robust Real-Time Scene Text Detection: From Semantic to Instance + Representation Learning ACM MM 2023 + + +
+ Due to the flexible representation of arbitrary-shaped scene text and simple +pipeline, bottom-up segmentation-based methods begin to be mainstream in +real-time scene text detection. Despite great progress, these methods show +deficiencies in robustness and still suffer from false positives and instance +adhesion. Different from existing methods which integrate multiple-granularity +features or multiple outputs, we resort to the perspective of representation +learning in which auxiliary tasks are utilized to enable the encoder to jointly +learn robust features with the main task of per-pixel classification during +optimization. For semantic representation learning, we propose global-dense +semantic contrast (GDSC), in which a vector is extracted for global semantic +representation, then used to perform element-wise contrast with the dense grid +features. To learn instance-aware representation, we propose to combine +top-down modeling (TDM) with the bottom-up framework to provide implicit +instance-level clues for the encoder. With the proposed GDSC and TDM, the +encoder network learns stronger representation without introducing any +parameters and computations during inference. Equipped with a very light +decoder, the detector can achieve more robust real-time scene text detection. +Experimental results on four public datasets show that the proposed method can +outperform or be comparable to the state-of-the-art on both accuracy and speed. +Specifically, the proposed method achieves 87.2% F-measure with 48.2 FPS on +Total-Text and 89.6% F-measure with 36.9 FPS on MSRA-TD500 on a single GeForce +RTX 2080 Ti GPU. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ SEMI-CenterNet: A Machine Learning Facilitated Approach for + Semiconductor Defect Inspection + + +
+ Continual shrinking of pattern dimensions in the semiconductor domain is +making it increasingly difficult to inspect defects due to factors such as the +presence of stochastic noise and the dynamic behavior of defect patterns and +types. Conventional rule-based methods and non-parametric supervised machine +learning algorithms like KNN mostly fail at the requirements of semiconductor +defect inspection at these advanced nodes. Deep Learning (DL)-based methods +have gained popularity in the semiconductor defect inspection domain because +they have been proven robust towards these challenging scenarios. In this +research work, we have presented an automated DL-based approach for efficient +localization and classification of defects in SEM images. We have proposed +SEMI-CenterNet (SEMI-CN), a customized CN architecture trained on SEM images of +semiconductor wafer defects. The use of the proposed CN approach allows +improved computational efficiency compared to previously studied DL models. +SEMI-CN gets trained to output the center, class, size, and offset of a defect +instance. This is different from the approach of most object detection models +that use anchors for bounding box prediction. Previous methods predict +redundant bounding boxes, most of which are discarded in postprocessing. CN +mitigates this by only predicting boxes for likely defect center points. We +train SEMI-CN on two datasets and benchmark two ResNet backbones for the +framework. Initially, ResNet models pretrained on the COCO dataset undergo +training using two datasets separately. Primarily, SEMI-CN shows significant +improvement in inference time against previous research works. Finally, +transfer learning (using weights of custom SEM dataset) is applied from ADI +dataset to AEI dataset and vice-versa, which reduces the required training time +for both backbones to reach the best mAP against conventional training method. + +
+
+
+
+
+ + ☆ HyperSparse Neural Networks: Shifting Exploration to Exploitation + through Adaptive Regularization ICCV'23 + + +
+ Sparse neural networks are a key factor in developing resource-efficient +machine learning applications. We propose the novel and powerful sparse +learning method Adaptive Regularized Training (ART) to compress dense into +sparse networks. Instead of the commonly used binary mask during training to +reduce the number of model weights, we inherently shrink weights close to zero +in an iterative manner with increasing weight regularization. Our method +compresses the pre-trained model knowledge into the weights of highest +magnitude. Therefore, we introduce a novel regularization loss named +HyperSparse that exploits the highest weights while conserving the ability of +weight exploration. Extensive experiments on CIFAR and TinyImageNet show that +our method leads to notable performance gains compared to other sparsification +methods, especially in extremely high sparsity regimes up to 99.8 percent model +sparsity. Additional investigations provide new insights into the patterns that +are encoded in weights with high magnitudes. + +
+
+ comment: ICCV'23 Workshops +
+
+
+
+
+ + ☆ SAM Meets Robotic Surgery: An Empirical Study on Generalization, + Robustness and Adaptation MICCAI 2023 + + +
+ The Segment Anything Model (SAM) serves as a fundamental model for semantic +segmentation and demonstrates remarkable generalization capabilities across a +wide range of downstream scenarios. In this empirical study, we examine SAM's +robustness and zero-shot generalizability in the field of robotic surgery. We +comprehensively explore different scenarios, including prompted and unprompted +situations, bounding box and points-based prompt approaches, as well as the +ability to generalize under corruptions and perturbations at five severity +levels. Additionally, we compare the performance of SAM with state-of-the-art +supervised models. We conduct all the experiments with two well-known robotic +instrument segmentation datasets from MICCAI EndoVis 2017 and 2018 challenges. +Our extensive evaluation results reveal that although SAM shows remarkable +zero-shot generalization ability with bounding box prompts, it struggles to +segment the whole instrument with point-based prompts and unprompted settings. +Furthermore, our qualitative figures demonstrate that the model either failed +to predict certain parts of the instrument mask (e.g., jaws, wrist) or +predicted parts of the instrument as wrong classes in the scenario of +overlapping instruments within the same bounding box or with the point-based +prompt. In fact, SAM struggles to identify instruments in complex surgical +scenarios characterized by the presence of blood, reflection, blur, and shade. +Additionally, SAM is insufficiently robust to maintain high performance when +subjected to various forms of data corruption. We also attempt to fine-tune SAM +using Low-rank Adaptation (LoRA) and propose SurgicalSAM, which shows the +capability in class-wise mask prediction without prompt. Therefore, we can +argue that, without further domain-specific fine-tuning, SAM is not ready for +downstream surgical tasks. + +
+
+ comment: Accepted as Oral Presentation at MedAGI Workshop - MICCAI 2023 1st + International Workshop on Foundation Models for General Medical AI. arXiv + admin note: substantial text overlap with arXiv:2304.14674 +
+
+
+
+
+ + ☆ DELO: Deep Evidential LiDAR Odometry using Partial Optimal Transport ICCV 2023 + + +
+ Accurate, robust, and real-time LiDAR-based odometry (LO) is imperative for +many applications like robot navigation, globally consistent 3D scene map +reconstruction, or safe motion-planning. Though the LiDAR sensor is known for its +precise range measurement, the non-uniform and uncertain point sampling density +induces structural inconsistencies. Hence, existing supervised and unsupervised +point set registration methods fail to establish one-to-one matching +correspondences between LiDAR frames. We introduce a novel deep learning-based +real-time (approx. 35-40ms per frame) LO method that jointly learns accurate +frame-to-frame correspondences and the model's predictive uncertainty (PU) as +evidence to safeguard LO predictions. In this work, we (i) propose partial +optimal transportation of LiDAR feature descriptors for robust LO estimation, +(ii) jointly learn predictive uncertainty while learning odometry over +driving sequences, and (iii) demonstrate how PU can serve as evidence for +necessary pose-graph optimization when the LO network is either under- or +over-confident. We evaluate our method on the KITTI dataset and show competitive +performance, and even superior generalization ability over recent state-of-the-art +approaches. Source codes are available. +
+
+ comment: Accepted in ICCV 2023 Workshop +
+
+
+
+
+ + ☆ Diffusion Based Augmentation for Captioning and Retrieval in Cultural + Heritage ICCV 2023 + + +
+ Cultural heritage applications and advanced machine learning models are +creating a fruitful synergy to provide effective and accessible ways of +interacting with artworks. Smart audio-guides, personalized art-related content +and gamification approaches are just a few examples of how technology can be +exploited to provide additional value to artists or exhibitions. Nonetheless, +from a machine learning point of view, the amount of available artistic data is +often not enough to train effective models. Off-the-shelf computer vision +modules can still be exploited to some extent, yet a severe domain shift is +present between art images and standard natural image datasets used to train +such models. As a result, this can lead to degraded performance. This paper +introduces a novel approach to address the challenges of limited annotated data +and domain shifts in the cultural heritage domain. By leveraging generative +vision-language models, we augment art datasets by generating diverse +variations of artworks conditioned on their captions. This augmentation +strategy enhances dataset diversity, bridging the gap between natural images +and artworks, and improving the alignment of visual cues with knowledge from +general-purpose datasets. The generated variations assist in training vision +and language models with a deeper understanding of artistic characteristics and +that are able to generate better captions with appropriate jargon. + +
+
+ comment: Accepted at ICCV 2023 4th Workshop on e-Heritage +
+
+
+
+
+ + ☆ CTP: Towards Vision-Language Continual Pretraining via Compatible + Momentum Contrast and Topology Preservation ICCV 2023 + + +
+ Vision-Language Pretraining (VLP) has shown impressive results on diverse +downstream tasks by offline training on large-scale datasets. Regarding the +growing nature of real-world data, such an offline training paradigm on +ever-expanding data is unsustainable, because models lack the continual +learning ability to accumulate knowledge constantly. However, most continual +learning studies are limited to uni-modal classification and existing +multi-modal datasets cannot simulate continual non-stationary data stream +scenarios. To support the study of Vision-Language Continual Pretraining +(VLCP), we first contribute a comprehensive and unified benchmark dataset P9D +which contains over one million product image-text pairs from 9 industries. The +data from each industry as an independent task supports continual learning and +conforms to the real-world long-tail nature to simulate pretraining on web +data. We comprehensively study the characteristics and challenges of VLCP, and +propose a new algorithm: Compatible momentum contrast with Topology +Preservation, dubbed CTP. The compatible momentum model absorbs the knowledge +of the current and previous-task models to flexibly update the modal feature. +Moreover, Topology Preservation transfers the knowledge of embedding across +tasks while preserving the flexibility of feature adjustment. The experimental +results demonstrate our method not only achieves superior performance compared +with other baselines but also does not bring an expensive training burden. +Dataset and codes are available at https://github.com/KevinLight831/CTP. + +
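The momentum-model ingredient can be illustrated with a standard EMA update and an InfoNCE loss; this generic MoCo-style sketch only shows the mechanism and omits CTP's compatibility and topology-preservation terms.

```python
# EMA ("momentum") copy of the current encoder provides stable contrastive targets.

import copy
import torch
import torch.nn.functional as F

def ema_update(momentum_model, model, m=0.999):
    with torch.no_grad():
        for pm, p in zip(momentum_model.parameters(), model.parameters()):
            pm.mul_(m).add_(p, alpha=1.0 - m)

def info_nce(q, k, temperature=0.07):
    q, k = F.normalize(q, dim=1), F.normalize(k, dim=1)
    logits = q @ k.t() / temperature               # positives on the diagonal
    return F.cross_entropy(logits, torch.arange(q.size(0)))

encoder = torch.nn.Linear(128, 64)                 # stand-in for the real backbone
momentum_encoder = copy.deepcopy(encoder)
x_img, x_txt = torch.randn(8, 128), torch.randn(8, 128)
loss = info_nce(encoder(x_img), momentum_encoder(x_txt).detach())
loss.backward()
ema_update(momentum_encoder, encoder)
print(float(loss))
```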
+
+ comment: Accepted by ICCV 2023. Code: https://github.com/KevinLight831/CTP +
+
+
+
+
+ + ☆ A Time-aware tensor decomposition for tracking evolving patterns + + +
+ Time-evolving data sets can often be arranged as a higher-order tensor with +one of the modes being the time mode. While tensor factorizations have been +successfully used to capture the underlying patterns in such higher-order data +sets, the temporal aspect is often ignored, allowing for the reordering of time +points. In recent studies, temporal regularizers are incorporated in the time +mode to tackle this issue. Nevertheless, existing approaches still do not allow +underlying patterns to change in time (e.g., spatial changes in the brain, +contextual changes in topics). In this paper, we propose temporal PARAFAC2 +(tPARAFAC2): a PARAFAC2-based tensor factorization method with temporal +regularization to extract gradually evolving patterns from temporal data. +Through extensive experiments on synthetic data, we demonstrate that tPARAFAC2 +can capture the underlying evolving patterns accurately, performing better than +PARAFAC2 and coupled matrix factorization with temporal smoothness +regularization. +
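The temporal regularization can be pictured as a smoothness penalty on adjacent time-slice factors, as in the toy sketch below; the PARAFAC2 factorization itself and its constraints are omitted, and the penalty form is an illustrative assumption.

```python
# Penalise change between pattern matrices of adjacent time points so that
# extracted patterns evolve gradually rather than jumping arbitrarily.

import numpy as np

def temporal_smoothness_penalty(factors, strength=1.0):
    """factors: list of (rank x features) pattern matrices, one per time point."""
    penalty = 0.0
    for prev, cur in zip(factors[:-1], factors[1:]):
        penalty += np.sum((cur - prev) ** 2)
    return strength * penalty

slowly_evolving = [np.full((3, 20), t * 0.01) for t in range(10)]
print(temporal_smoothness_penalty(slowly_evolving))
```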
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ An Outlook into the Future of Egocentric Vision + + +
+ What will the future be? We wonder! In this survey, we explore the gap +between current research in egocentric vision and the ever-anticipated future, +where wearable computing, with outward facing cameras and digital overlays, is +expected to be integrated in our every day lives. To understand this gap, the +article starts by envisaging the future through character-based stories, +showcasing through examples the limitations of current technology. We then +provide a mapping between this future and previously defined research tasks. +For each task, we survey its seminal works, current state-of-the-art +methodologies and available datasets, then reflect on shortcomings that limit +its applicability to future research. Note that this survey focuses on software +models for egocentric vision, independent of any specific hardware. The paper +concludes with recommendations for areas of immediate explorations so as to +unlock our path to the future always-on, personalised and life-enhancing +egocentric vision. + +
+
+ comment: We invite comments, suggestions and corrections here: + https://openreview.net/forum?id=V3974SUk1w +
+
+
+
+
+ + ☆ On the Importance of Spatial Relations for Few-shot Action Recognition + + +
+ Deep learning has achieved great success in video recognition, yet still +struggles to recognize novel actions when faced with only a few examples. To +tackle this challenge, few-shot action recognition methods have been proposed +to transfer knowledge from a source dataset to a novel target dataset with only +one or a few labeled videos. However, existing methods mainly focus on modeling +the temporal relations between the query and support videos while ignoring the +spatial relations. In this paper, we find that the spatial misalignment between +objects also occurs in videos, notably more common than the temporal +inconsistency. We are thus motivated to investigate the importance of spatial +relations and propose a more accurate few-shot action recognition method that +leverages both spatial and temporal information. Particularly, a novel Spatial +Alignment Cross Transformer (SA-CT) which learns to re-adjust the spatial +relations and incorporates the temporal information is contributed. Experiments +reveal that, even without using any temporal information, the performance of +SA-CT is comparable to temporal based methods on 3/4 benchmarks. To further +incorporate the temporal information, we propose a simple yet effective +Temporal Mixer module. The Temporal Mixer enhances the video representation and +improves the performance of the full SA-CT model, achieving very competitive +results. In this work, we also exploit large-scale pretrained models for +few-shot action recognition, providing useful insights for this research +direction. + +
+
+
+
+
+ + ☆ SCSC: Spatial Cross-scale Convolution Module to Strengthen both CNNs and + Transformers ICCV2023 + + +
+ This paper presents a module, Spatial Cross-scale Convolution (SCSC), which +is verified to be effective in improving both CNNs and Transformers. Nowadays, +CNNs and Transformers have been successful in a variety of tasks. Especially +for Transformers, an increasing number of works achieve state-of-the-art performance in the +computer vision community. Therefore, researchers have started to explore the +mechanism of those architectures. Large receptive fields, sparse connections, +weight sharing, and dynamic weight have been considered keys to designing +effective base models. However, there are still some issues to be addressed: +large dense kernels and self-attention are inefficient, and large receptive +fields make it hard to capture local features. Inspired by the above analyses +and to solve the mentioned problems, in this paper, we design a general module +taking in these design keys to enhance both CNNs and Transformers. SCSC +introduces an efficient spatial cross-scale encoder and spatial embed module to +capture assorted features in one layer. On the face recognition task, +FaceResNet with SCSC can improve by 2.7% with 68% fewer FLOPs and 79% fewer +parameters. On the ImageNet classification task, Swin Transformer with SCSC can +achieve even better performance with 22% fewer FLOPs, and ResNet with SCSC can +improve by 5.3% with similar complexity. Furthermore, a traditional network (e.g., +ResNet) embedded with SCSC can match Swin Transformer's performance. +
+
+ comment: ICCV2023 Workshop (New Ideas in Vision Transformers) +
+
+
+
+
+ + ☆ Checklist to Transparently Define Test Oracles for TP, FP, and FN + Objects in Automated Driving + + +
+ Popular test oracles for the perception subsystem of driving automation +systems identify true-positive (TP), false-positive (FP), and false-negative +(FN) objects. Oracle transparency is needed for comparing test results and for +safety cases. To date, there exists a common notion of TPs, FPs, and FNs in the +field, but apparently no published way to comprehensively define their oracles. +Therefore, this paper provides a checklist of functional aspects and +implementation details that affect the oracle behavior. Besides labeling +policies of the test set, we cover fields of view, occlusion handling, +safety-relevant areas, matching criteria, temporal and probabilistic issues, +and further aspects. Even though our checklist can hardly be formalized, it can +help practitioners maximize the transparency of their oracles, which, in turn, +makes statements on object perception more reliable and comparable. + +
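As one example of the implementation details the checklist covers, a minimal IoU-based greedy matching that produces TP/FP/FN counts is sketched below; both the IoU threshold and the greedy (rather than optimal) assignment are exactly the kinds of choices such an oracle should document.

```python
# Count TP/FP/FN by greedily matching predicted boxes to ground-truth boxes.

def iou(a, b):  # boxes as (x1, y1, x2, y2)
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area = lambda r: (r[2] - r[0]) * (r[3] - r[1])
    return inter / (area(a) + area(b) - inter + 1e-9)

def match(gt_boxes, pred_boxes, iou_threshold=0.5):
    unmatched_gt = list(range(len(gt_boxes)))
    tp = 0
    for p in pred_boxes:
        best = max(unmatched_gt, key=lambda g: iou(gt_boxes[g], p), default=None)
        if best is not None and iou(gt_boxes[best], p) >= iou_threshold:
            unmatched_gt.remove(best)
            tp += 1
    return {"TP": tp, "FP": len(pred_boxes) - tp, "FN": len(unmatched_gt)}

print(match([(0, 0, 10, 10), (20, 20, 30, 30)],
            [(1, 1, 10, 10), (50, 50, 60, 60)]))
```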
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ FocusFlow: Boosting Key-Points Optical Flow Estimation for Autonomous + Driving + + +
+ Key-point-based scene understanding is fundamental for autonomous driving
+applications. At the same time, optical flow plays an important role in many
+vision tasks. However, due to the implicit bias of equal attention on all
+points, classic data-driven optical flow estimation methods yield less
+satisfactory performance on key points, limiting their implementations in
+key-point-critical safety-relevant scenarios. To address these issues, we
+introduce a points-based modeling method that requires the model to learn
+key-point-related priors explicitly. Based on the modeling method, we present
+FocusFlow, a framework consisting of 1) a mixed loss function that combines a
+classic photometric loss with our proposed Conditional Point Control Loss
+(CPCL) for diverse point-wise supervision; 2) a conditioned controlling model
+which replaces the conventional feature encoder with our proposed Condition
+Control Encoder (CCE). CCE incorporates a Frame Feature Encoder (FFE) that
+extracts features from frames, a Condition Feature Encoder (CFE) that learns to
+control the feature extraction behavior of FFE from input masks containing
+key-point information, and fusion modules that transfer the controlling
+information between FFE and CFE. Our FocusFlow framework shows outstanding
+performance with up to +44.5% precision improvement on various key points such
+as ORB, SIFT, and even learning-based SiLK, along with exceptional scalability
+for most existing data-driven optical flow methods like PWC-Net, RAFT, and
+FlowFormer. Notably, FocusFlow yields competitive or superior performance
+rivaling the original models on the whole frame. The source code will be
+available at https://github.com/ZhonghuaYi/FocusFlow_official.
+
+
+ comment: The source code of FocusFlow will be available at + https://github.com/ZhonghuaYi/FocusFlow_official +
+
+
+
+
+ + ☆ Temporal Sentence Grounding in Streaming Videos ACM MM 2023 + + +
+ This paper aims to tackle a novel task - Temporal Sentence Grounding in +Streaming Videos (TSGSV). The goal of TSGSV is to evaluate the relevance +between a video stream and a given sentence query. Unlike regular videos, +streaming videos are acquired continuously from a particular source, and are +always desired to be processed on-the-fly in many applications such as +surveillance and live-stream analysis. Thus, TSGSV is challenging since it +requires the model to infer without future frames and process long historical +frames effectively, which is untouched in the early methods. To specifically +address the above challenges, we propose two novel methods: (1) a TwinNet +structure that enables the model to learn about upcoming events; and (2) a +language-guided feature compressor that eliminates redundant visual frames and +reinforces the frames that are relevant to the query. We conduct extensive +experiments using ActivityNet Captions, TACoS, and MAD datasets. The results +demonstrate the superiority of our proposed methods. A systematic ablation +study also confirms their effectiveness. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ Masked Motion Predictors are Strong 3D Action Representation Learners ICCV 2023 + + +
+ In 3D human action recognition, limited supervised data makes it challenging
+to fully tap into the modeling potential of powerful networks such as
+transformers. As a result, researchers have been actively investigating
+effective self-supervised pre-training strategies. In this work, we show that
+instead of following the prevalent pretext task of masked self-component
+reconstruction in human joints, explicit contextual motion modeling is key to
+the success of learning effective feature representations for 3D action
+recognition. Formally, we propose the Masked Motion Prediction (MAMP)
+framework. To be specific, the proposed MAMP takes as input the masked
+spatio-temporal skeleton sequence and predicts the corresponding temporal
+motion of the masked human joints. Considering the high temporal redundancy of
+the skeleton sequence, in our MAMP, the motion information also acts as an
+empirical semantic richness prior that guides the masking process, promoting
+better attention to semantically rich temporal regions. Extensive experiments
+on the NTU-60, NTU-120, and PKU-MMD datasets show that the proposed MAMP
+pre-training substantially improves the performance of the adopted vanilla
+transformer, achieving state-of-the-art results without bells and whistles. The
+source code of our MAMP is available at https://github.com/maoyunyao/MAMP.
+
+
+ comment: To appear in ICCV 2023 +
+
+
+
+
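+
+ A minimal PyTorch sketch of the motion-guided masking idea outlined above; the
+(T, J, 3) joint layout, the 90% mask ratio, and the softmax-based sampling rule
+are illustrative assumptions rather than the authors' exact design:
+
+import torch
+
+def motion_guided_mask(joints, mask_ratio=0.9):
+    # joints: (T, J, 3) skeleton sequence. Temporal motion magnitude serves as
+    # an empirical prior so that high-motion joint-frames are masked (and later
+    # predicted) more often.
+    motion = torch.zeros_like(joints)
+    motion[1:] = joints[1:] - joints[:-1]            # frame-to-frame motion
+    saliency = motion.norm(dim=-1).flatten()         # (T*J,) motion magnitude
+    probs = torch.softmax(saliency, dim=0)
+    n_mask = int(mask_ratio * probs.numel())
+    masked_idx = torch.multinomial(probs, n_mask, replacement=False)
+    mask = torch.zeros(probs.numel(), dtype=torch.bool)
+    mask[masked_idx] = True
+    # Return the boolean mask and the motion tensor used as regression target.
+    return mask.view(joints.shape[0], joints.shape[1]), motion
+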
+ + ☆ ICPC: Instance-Conditioned Prompting with Contrastive Learning for + Semantic Segmentation + + +
+ Modern supervised semantic segmentation methods are usually finetuned based +on the supervised or self-supervised models pre-trained on ImageNet. Recent +work shows that transferring the knowledge from CLIP to semantic segmentation +via prompt learning can achieve promising performance. The performance boost +comes from the feature enhancement with multimodal alignment, i.e., the dot +product between vision and text embeddings. However, how to improve the +multimodal alignment for better transfer performance in dense tasks remains +underexplored. In this work, we focus on improving the quality of vision-text +alignment from two aspects of prompting design and loss function, and present +an instance-conditioned prompting with contrastive learning (ICPC) framework. +First, compared with the static prompt designs, we reveal that dynamic +prompting conditioned on image content can more efficiently utilize the text +encoder for complex dense tasks. Second, we propose an align-guided contrastive +loss to refine the alignment of vision and text embeddings. We further propose +lightweight multi-scale alignment for better performance. Extensive experiments +on three large-scale datasets (ADE20K, COCO-Stuff10k, and ADE20K-Full) +demonstrate that ICPC brings consistent improvements across diverse backbones. +Taking ResNet-50 as an example, ICPC outperforms the state-of-the-art +counterpart by 1.71%, 1.05%, and 1.41% mIoU on the three datasets, +respectively. + +
+
+
+
+
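+
+ As a rough illustration of the vision-text alignment being optimized above,
+the following PyTorch sketch contrasts pooled visual embeddings against class
+text embeddings; the InfoNCE-style form, the pooling, and the temperature of
+0.07 are assumptions and not necessarily the paper's align-guided loss:
+
+import torch
+import torch.nn.functional as F
+
+def align_contrastive_loss(vision_emb, text_emb, labels, tau=0.07):
+    # vision_emb: (N, D) pooled visual embeddings; text_emb: (K, D) class
+    # prompt embeddings; labels: (N,) ground-truth class indices. Pull each
+    # visual embedding toward its class prompt and away from the other prompts.
+    v = F.normalize(vision_emb, dim=-1)
+    t = F.normalize(text_emb, dim=-1)
+    logits = v @ t.t() / tau                          # (N, K) similarity scores
+    return F.cross_entropy(logits, labels)
+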
+ + ☆ Teeth And Root Canals Segmentation Using ZXYFormer With Uncertainty + Guidance And Weight Transfer + + +
+ This study attempts to segment teeth and root-canals simultaneously from CBCT +images, but there are very challenging problems in this process. First, the +clinical CBCT image data is very large (e.g., 672 *688 * 688), and the use of +downsampling operation will lose useful information about teeth and root +canals. Second, teeth and root canals are very different in morphology, and it +is difficult for a simple network to identify them precisely. In addition, +there are weak edges at the tooth, between tooth and root canal, which makes it +very difficult to segment such weak edges. To this end, we propose a +coarse-to-fine segmentation method based on inverse feature fusion transformer +and uncertainty estimation to address above challenging problems. First, we use +the downscaled volume data (e.g., 128 * 128 * 128) to conduct coarse +segmentation and map it to the original volume to obtain the area of teeth and +root canals. Then, we design a transformer with reverse feature fusion, which +can bring better segmentation effect of different morphological objects by +transferring deeper features to shallow features. Finally, we design an +auxiliary branch to calculate and refine the difficult areas in order to +improve the weak edge segmentation performance of teeth and root canals. +Through the combined tooth and root canal segmentation experiment of 157 +clinical high-resolution CBCT data, it is verified that the proposed method is +superior to the existing tooth or root canal segmentation methods. + +
+
+
+
+
+ + ☆ A Local Iterative Approach for the Extraction of 2D Manifolds from + Strongly Curved and Folded Thin-Layer Structures + + +
+ Ridge surfaces represent important features for the analysis of 3-dimensional +(3D) datasets in diverse applications and are often derived from varying +underlying data including flow fields, geological fault data, and point data, +but they can also be present in the original scalar images acquired using a +plethora of imaging techniques. Our work is motivated by the analysis of image +data acquired using micro-computed tomography (Micro-CT) of ancient, rolled and +folded thin-layer structures such as papyrus, parchment, and paper as well as +silver and lead sheets. From these documents we know that they are +2-dimensional (2D) in nature. Hence, we are particularly interested in +reconstructing 2D manifolds that approximate the document's structure. The +image data from which we want to reconstruct the 2D manifolds are often very +noisy and represent folded, densely-layered structures with many artifacts, +such as ruptures or layer splitting and merging. Previous ridge-surface +extraction methods fail to extract the desired 2D manifold for such challenging +data. We have therefore developed a novel method to extract 2D manifolds. The +proposed method uses a local fast marching scheme in combination with a +separation of the region covered by fast marching into two sub-regions. The 2D +manifold of interest is then extracted as the surface separating the two +sub-regions. The local scheme can be applied for both automatic propagation as +well as interactive analysis. We demonstrate the applicability and robustness +of our method on both artificial data as well as real-world data including +folded silver and papyrus sheets. + +
+
+ comment: 16 pages, 21 figures, to be published in IEEE Transactions on + Visualization and Computer Graphics +
+
+
+
+
+ + ☆ Diagnosis of Scalp Disorders using Machine Learning and Deep Learning + Approach -- A Review + + +
+ The morbidity of scalp diseases is minuscule compared to other diseases, but
+the impact on the patient's life is enormous. It is common for people to
+experience scalp problems that include dandruff, psoriasis, tinea capitis,
+alopecia and atopic dermatitis. According to WHO research, approximately 70% of
+adults have problems with their scalp. Descriptive research has demonstrated
+that hair quality is impaired by an unhealthy scalp, but these impacts are
+reversible with early diagnosis and treatment. Deep learning advances have
+demonstrated the effectiveness of CNNs paired with FCNs in diagnosing scalp and
+skin disorders. In one proposed deep-learning-based scalp inspection and
+diagnosis system, an imaging microscope and a trained model are combined with
+an app that classifies scalp disorders accurately with an average precision of
+97.41%-99.09%. Another study classified psoriasis using a CNN with an accuracy
+of 82.9%. In a further study, ML-based algorithms classified healthy scalp and
+alopecia areata with 91.4% and 88.9% accuracy using SVM and KNN, respectively.
+The use of deep learning models to diagnose scalp-related diseases has improved
+due to advancements in computational capabilities and computer vision, but
+there remains a wide horizon for further improvements.
+
+
+
+
+
+ + ☆ Survey on video anomaly detection in dynamic scenes with moving cameras + + +
+ The increasing popularity of compact and inexpensive cameras, e.g., dash
+cameras, body cameras, and cameras equipped on robots, has sparked a growing
+interest in detecting anomalies within dynamic scenes recorded by moving
+cameras. However, existing reviews primarily concentrate on Video Anomaly
+Detection (VAD) methods assuming static cameras. The VAD literature with moving
+cameras remains fragmented, lacking comprehensive reviews to date. To address
+this gap, we endeavor to present the first comprehensive survey on Moving
+Camera Video Anomaly Detection (MC-VAD). We delve into the research papers
+related to MC-VAD, critically assessing their limitations and highlighting
+associated challenges. Our exploration encompasses three application domains:
+security, urban transportation, and marine environments, which in turn cover
+six specific tasks. We compile an extensive list of 25 publicly-available
+datasets spanning four distinct environments: underwater, water surface,
+ground, and aerial. We summarize the types of anomalies these datasets
+correspond to or contain, and present five main categories of approaches for
+detecting such anomalies. Lastly, we identify future research directions and
+discuss novel contributions that could advance the field of MC-VAD. With this
+survey, we aim to offer a valuable reference for researchers and practitioners
+striving to develop and advance state-of-the-art MC-VAD methods.
+
+
+ comment: Under review +
+
+
+
+
+ + ☆ The minimal computational substrate of fluid intelligence + + +
+ The quantification of cognitive powers rests on identifying a behavioural +task that depends on them. Such dependence cannot be assured, for the powers a +task invokes cannot be experimentally controlled or constrained a priori, +resulting in unknown vulnerability to failure of specificity and +generalisability. Evaluating a compact version of Raven's Advanced Progressive +Matrices (RAPM), a widely used clinical test of fluid intelligence, we show +that LaMa, a self-supervised artificial neural network trained solely on the +completion of partially masked images of natural environmental scenes, achieves +human-level test scores a prima vista, without any task-specific inductive bias +or training. Compared with cohorts of healthy and focally lesioned +participants, LaMa exhibits human-like variation with item difficulty, and +produces errors characteristic of right frontal lobe damage under degradation +of its ability to integrate global spatial patterns. LaMa's narrow training and +limited capacity -- comparable to the nervous system of the fruit fly -- +suggest RAPM may be open to computationally simple solutions that need not +necessarily invoke abstract reasoning. + +
+
+ comment: 26 pages, 5 figures +
+
+
+
+
+ + ☆ An Inherent Trade-Off in Noisy Neural Communication with Rank-Order + Coding + + +
+ Rank-order coding, a form of temporal coding, has emerged as a promising
+scheme to explain the rapid processing ability of the mammalian brain. Owing to
+its speed as well as efficiency, rank-order coding is increasingly gaining
+interest in diverse research areas beyond neuroscience. However, much
+uncertainty still exists about the performance of rank-order coding under
+noise. Herein we show what information rates are fundamentally possible and
+what trade-offs are at stake. An unexpected finding in this paper is the
+emergence of a special class of errors that, in a certain regime, increase as
+noise decreases.
+
+
+
+
+
+ + ☆ S3IM: Stochastic Structural SIMilarity and Its Unreasonable + Effectiveness for Neural Fields ICCV 2023 + + +
+ Recently, Neural Radiance Field (NeRF) has shown great success in rendering
+novel-view images of a given scene by learning an implicit representation with
+only posed RGB images. NeRF and related neural field methods (e.g., neural
+surface representation) typically optimize a point-wise loss and make
+point-wise predictions, where one data point corresponds to one pixel.
+Unfortunately, this line of research fails to use the collective supervision of
+distant pixels, although it is known that pixels in an image or scene can
+provide rich structural information. To the best of our knowledge, we are the
+first to design a nonlocal multiplex training paradigm for NeRF and related
+neural field methods via a novel Stochastic Structural SIMilarity (S3IM) loss
+that processes multiple data points as a whole set instead of processing
+multiple inputs independently. Our extensive experiments demonstrate the
+unreasonable effectiveness of S3IM in improving NeRF and neural surface
+representation nearly for free. The improvements in quality metrics can be
+particularly significant for relatively difficult tasks: e.g., the test MSE
+loss unexpectedly drops by more than 90% for TensoRF and DVGO over eight novel
+view synthesis tasks; a 198% F-score gain and a 64% Chamfer $L_{1}$ distance
+reduction for NeuS over eight surface reconstruction tasks. Moreover, S3IM is
+consistently robust even with sparse inputs, corrupted images, and dynamic
+scenes.
+
+
+ comment: ICCV 2023 main conference. Code: https://github.com/Madaoer/S3IM. 14 + pages, 5 figures, 17 tables +
+
+
+
+
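+
+ A hedged PyTorch sketch of the stochastic-grouping idea behind S3IM: randomly
+drawn rays are tiled into pseudo-patches and scored with a simplified,
+non-windowed SSIM. The patch size, repeat count, and simplified SSIM are
+assumptions rather than the released implementation:
+
+import torch
+
+def ssim_global(x, y, c1=0.01 ** 2, c2=0.03 ** 2):
+    # Simplified (non-windowed) SSIM computed over a whole patch.
+    mx, my = x.mean(), y.mean()
+    vx, vy = x.var(unbiased=False), y.var(unbiased=False)
+    cov = ((x - mx) * (y - my)).mean()
+    return ((2 * mx * my + c1) * (2 * cov + c2)) / \
+           ((mx ** 2 + my ** 2 + c1) * (vx + vy + c2))
+
+def s3im_loss(pred, target, patch_hw=(64, 64), repeats=10):
+    # pred/target: (N, 3) rendered vs. ground-truth colors of randomly sampled
+    # rays, with N >= patch_hw[0] * patch_hw[1].
+    h, w = patch_hw
+    n = h * w
+    loss = 0.0
+    for _ in range(repeats):
+        idx = torch.randperm(pred.shape[0])[:n]      # stochastic pixel grouping
+        p = pred[idx].t().reshape(3, h, w)           # tile into a pseudo-patch
+        t = target[idx].t().reshape(3, h, w)
+        loss = loss + (1.0 - ssim_global(p, t))
+    return loss / repeats
+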
+ + ☆ AdvCLIP: Downstream-agnostic Adversarial Examples in Multimodal + Contrastive Learning ACM MM '23 + + +
+ Multimodal contrastive learning aims to train a general-purpose feature
+extractor, such as CLIP, on vast amounts of raw, unlabeled paired image-text
+data. This can greatly benefit various complex downstream tasks, including
+cross-modal image-text retrieval and image classification. Despite its
+promising prospects, the security issues of cross-modal pre-trained encoders
+have not been fully explored yet, especially when the pre-trained encoder is
+publicly available for commercial use.
+ In this work, we propose AdvCLIP, the first attack framework for generating
+downstream-agnostic adversarial examples based on cross-modal pre-trained
+encoders. AdvCLIP aims to construct a universal adversarial patch for a set of
+natural images that can fool all the downstream tasks inheriting the victim
+cross-modal pre-trained encoder. To address the challenges of heterogeneity
+between different modalities and unknown downstream tasks, we first build a
+topological graph structure to capture the relevant positions between target
+samples and their neighbors. Then, we design a topology-deviation based
+generative adversarial network to generate a universal adversarial patch. By
+adding the patch to images, we minimize the similarity between their embeddings
+and those of the other modality and perturb the sample distribution in the
+feature space, achieving universal non-targeted attacks. Our results
+demonstrate the excellent attack performance of AdvCLIP on two types of
+downstream tasks across eight datasets. We also tailor three popular defenses
+to mitigate AdvCLIP, highlighting the need for new defense mechanisms to defend
+cross-modal pre-trained encoders.
+
+
+ comment: This paper has been accepted by the ACM International Conference on + Multimedia (ACM MM '23, October 29-November 3, 2023, Ottawa, ON, Canada) +
+
+
+
+
+ + ☆ PGT-Net: Progressive Guided Multi-task Neural Network for Small-area Wet + Fingerprint Denoising and Recognition + + +
+ Fingerprint recognition on mobile devices is an important method for identity
+verification. However, real fingerprints usually contain sweat and moisture,
+which leads to poor recognition performance. In addition, to roll out slimmer
+and thinner phones, technology companies reduce the size of recognition sensors
+by embedding them in the power button. Therefore, the limited size of
+fingerprint data also increases the difficulty of recognition. Denoising
+small-area wet fingerprint images into clean ones becomes crucial to improve
+recognition performance. In this paper, we propose an end-to-end trainable
+progressive guided multi-task neural network (PGT-Net). The PGT-Net includes a
+shared stage and specific multi-task stages, enabling the network to train
+binary and non-binary fingerprints sequentially. The binary information is
+regarded as guidance for output enhancement, which is enriched with the ridge
+and valley details. Moreover, a novel residual scaling mechanism is introduced
+to stabilize the training process. Experimental results on the FW9395 and
+FT-lightnoised datasets provided by FocalTech show that PGT-Net has promising
+performance on wet-fingerprint denoising and significantly reduces the false
+rejection rate (FRR) of fingerprint recognition. On the FT-lightnoised dataset,
+the FRR drops from 17.75% to 4.47%; on the FW9395 dataset, it drops from 9.45%
+to 1.09%.
+
+
+
+
+
+ + ☆ Contrastive Bi-Projector for Unsupervised Domain Adaption + + +
+ This paper proposes a novel unsupervised domain adaption (UDA) method based
+on a contrastive bi-projector (CBP), which can improve existing UDA methods. It
+is called CBPUDA here, and it effectively encourages the feature extractors
+(FEs) to reduce the generation of ambiguous features for classification and
+domain adaption. The CBP differs from traditional bi-classifier-based methods
+in that the two classifiers are replaced with two projectors that map the input
+feature to two distinct features. These two projectors and the FEs in CBPUDA
+can be trained adversarially to obtain more refined decision boundaries, so
+that the model possesses powerful classification performance. Two properties of
+the proposed loss function are analyzed here. The first property is an upper
+bound on the joint prediction entropy, which is used to form the proposed loss
+function, the contrastive discrepancy (CD) loss. The CD loss takes advantage of
+both contrastive learning and the bi-classifier design. The second property
+analyzes the gradient of the CD loss in order to overcome its drawback. The
+result of the second property is utilized in the development of the gradient
+scaling (GS) scheme in this paper. The GS scheme can be exploited to tackle the
+instability of the CD loss, because training CBPUDA requires using contrastive
+learning and adversarial learning at the same time. Therefore, using the CD
+loss with the GS scheme overcomes the problem mentioned above and makes
+features more compact intra-class and more distinguishable inter-class.
+Experimental results show that CBPUDA is superior to the conventional UDA
+methods considered in this paper on UDA and fine-grained UDA tasks.
+
+
+
+
+
+ + ☆ HPFormer: Hyperspectral image prompt object tracking + + +
+ Hyperspectral imagery contains abundant spectral information beyond the
+visible RGB bands, providing rich discriminative details about objects in a
+scene. Leveraging such data has the potential to enhance visual tracking
+performance. While prior hyperspectral trackers employ CNN or hybrid
+CNN-Transformer architectures, we propose HPFormer, a novel Transformer-based
+approach, to capitalize on the powerful representation learning capabilities of
+Transformers. The core of HPFormer is a Hyperspectral Hybrid Attention (HHA)
+module which unifies feature extraction and fusion within one component through
+token interactions. Additionally, a Transform Band Module (TBM) is introduced
+to selectively aggregate spatial details and spectral signatures from the full
+hyperspectral input for injecting informative target representations. Extensive
+experiments demonstrate state-of-the-art performance of HPFormer on benchmark
+NIR and VIS tracking datasets. Our work provides new insights into harnessing
+the strengths of transformers and hyperspectral fusion to advance robust object
+tracking.
+
+
+
+
+
+ + ☆ ACTIVE: Towards Highly Transferable 3D Physical Camouflage for Universal + and Robust Vehicle Evasion ICCV 2023 + + +
+ Adversarial camouflage has garnered attention for its ability to attack +object detectors from any viewpoint by covering the entire object's surface. +However, universality and robustness in existing methods often fall short as +the transferability aspect is often overlooked, thus restricting their +application only to a specific target with limited performance. To address +these challenges, we present Adversarial Camouflage for Transferable and +Intensive Vehicle Evasion (ACTIVE), a state-of-the-art physical camouflage +attack framework designed to generate universal and robust adversarial +camouflage capable of concealing any 3D vehicle from detectors. Our framework +incorporates innovative techniques to enhance universality and robustness: a +refined texture rendering that enables common texture application to different +vehicles without being constrained to a specific texture map, a novel stealth +loss that renders the vehicle undetectable, and a smooth and camouflage loss to +enhance the naturalness of the adversarial camouflage. Our extensive +experiments on 15 different models show that ACTIVE consistently outperforms +existing works on various public detectors, including the latest YOLOv7. +Notably, our universality evaluations reveal promising transferability to other +vehicle classes, tasks (segmentation models), and the real world, not just +other vehicles. + +
+
+ comment: Accepted for ICCV 2023. Main Paper with Supplementary Material. + Project Page: https://islab-ai.github.io/active-iccv2023/ +
+
+
+
+
+ + ☆ Deepbet: Fast brain extraction of T1-weighted MRI using Convolutional + Neural Networks + + +
+ Brain extraction in magnetic resonance imaging (MRI) data is an important +segmentation step in many neuroimaging preprocessing pipelines. Image +segmentation is one of the research fields in which deep learning had the +biggest impact in recent years enabling high precision segmentation with +minimal compute. Consequently, traditional brain extraction methods are now +being replaced by deep learning-based methods. Here, we used a unique dataset +comprising 568 T1-weighted (T1w) MR images from 191 different studies in +combination with cutting edge deep learning methods to build a fast, +high-precision brain extraction tool called deepbet. deepbet uses LinkNet, a +modern UNet architecture, in a two stage prediction process. This increases its +segmentation performance, setting a novel state-of-the-art performance during +cross-validation with a median Dice score (DSC) of 99.0% on unseen datasets, +outperforming current state of the art models (DSC = 97.8% and DSC = 97.9%). +While current methods are more sensitive to outliers, resulting in Dice scores +as low as 76.5%, deepbet manages to achieve a Dice score of > 96.9% for all +samples. Finally, our model accelerates brain extraction by a factor of ~10 +compared to current methods, enabling the processing of one image in ~2 seconds +on low level hardware. + +
+
+
+
+
+ + ☆ Mutual Information-driven Triple Interaction Network for Efficient Image + Dehazing ACM MM 2023 + + +
+ Multi-stage architectures have exhibited efficacy in image dehazing; they
+usually decompose a challenging task into multiple more tractable sub-tasks and
+progressively estimate the latent haze-free image. Despite the remarkable
+progress, existing methods still suffer from the following shortcomings: (1)
+limited exploration of frequency domain information; (2) insufficient
+information interaction; (3) severe feature redundancy. To remedy these issues,
+we propose a novel Mutual Information-driven Triple interaction Network
+(MITNet) based on spatial-frequency dual-domain information and a two-stage
+architecture. To be specific, the first stage, named amplitude-guided haze
+removal, aims to recover the amplitude spectrum of the hazy images for haze
+removal. The second stage, named phase-guided structure refinement, is devoted
+to learning the transformation and refinement of the phase spectrum. To
+facilitate the information exchange between the two stages, an Adaptive Triple
+Interaction Module (ATIM) is developed to simultaneously aggregate
+cross-domain, cross-scale, and cross-stage features, where the fused features
+are further used to generate content-adaptive dynamic filters that enhance the
+global context representation. In addition, we impose a mutual information
+minimization constraint on paired scale encoder and decoder features from both
+stages. Such an operation can effectively reduce information redundancy and
+enhance cross-stage feature complementarity. Extensive experiments on multiple
+public datasets show that our MITNet achieves superior performance with lower
+model complexity. The code and models are available at
+https://github.com/it-hao/MITNet.
+
+
+ comment: Accepted in ACM MM 2023 +
+
+
+
+
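+
+ A small PyTorch sketch of the stage-1 amplitude/phase idea: a restored
+amplitude spectrum is recombined with the hazy input's phase before returning
+to the spatial domain. The function name and tensor layout are assumptions, and
+the actual MITNet stage is a learned network rather than this bare
+recomposition:
+
+import torch
+
+def recompose_from_amplitude(hazy, predicted_amplitude):
+    # hazy: (B, C, H, W) hazy input; predicted_amplitude: restored amplitude
+    # spectrum produced by the stage-1 network. Keep the hazy image's phase,
+    # swap in the restored amplitude, then go back to the spatial domain.
+    spec = torch.fft.fft2(hazy)
+    phase = torch.angle(spec)
+    restored = torch.polar(predicted_amplitude, phase)
+    return torch.fft.ifft2(restored).real
+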
+ + ☆ PatchContrast: Self-Supervised Pre-training for 3D Object Detection + + +
+ Accurately detecting objects in the environment is a key challenge for +autonomous vehicles. However, obtaining annotated data for detection is +expensive and time-consuming. We introduce PatchContrast, a novel +self-supervised point cloud pre-training framework for 3D object detection. We +propose to utilize two levels of abstraction to learn discriminative +representation from unlabeled data: proposal-level and patch-level. The +proposal-level aims at localizing objects in relation to their surroundings, +whereas the patch-level adds information about the internal connections between +the object's components, hence distinguishing between different objects based +on their individual components. We demonstrate how these levels can be +integrated into self-supervised pre-training for various backbones to enhance +the downstream 3D detection task. We show that our method outperforms existing +state-of-the-art models on three commonly-used 3D detection datasets. + +
+
+
+
+
+ + ☆ pNNCLR: Stochastic Pseudo Neighborhoods for Contrastive Learning based + Unsupervised Representation Learning Problems + + +
+ Nearest neighbor (NN) sampling provides more semantic variations than +pre-defined transformations for self-supervised learning (SSL) based image +recognition problems. However, its performance is restricted by the quality of +the support set, which holds positive samples for the contrastive loss. In this +work, we show that the quality of the support set plays a crucial role in any +nearest neighbor based method for SSL. We then provide a refined baseline +(pNNCLR) to the nearest neighbor based SSL approach (NNCLR). To this end, we +introduce pseudo nearest neighbors (pNN) to control the quality of the support +set, wherein, rather than sampling the nearest neighbors, we sample in the +vicinity of hard nearest neighbors by varying the magnitude of the resultant +vector and employing a stochastic sampling strategy to improve the performance. +Additionally, to stabilize the effects of uncertainty in NN-based learning, we +employ a smooth-weight-update approach for training the proposed network. +Evaluation of the proposed method on multiple public image recognition and +medical image recognition datasets shows that it performs up to 8 percent +better than the baseline nearest neighbor method, and is comparable to other +previously proposed SSL methods. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
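+
+ A hedged PyTorch sketch of pseudo nearest-neighbor sampling as described
+above: the hard nearest neighbor from the support set is stochastically
+rescaled in magnitude to produce the positive. The scale range and sampling
+rule are illustrative assumptions:
+
+import torch
+import torch.nn.functional as F
+
+def pseudo_nearest_neighbor(query, support, scale_range=(0.8, 1.2)):
+    # query: (B, D) embeddings of the current batch; support: (M, D) support
+    # set. Take the hard nearest neighbor of each query, then stochastically
+    # rescale its magnitude so the positive lies in the neighbor's vicinity.
+    q = F.normalize(query, dim=-1)
+    s = F.normalize(support, dim=-1)
+    nn_idx = (q @ s.t()).argmax(dim=-1)               # hard nearest neighbors
+    nn = support[nn_idx]                              # (B, D)
+    scale = torch.empty(len(query), 1, device=query.device).uniform_(*scale_range)
+    return nn * scale                                 # pseudo positives for the loss
+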
+ + ☆ A One Stop 3D Target Reconstruction and multilevel Segmentation Method + + +
+ 3D object reconstruction and multilevel segmentation are fundamental to +computer vision research. Existing algorithms usually perform 3D scene +reconstruction and target objects segmentation independently, and the +performance is not fully guaranteed due to the challenge of the 3D +segmentation. Here we propose an open-source one stop 3D target reconstruction +and multilevel segmentation framework (OSTRA), which performs segmentation on +2D images, tracks multiple instances with segmentation labels in the image +sequence, and then reconstructs labelled 3D objects or multiple parts with +Multi-View Stereo (MVS) or RGBD-based 3D reconstruction methods. We extend +object tracking and 3D reconstruction algorithms to support continuous +segmentation labels to leverage the advances in the 2D image segmentation, +especially the Segment-Anything Model (SAM) which uses the pretrained neural +network without additional training for new scenes, for 3D object segmentation. +OSTRA supports most popular 3D object models including point cloud, mesh and +voxel, and achieves high performance for semantic segmentation, instance +segmentation and part segmentation on several 3D datasets. It even surpasses +the manual segmentation in scenes with complex structures and occlusions. Our +method opens up a new avenue for reconstructing 3D targets embedded with rich +multi-scale segmentation information in complex scenes. OSTRA is available from +https://github.com/ganlab/OSTRA. + +
+
+
+
+
+ + ☆ How inter-rater variability relates to aleatoric and epistemic + uncertainty: a case study with deep learning-based paraspinal muscle + segmentation MICCAI 2023 + + +
+ Recent developments in deep learning (DL) techniques have led to great +performance improvement in medical image segmentation tasks, especially with +the latest Transformer model and its variants. While labels from fusing +multi-rater manual segmentations are often employed as ideal ground truths in +DL model training, inter-rater variability due to factors such as training +bias, image noise, and extreme anatomical variability can still affect the +performance and uncertainty of the resulting algorithms. Knowledge regarding +how inter-rater variability affects the reliability of the resulting DL +algorithms, a key element in clinical deployment, can help inform better +training data construction and DL models, but has not been explored +extensively. In this paper, we measure aleatoric and epistemic uncertainties +using test-time augmentation (TTA), test-time dropout (TTD), and deep ensemble +to explore their relationship with inter-rater variability. Furthermore, we +compare UNet and TransUNet to study the impacts of Transformers on model +uncertainty with two label fusion strategies. We conduct a case study using +multi-class paraspinal muscle segmentation from T2w MRIs. Our study reveals the +interplay between inter-rater variability and uncertainties, affected by +choices of label fusion strategies and DL models. + +
+
+ comment: Accepted in UNSURE MICCAI 2023 +
+
+
+
+
+ + ☆ Color-NeuS: Reconstructing Neural Implicit Surfaces with Color + + +
+ The reconstruction of object surfaces from multi-view images or monocular
+video is a fundamental issue in computer vision. However, much of the recent
+research concentrates on reconstructing geometry through implicit or explicit
+methods. In this paper, we shift our focus towards reconstructing mesh in
+conjunction with color. We remove the view-dependent color from neural volume
+rendering while retaining volume rendering performance through a relighting
+network. Mesh is extracted from the signed distance function (SDF) network for
+the surface, and the color for each surface vertex is drawn from the global
+color network. To evaluate our approach, we conceived an in-hand object
+scanning task featuring numerous occlusions and dramatic shifts in lighting
+conditions. We have gathered several videos for this task, and the results
+surpass those of any existing methods capable of reconstructing mesh alongside
+color. Additionally, our method's performance was assessed using public
+datasets, including DTU, BlendedMVS, and OmniObject3D. The results indicate
+that our method performs well across all these datasets. Project page:
+https://colmar-zlicheng.github.io/color_neus.
+
+
+
+
+
+ + ☆ CEmb-SAM: Segment Anything Model with Condition Embedding for Joint + Learning from Heterogeneous Datasets + + +
+ Automated segmentation of ultrasound images can assist medical experts with
+diagnostic and therapeutic procedures. Although using the common modality of
+ultrasound, one typically needs separate datasets in order to segment, for
+example, different anatomical structures or lesions with different levels of
+malignancy. In this paper, we consider the problem of jointly learning from
+heterogeneous datasets so that the model can improve generalization abilities
+by leveraging the inherent variability among datasets. We merge the
+heterogeneous datasets into one dataset and refer to each component dataset as
+a subgroup. We propose to train a single segmentation model so that the model
+can adapt to each subgroup. For robust segmentation, we leverage the recently
+proposed Segment Anything Model (SAM) in order to incorporate subgroup
+information into the model. We propose SAM with a Condition Embedding block
+(CEmb-SAM), which encodes subgroup conditions and combines them with image
+embeddings from SAM. The conditional embedding block effectively adapts SAM to
+each image subgroup by incorporating dataset properties through learnable
+parameters for normalization. Experiments show that CEmb-SAM outperforms the
+baseline methods on ultrasound image segmentation for peripheral nerves and
+breast cancer. The experiments highlight the effectiveness of CEmb-SAM in
+learning from heterogeneous datasets in medical image segmentation tasks.
+
+
+
+
+
+ + ☆ Global Features are All You Need for Image Retrieval and Reranking ICCV 2023 + + +
+ Utilizing a two-stage paradigm comprising coarse image retrieval and precise
+reranking, a well-established image retrieval system is formed. It has long
+been widely accepted that local features are imperative to the subsequent
+reranking stage, but this requires sizeable storage and computing capacities.
+We, for the first time, propose an image retrieval paradigm leveraging global
+features only to enable accurate and lightweight image retrieval for both
+coarse retrieval and reranking, hence the name SuperGlobal. It consists of
+several plug-in modules that can be easily integrated into an already trained
+model, for both the coarse retrieval and reranking stages. This series of
+approaches is inspired by an investigation into Generalized Mean (GeM) Pooling.
+Possessing these tools, we strive to defy the notion that local features are
+essential for a high-performance image retrieval paradigm. Extensive
+experiments demonstrate substantial improvements compared to the state of the
+art on standard benchmarks. Notably, on the Revisited Oxford (ROxford)+1M Hard
+dataset, our single-stage results improve by 8.2% absolute, while our two-stage
+version gains 3.7% with a strong 7568X speedup. Furthermore, when the full
+SuperGlobal is compared with the current single-stage state-of-the-art method,
+we achieve roughly a 17% improvement with a minimal 0.005% time overhead.
+Code: https://github.com/ShihaoShao-GH/SuperGlobal.
+
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
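+
+ Since the approach is inspired by Generalized Mean (GeM) pooling, a compact
+PyTorch sketch of standard GeM pooling is shown below; the exponent p=3 and the
+final L2 normalization are common defaults, not necessarily the SuperGlobal
+plug-in modules:
+
+import torch
+import torch.nn.functional as F
+
+def gem_pool(feat, p=3.0, eps=1e-6):
+    # Generalized Mean (GeM) pooling over the spatial dims of a (B, C, H, W)
+    # feature map; p=1 is average pooling, large p approaches max pooling.
+    return feat.clamp(min=eps).pow(p).mean(dim=(-2, -1)).pow(1.0 / p)
+
+# Global descriptor for retrieval: GeM-pool backbone features, then L2-normalize.
+feats = torch.relu(torch.randn(2, 2048, 7, 7))        # stand-in backbone output
+descriptor = F.normalize(gem_pool(feats), dim=-1)
+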
+ + ☆ Channel-Wise Contrastive Learning for Learning with Noisy Labels + + +
+ In real-world datasets, noisy labels are pervasive. The challenge of learning +with noisy labels (LNL) is to train a classifier that discerns the actual +classes from given instances. For this, the model must identify features +indicative of the authentic labels. While research indicates that genuine label +information is embedded in the learned features of even inaccurately labeled +data, it's often intertwined with noise, complicating its direct application. +Addressing this, we introduce channel-wise contrastive learning (CWCL). This +method distinguishes authentic label information from noise by undertaking +contrastive learning across diverse channels. Unlike conventional instance-wise +contrastive learning (IWCL), CWCL tends to yield more nuanced and resilient +features aligned with the authentic labels. Our strategy is twofold: firstly, +using CWCL to extract pertinent features to identify cleanly labeled samples, +and secondly, progressively fine-tuning using these samples. Evaluations on +several benchmark datasets validate our method's superiority over existing +approaches. + +
+
+
+
+
+ + ☆ MixBCT: Towards Self-Adapting Backward-Compatible Training + + +
+ The exponential growth of data, alongside advancements in model structures +and loss functions, has necessitated the enhancement of image retrieval systems +through the utilization of new models with superior feature embeddings. +However, the expensive process of updating the old retrieval database by +replacing embeddings poses a challenge. As a solution, backward-compatible +training can be employed to avoid the necessity of updating old retrieval +datasets. While previous methods achieved backward compatibility by aligning +prototypes of the old model, they often overlooked the distribution of the old +features, thus limiting their effectiveness when the old model's low quality +leads to a weakly discriminative feature distribution. On the other hand, +instance-based methods like L2 regression take into account the distribution of +old features but impose strong constraints on the performance of the new model +itself. In this paper, we propose MixBCT, a simple yet highly effective +backward-compatible training method that serves as a unified framework for old +models of varying qualities. Specifically, we summarize four constraints that +are essential for ensuring backward compatibility in an ideal scenario, and we +construct a single loss function to facilitate backward-compatible training. +Our approach adaptively adjusts the constraint domain for new features based on +the distribution of the old embeddings. We conducted extensive experiments on +the large-scale face recognition datasets MS1Mv3 and IJB-C to verify the +effectiveness of our method. The experimental results clearly demonstrate its +superiority over previous methods. Code is available at +https://github.com/yuleung/MixBCT + +
+
+
+
+
+ + ☆ Knowing Where to Focus: Event-aware Transformer for Video Grounding ICCV 2023 + + +
+ Recent DETR-based video grounding models have made the model directly predict +moment timestamps without any hand-crafted components, such as a pre-defined +proposal or non-maximum suppression, by learning moment queries. However, their +input-agnostic moment queries inevitably overlook an intrinsic temporal +structure of a video, providing limited positional information. In this paper, +we formulate an event-aware dynamic moment query to enable the model to take +the input-specific content and positional information of the video into +account. To this end, we present two levels of reasoning: 1) Event reasoning +that captures distinctive event units constituting a given video using a slot +attention mechanism; and 2) moment reasoning that fuses the moment queries with +a given sentence through a gated fusion transformer layer and learns +interactions between the moment queries and video-sentence representations to +predict moment timestamps. Extensive experiments demonstrate the effectiveness +and efficiency of the event-aware dynamic moment queries, outperforming +state-of-the-art approaches on several video grounding benchmarks. + +
+
+ comment: ICCV 2023. Code is available at https://github.com/jinhyunj/EaTR +
+
+
+
+
+ + ☆ Semantic-aware Network for Aerial-to-Ground Image Synthesis ICIP 2021 + + +
+ Aerial-to-ground image synthesis is an emerging and challenging problem that
+aims to synthesize a ground image from an aerial image. Due to the highly
+different layout and object representation between aerial and ground images,
+existing approaches usually fail to transfer the components of the aerial scene
+into the ground scene. In this paper, we propose a novel framework to address
+these challenges by imposing enhanced structural alignment and semantic
+awareness. We introduce a novel semantic-attentive feature transformation
+module that allows reconstruction of the complex geographic structures by
+aligning the aerial features to the ground layout. Furthermore, we propose
+semantic-aware loss functions by leveraging a pre-trained segmentation network.
+The network is encouraged to synthesize realistic objects across various
+classes by separately calculating losses for different classes and balancing
+them. Extensive experiments including comparisons with previous methods and
+ablation studies show the effectiveness of the proposed framework both
+qualitatively and quantitatively.
+
+
+ comment: ICIP 2021. Code is available at https://github.com/jinhyunj/SANet +
+
+
+
+
+ + ☆ One-shot lip-based biometric authentication: extending behavioral + features with authentication phrase information + + +
+ Lip-based biometric authentication (LBBA) is an authentication method based +on a person's lip movements during speech in the form of video data captured by +a camera sensor. LBBA can utilize both physical and behavioral characteristics +of lip movements without requiring any additional sensory equipment apart from +an RGB camera. State-of-the-art (SOTA) approaches use one-shot learning to +train deep siamese neural networks which produce an embedding vector out of +these features. Embeddings are further used to compute the similarity between +an enrolled user and a user being authenticated. A flaw of these approaches is +that they model behavioral features as style-of-speech without relation to what +is being said. This makes the system vulnerable to video replay attacks of the +client speaking any phrase. To solve this problem we propose a one-shot +approach which models behavioral features to discriminate against what is being +said in addition to style-of-speech. We achieve this by customizing the GRID +dataset to obtain required triplets and training a siamese neural network based +on 3D convolutions and recurrent neural network layers. A custom triplet loss +for batch-wise hard-negative mining is proposed. Obtained results using an +open-set protocol are 3.2% FAR and 3.8% FRR on the test set of the customized +GRID dataset. Additional analysis of the results was done to quantify the +influence and discriminatory power of behavioral and physical features for +LBBA. + +
+
+ comment: 28 pages, 10 figures, 7 tables +
+
+
+
+
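+
+ A minimal PyTorch sketch of a batch-wise hard-negative triplet loss of the
+kind described above; the margin value and distance choice are assumptions
+rather than the paper's exact formulation:
+
+import torch
+import torch.nn.functional as F
+
+def hard_negative_triplet_loss(anchor, positive, negatives, margin=0.2):
+    # anchor/positive: (B, D) paired embeddings; negatives: (B, D) candidate
+    # negatives in the batch. For each anchor, mine the hardest (closest)
+    # negative and apply the standard triplet margin.
+    d_pos = F.pairwise_distance(anchor, positive)     # (B,)
+    d_neg = torch.cdist(anchor, negatives)            # (B, B) anchor-to-negative
+    d_hard = d_neg.min(dim=1).values                  # hardest negative per anchor
+    return F.relu(d_pos - d_hard + margin).mean()
+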
+ + ☆ Radiomics-Informed Deep Learning for Classification of Atrial + Fibrillation Sub-Types from Left-Atrium CT Volumes MICCAI23 + + +
+ Atrial Fibrillation (AF) is characterized by rapid, irregular heartbeats, and
+can lead to fatal complications such as heart failure. The disease is divided
+into two sub-types based on severity, which can be automatically classified
+through CT volumes for disease screening of severe cases. However, existing
+classification approaches rely on generic radiomic features that may not be
+optimal for the task, whilst deep learning methods tend to over-fit to the
+high-dimensional volume inputs. In this work, we propose a novel
+radiomics-informed deep-learning method, RIDL, that combines the advantages of
+deep learning and radiomic approaches to improve AF sub-type classification.
+Unlike existing hybrid techniques that mostly rely on naïve feature
+concatenation, we observe that radiomic feature selection methods can serve as
+an information prior, and propose supplementing low-level deep neural network
+(DNN) features with locally computed radiomic features. This reduces DNN
+over-fitting and allows local variations between radiomic features to be better
+captured. Furthermore, we ensure complementary information is learned by deep
+and radiomic features by designing a novel feature de-correlation loss.
+Combined, our method addresses the limitations of deep learning and radiomic
+approaches and outperforms state-of-the-art radiomic, deep learning, and hybrid
+approaches, achieving 86.9% AUC for the AF sub-type classification task. Code
+is available at https://github.com/xmed-lab/RIDL.
+
+
+ comment: Accepted by MICCAI23 +
+
+
+
+
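+
+ A hedged PyTorch sketch of one plausible form of a feature de-correlation
+penalty between deep and radiomic features; the exact loss used by RIDL may
+differ, so treat the standardized cross-correlation form below as an
+assumption:
+
+import torch
+
+def decorrelation_loss(deep_feats, radiomic_feats, eps=1e-8):
+    # deep_feats: (B, D1) DNN features; radiomic_feats: (B, D2) radiomic
+    # features. Penalize the squared cross-correlation between the standardized
+    # features so the two branches carry complementary information.
+    d = (deep_feats - deep_feats.mean(0)) / (deep_feats.std(0) + eps)
+    r = (radiomic_feats - radiomic_feats.mean(0)) / (radiomic_feats.std(0) + eps)
+    cross_corr = d.t() @ r / d.shape[0]               # (D1, D2) correlation matrix
+    return cross_corr.pow(2).mean()
+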
+ + ☆ OpenGCD: Assisting Open World Recognition with Generalized Category + Discovery + + +
+ A desirable open world recognition (OWR) system requires performing three
+tasks: (1) Open set recognition (OSR), i.e., classifying the known (classes
+seen during training) and rejecting the unknown (unseen/novel classes) online;
+(2) Grouping and labeling these unknowns as novel known classes; (3)
+Incremental learning (IL), i.e., continually learning these novel classes and
+retaining the memory of old classes. Ideally, all of these steps should be
+automated. However, existing methods mostly assume that the second task is
+completely done manually. To bridge this gap, we propose OpenGCD, which
+combines three key ideas to solve the above problems sequentially: (a) We score
+the origin of instances (unknown or specifically known) based on the
+uncertainty of the classifier's prediction; (b) For the first time, we
+introduce generalized category discovery (GCD) techniques in OWR to assist
+humans in grouping unlabeled data; (c) For the smooth execution of IL and GCD,
+we retain an equal number of informative exemplars for each class with
+diversity as the goal. Moreover, we present a new performance evaluation metric
+for GCD called harmonic clustering accuracy. Experiments on two standard
+classification benchmarks and a challenging dataset demonstrate that OpenGCD
+not only offers excellent compatibility but also substantially outperforms
+other baselines. Code: https://github.com/Fulin-Gao/OpenGCD.
+
+
+
+
+
+ + ☆ CBA: Improving Online Continual Learning via Continual Bias Adaptor ICCV 2023 + + +
+ Online continual learning (CL) aims to learn new knowledge and consolidate +previously learned knowledge from non-stationary data streams. Due to the +time-varying training setting, the model learned from a changing distribution +easily forgets the previously learned knowledge and biases toward the newly +received task. To address this problem, we propose a Continual Bias Adaptor +(CBA) module to augment the classifier network to adapt to catastrophic +distribution change during training, such that the classifier network is able +to learn a stable consolidation of previously learned tasks. In the testing +stage, CBA can be removed which introduces no additional computation cost and +memory overhead. We theoretically reveal the reason why the proposed method can +effectively alleviate catastrophic distribution shifts, and empirically +demonstrate its effectiveness through extensive experiments based on four +rehearsal-based baselines and three public continual learning benchmarks. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Hierarchy Flow For High-Fidelity Image-to-Image Translation + + +
+ Image-to-image (I2I) translation comprises a wide spectrum of tasks. Here we +divide this problem into three levels: strong-fidelity translation, +normal-fidelity translation, and weak-fidelity translation, indicating the +extent to which the content of the original image is preserved. Although +existing methods achieve good performance in weak-fidelity translation, they +fail to fully preserve the content in both strong- and normal-fidelity tasks, +e.g. sim2real, style transfer and low-level vision. In this work, we propose +Hierarchy Flow, a novel flow-based model to achieve better content preservation +during translation. Specifically, 1) we first unveil the drawbacks of standard +flow-based models when applied to I2I translation. 2) Next, we propose a new +design, namely hierarchical coupling for reversible feature transformation and +multi-scale modeling, to constitute Hierarchy Flow. 3) Finally, we present a +dedicated aligned-style loss for a better trade-off between content +preservation and stylization during translation. Extensive experiments on a +wide range of I2I translation benchmarks demonstrate that our approach achieves +state-of-the-art performance, with convincing advantages in both strong- and +normal-fidelity tasks. Code and models will be at +https://github.com/WeichenFan/HierarchyFlow. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2207.01909 +
+
+
+
+
+ + ☆ The Michigan Robotics Undergraduate Curriculum: Defining the Discipline + of Robotics for Equity and Excellence + + +
+ The Robotics Major at the University of Michigan was successfully launched in +the 2022-23 academic year as an innovative step forward to better serve +students, our communities, and our society. Building on our guiding principle +of "Robotics with Respect" and our larger Robotics Pathways model, the Michigan +Robotics Major was designed to define robotics as a true academic discipline +with both equity and excellence as our highest priorities. Understanding that +talent is equally distributed but opportunity is not, the Michigan Robotics +Major has embraced an adaptable curriculum that is accessible through a +diversity of student pathways and enables successful and sustained career-long +participation in robotics, AI, and automation professions. The results after +our planning efforts (2019-22) and first academic year (2022-23) have been +highly encouraging: more than 100 students declared Robotics as their major, +completion of the Robotics major by our first two graduates, soaring +enrollments in our Robotics classes, thriving partnerships with Historically +Black Colleges and Universities. This document provides our original curricular +proposal for the Robotics Undergraduate Program at the University of Michigan, +submitted to the Michigan Association of State Universities in April 2022 and +approved in June 2022. The dissemination of our program design is in the spirit +of continued growth for higher education towards realizing equity and +excellence. + The most recent version of this document is also available on Google Docs +through this link: https://ocj.me/robotics_major + +
+
+ comment: 49 pages, approximately 25 figures +
+
+
+
+
+ + ☆ Exploring Lightweight Hierarchical Vision Transformers for Efficient + Visual Tracking ICCV2023 + + +
+ Transformer-based visual trackers have demonstrated significant progress +owing to their superior modeling capabilities. However, existing trackers are +hampered by low speed, limiting their applicability on devices with limited +computational power. To alleviate this problem, we propose HiT, a new family of +efficient tracking models that can run at high speed on different devices while +retaining high performance. The central idea of HiT is the Bridge Module, which +bridges the gap between modern lightweight transformers and the tracking +framework. The Bridge Module incorporates the high-level information of deep +features into the shallow large-resolution features. In this way, it produces +better features for the tracking head. We also propose a novel dual-image +position encoding technique that simultaneously encodes the position +information of both the search region and template images. The HiT model +achieves promising speed with competitive performance. For instance, it runs at +61 frames per second (fps) on the Nvidia Jetson AGX edge device. Furthermore, +HiT attains 64.6% AUC on the LaSOT benchmark, surpassing all previous efficient +trackers. + +
+
+ comment: This paper was accepted by ICCV2023 +
+
+
+
+
+ + ☆ Orthogonal Temporal Interpolation for Zero-Shot Video Recognition + + +
+ Zero-shot video recognition (ZSVR) is a task that aims to recognize video
+categories that have not been seen during the model training process. Recently,
+vision-language models (VLMs) pre-trained on large-scale image-text pairs have
+demonstrated impressive transferability for ZSVR. To make VLMs applicable to
+the video domain, existing methods often use an additional temporal learning
+module after the image-level encoder to learn the temporal relationships among
+video frames. Unfortunately, for videos from unseen categories, we observe an
+abnormal phenomenon where the model that uses spatial-temporal features
+performs much worse than the model that removes the temporal learning module
+and uses only spatial features. We conjecture that improper temporal modeling
+on video disrupts the spatial features of the video. To verify our hypothesis,
+we propose Feature Factorization to retain the orthogonal temporal features of
+the video and use interpolation to construct refined spatial-temporal features.
+The model using appropriately refined spatial-temporal features performs better
+than the one using only spatial features, which verifies the effectiveness of
+the orthogonal temporal features for the ZSVR task. Therefore, an Orthogonal
+Temporal Interpolation module is designed to learn better refined
+spatial-temporal video features during training. Additionally, a Matching Loss
+is introduced to improve the quality of the orthogonal temporal features. We
+propose a model called OTI for ZSVR by employing orthogonal temporal
+interpolation and the matching loss based on VLMs. The ZSVR accuracies on
+popular video datasets (i.e., Kinetics-600, UCF101 and HMDB51) show that OTI
+outperforms the previous state-of-the-art method by a clear margin.
+
+
+
+
+
+ + ☆ Robustness Stress Testing in Medical Image Classification + + +
+ Deep neural networks have shown impressive performance for image-based +disease detection. Performance is commonly evaluated through clinical +validation on independent test sets to demonstrate clinically acceptable +accuracy. Reporting good performance metrics on test sets, however, is not +always a sufficient indication of the generalizability and robustness of an +algorithm. In particular, when the test data is drawn from the same +distribution as the training data, the iid test set performance can be an +unreliable estimate of the accuracy on new data. In this paper, we employ +stress testing to assess model robustness and subgroup performance disparities +in disease detection models. We design progressive stress testing using five +different bidirectional and unidirectional image perturbations with six +different severity levels. As a use case, we apply stress tests to measure the +robustness of disease detection models for chest X-ray and skin lesion images, +and demonstrate the importance of studying class and domain-specific model +behaviour. Our experiments indicate that some models may yield more robust and +equitable performance than others. We also find that pretraining +characteristics play an important role in downstream robustness. We conclude +that progressive stress testing is a viable and important tool and should +become standard practice in the clinical validation of image-based disease +detection models. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Robustified ANNs Reveal Wormholes Between Human Category Percepts + + +
+ The visual object category reports of artificial neural networks (ANNs) are +notoriously sensitive to tiny, adversarial image perturbations. Because human +category reports (aka human percepts) are thought to be insensitive to those +same small-norm perturbations -- and locally stable in general -- this argues +that ANNs are incomplete scientific models of human visual perception. +Consistent with this, we show that when small-norm image perturbations are +generated by standard ANN models, human object category percepts are indeed +highly stable. However, in this very same "human-presumed-stable" regime, we +find that robustified ANNs reliably discover low-norm image perturbations that +strongly disrupt human percepts. These previously undetectable human perceptual +disruptions are massive in amplitude, approaching the same level of sensitivity +seen in robustified ANNs. Further, we show that robustified ANNs support +precise perceptual state interventions: they guide the construction of low-norm +image perturbations that strongly alter human category percepts toward specific +prescribed percepts. These observations suggest that for arbitrary starting +points in image space, there exists a set of nearby "wormholes", each leading +the subject from their current category perceptual state into a semantically +very different state. Moreover, contemporary ANN models of biological visual +processing are now accurate enough to consistently guide us to those portals. + +
+
+ comment: *Equal contribution +
+
+
+
+
+ + ☆ Towards Open-Set Test-Time Adaptation Utilizing the Wisdom of Crowds in + Entropy Minimization ICCV 2023 + + +
+ Test-time adaptation (TTA) methods, which generally rely on the model's +predictions (e.g., entropy minimization) to adapt the source pretrained model +to the unlabeled target domain, suffer from noisy signals originating from 1) +incorrect or 2) open-set predictions. Long-term stable adaptation is hampered +by such noisy signals, so training models without such error accumulation is +crucial for practical TTA. To address these issues, including open-set TTA, we +propose a simple yet effective sample selection method inspired by the +following crucial empirical finding. While entropy minimization compels the +model to increase the probability of its predicted label (i.e., confidence +values), we found that noisy samples rather show decreased confidence values. +To be more specific, entropy minimization attempts to raise the confidence +values of an individual sample's prediction, but individual confidence values +may rise or fall due to the influence of signals from numerous other +predictions (i.e., wisdom of crowds). Due to this fact, noisy signals +misaligned with such 'wisdom of crowds', generally found in the correct +signals, fail to raise the individual confidence values of wrong samples, +despite attempts to increase them. Based on such findings, we filter out the +samples whose confidence values are lower in the adapted model than in the +original model, as they are likely to be noisy. Our method is widely applicable +to existing TTA methods and improves their long-term adaptation performance in +both image classification (e.g., 49.4% reduced error rates with TENT) and +semantic segmentation (e.g., 11.7% gain in mIoU with TENT). + +
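+
+ The selection rule described above translates into a compact sketch: keep a test sample for
+ entropy minimization only if the adapted model's confidence on it is at least as high as the
+ frozen original model's. The loss form follows the abstract; the epsilon and the exact reduction
+ are assumptions.
+
+```python
+import torch
+
+def selective_entropy_loss(adapted_logits: torch.Tensor,
+                           original_logits: torch.Tensor) -> torch.Tensor:
+    p_adapted = adapted_logits.softmax(dim=-1)
+    conf_adapted = p_adapted.max(dim=-1).values
+    conf_original = original_logits.softmax(dim=-1).max(dim=-1).values
+    keep = conf_adapted >= conf_original                     # likely-clean samples
+    entropy = -(p_adapted * torch.log(p_adapted + 1e-8)).sum(dim=-1)
+    if keep.any():
+        return entropy[keep].mean()                          # adapt only on kept samples
+    return adapted_logits.new_zeros(())
+
+loss = selective_entropy_loss(torch.randn(16, 10), torch.randn(16, 10))
+print(loss.item())
+```
+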
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Shape-Graph Matching Network (SGM-net): Registration for Statistical + Shape Analysis + + +
+ This paper focuses on the statistical analysis of shapes of data objects +called shape graphs, a set of nodes connected by articulated curves with +arbitrary shapes. A critical need here is a constrained registration of points +(nodes to nodes, edges to edges) across objects. This, in turn, requires +optimization over the permutation group, made challenging by differences in +nodes (in terms of numbers, locations) and edges (in terms of shapes, +placements, and sizes) across objects. This paper tackles this registration +problem using a novel neural-network architecture and involves an unsupervised +loss function developed using the elastic shape metric for curves. This +architecture results in (1) state-of-the-art matching performance and (2) an +order of magnitude reduction in the computational cost relative to baseline +approaches. We demonstrate the effectiveness of the proposed approach using +both simulated data and real-world 2D and 3D shape graphs. Code and data will +be made publicly available after review to foster research. + +
+
+
+
+
+ + ☆ Camera Based mmWave Beam Prediction: Towards Multi-Candidate Real-World + Scenarios + + +
+ Leveraging sensory information to aid the millimeter-wave (mmWave) and +sub-terahertz (sub-THz) beam selection process is attracting increasing +interest. This sensory data, captured for example by cameras at the +basestations, has the potential of significantly reducing the beam sweeping +overhead and enabling highly-mobile applications. The solutions developed so +far, however, have mainly considered single-candidate scenarios, i.e., +scenarios with a single candidate user in the visual scene, and were evaluated +using synthetic datasets. To address these limitations, this paper extensively +investigates the sensing-aided beam prediction problem in a real-world +multi-object vehicle-to-infrastructure (V2I) scenario and presents a +comprehensive machine learning-based framework. In particular, this paper +proposes to utilize visual and positional data to predict the optimal beam +indices as an alternative to the conventional beam sweeping approaches. For +this, a novel user (transmitter) identification solution has been developed, a +key step in realizing sensing-aided multi-candidate and multi-user beam +prediction solutions. The proposed solutions are evaluated on the large-scale +real-world DeepSense $6$G dataset. Experimental results in realistic V2I +communication scenarios indicate that the proposed solutions achieve close to +$100\%$ top-5 beam prediction accuracy for the scenarios with single-user and +close to $95\%$ top-5 beam prediction accuracy for multi-candidate scenarios. +Furthermore, the proposed approach can identify the probable transmitting +candidate with more than $93\%$ accuracy across the different scenarios. This +highlights a promising approach for nearly eliminating the beam training +overhead in mmWave/THz communication systems. + +
+
+ comment: Dataset and code files are available on the DeepSense 6G website + https://deepsense6g.net/ +
+
+
+
+
+ + ☆ SpecTracle: Wearable Facial Motion Tracking from Unobtrusive Peripheral + Cameras + + +
+ Facial motion tracking in head-mounted displays (HMD) has the potential to
+ enable immersive "face-to-face" interaction in a virtual environment. However,
+ current works on facial tracking are not suitable for unobtrusive augmented
+ reality (AR) glasses or do not have the ability to track arbitrary facial
+ movements. In this work, we demonstrate a novel system called SpecTracle that
+ tracks a user's facial motions using two wide-angle cameras mounted right next
+ to the visor of a Hololens. By avoiding cameras extended in front of the face,
+ our system greatly improves the feasibility of integrating full-face tracking
+ into a low-profile form factor. We also demonstrate that a neural
+ network-based model processing the wide-angle cameras can run in real time at
+ 24 frames per second (fps) on a mobile GPU and track independent facial
+ movement for different parts of the face with a user-independent model. Using a
+ short personalized calibration, the system improves its tracking performance by
+ 42.3% compared to the user-independent model.
+
+
+
+
+
+ + ☆ DREAMWALKER: Mental Planning for Continuous Vision-Language Navigation ICCV 2023 + + +
+ VLN-CE is a recently released embodied task, where AI agents need to navigate +a freely traversable environment to reach a distant target location, given +language instructions. It poses great challenges due to the huge space of +possible strategies. Driven by the belief that the ability to anticipate the +consequences of future actions is crucial for the emergence of intelligent and +interpretable planning behavior, we propose DREAMWALKER -- a world model based +VLN-CE agent. The world model is built to summarize the visual, topological, +and dynamic properties of the complicated continuous environment into a +discrete, structured, and compact representation. DREAMWALKER can simulate and +evaluate possible plans entirely in such internal abstract world, before +executing costly actions. As opposed to existing model-free VLN-CE agents +simply making greedy decisions in the real world, which easily results in +shortsighted behaviors, DREAMWALKER is able to make strategic planning through +large amounts of ``mental experiments.'' Moreover, the imagined future +scenarios reflect our agent's intention, making its decision-making process +more transparent. Extensive experiments and ablation studies on VLN-CE dataset +confirm the effectiveness of the proposed approach and outline fruitful +directions for future work. + +
+
+ comment: Accepted at ICCV 2023; Project page: + https://github.com/hanqingwangai/Dreamwalker +
+
+
+
+
+ + ☆ BSED: Baseline Shapley-Based Explainable Detector + + +
+ Explainable artificial intelligence (XAI) has witnessed significant advances +in the field of object recognition, with saliency maps being used to highlight +image features relevant to the predictions of learned models. Although these +advances have made AI-based technology more interpretable to humans, several +issues have come to light. Some approaches present explanations irrelevant to +predictions, and cannot guarantee the validity of XAI (axioms). In this study, +we propose the Baseline Shapley-based Explainable Detector (BSED), which +extends the Shapley value to object detection, thereby enhancing the validity +of interpretation. The Shapley value can attribute the prediction of a learned +model to a baseline feature while satisfying the explainability axioms. The +processing cost for the BSED is within the reasonable range, while the original +Shapley value is prohibitively computationally expensive. Furthermore, BSED is +a generalizable method that can be applied to various detectors in a +model-agnostic manner, and interpret various detection targets without +fine-grained parameter tuning. These strengths can enable the practical +applicability of XAI. We present quantitative and qualitative comparisons with +existing methods to demonstrate the superior performance of our method in terms +of explanation validity. Moreover, we present some applications, such as +correcting detection based on explanations from our method. + +
+
+
+
+
+ + ☆ Space Object Identification and Classification from Hyperspectral + Material Analysis + + +
+ This paper presents a data processing pipeline designed to extract
+ information from the hyperspectral signature of unknown space objects. The
+ methodology proposed in this paper determines the material composition of space
+ objects from single-pixel images. Two techniques are used for material
+ identification and classification: one based on machine learning and the other
+ based on a least-squares match with a library of known spectra. From this
+ information, a supervised machine learning algorithm is used to classify the
+ object into one of several categories based on the detection of materials on
+ the object. The behaviour of the material classification methods is
+ investigated under non-ideal circumstances to determine the effect of
+ weathered materials and the behaviour when the training library is missing a
+ material that is present in the object being observed. Finally, the paper
+ presents some preliminary results on the identification and classification of
+ space objects.
+
+
+ comment: 30 pages, 24 figures +
+
+
+
+
+ + ☆ Probabilistic MIMO U-Net: Efficient and Accurate Uncertainty Estimation + for Pixel-wise Regression ICCV + + +
+ Uncertainty estimation in machine learning is paramount for enhancing the +reliability and interpretability of predictive models, especially in +high-stakes real-world scenarios. Despite the availability of numerous methods, +they often pose a trade-off between the quality of uncertainty estimation and +computational efficiency. Addressing this challenge, we present an adaptation +of the Multiple-Input Multiple-Output (MIMO) framework -- an approach +exploiting the overparameterization of deep neural networks -- for pixel-wise +regression tasks. Our MIMO variant expands the applicability of the approach +from simple image classification to broader computer vision domains. For that +purpose, we adapted the U-Net architecture to train multiple subnetworks within +a single model, harnessing the overparameterization in deep neural networks. +Additionally, we introduce a novel procedure for synchronizing subnetwork +performance within the MIMO framework. Our comprehensive evaluations of the +resulting MIMO U-Net on two orthogonal datasets demonstrate comparable accuracy +to existing models, superior calibration on in-distribution data, robust +out-of-distribution detection capabilities, and considerable improvements in +parameter size and inference time. Code available at +github.com/antonbaumann/MIMO-Unet + +
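+
+ A toy sketch of the MIMO idea for pixel-wise regression: a single backbone ingests M stacked
+ inputs and emits M output maps, whose spread gives an implicit-ensemble uncertainty estimate.
+ The tiny convolutional backbone below stands in for the actual U-Net and is purely illustrative.
+
+```python
+import torch
+import torch.nn as nn
+
+class TinyMIMORegressor(nn.Module):
+    def __init__(self, m: int = 3, ch: int = 32):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Conv2d(m, ch, 3, padding=1), nn.ReLU(),
+            nn.Conv2d(ch, ch, 3, padding=1), nn.ReLU(),
+            nn.Conv2d(ch, m, 3, padding=1),   # one regression map per subnetwork
+        )
+
+    def forward(self, xs: torch.Tensor) -> torch.Tensor:
+        # xs: (B, M, H, W), M independent grayscale inputs stacked as channels
+        return self.net(xs)                   # (B, M, H, W), M predictions
+
+model = TinyMIMORegressor(m=3)
+preds = model(torch.randn(2, 3, 64, 64))
+mean, std = preds.mean(dim=1), preds.std(dim=1)   # ensemble mean and uncertainty
+print(mean.shape, std.shape)
+```
+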
+
+ comment: 8 pages (references do not count), Accepted at UnCV (Workshop on + Uncertainty Quantification for Computer Vision at ICCV) +
+
+
+
+
+ + ☆ Reducing Training Demands for 3D Gait Recognition with Deep Koopman + Operator Constraints + + +
+ Deep learning research has made many biometric recognition solutions viable,
+ but it requires vast training data to achieve real-world generalization. Unlike
+ other biometric traits, such as face and ear, gait samples cannot be easily
+ crawled from the web to form massive unconstrained datasets. As the human body
+ has been extensively studied for different digital applications, one can rely
+ on prior shape knowledge to overcome data scarcity. This work follows the
+ recent trend of fitting a 3D deformable body model into gait videos using deep
+ neural networks to obtain disentangled shape and pose representations for each
+ frame. To enforce temporal consistency in the network, we introduce a new
+ Linear Dynamical Systems (LDS) module and loss based on Koopman operator
+ theory, which provides an unsupervised motion regularization for the periodic
+ nature of gait, as well as a predictive capacity for extending gait sequences.
+ We compare LDS to the traditional adversarial training approach and use the USF
+ HumanID and CASIA-B datasets to show that LDS can obtain better accuracy with
+ less training data. Finally, we also show that our 3D modeling approach is much
+ better than other 3D gait approaches in overcoming viewpoint variation under
+ normal, bag-carrying and clothing change conditions.
+
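+
+ A conceptual sketch of a Koopman-style linear-dynamics regularizer as described: learn a single
+ linear operator that advances per-frame latent codes one step, and penalize its one-step
+ prediction error. The latent dimensionality and the plain MSE penalty are assumptions.
+
+```python
+import torch
+import torch.nn as nn
+
+class KoopmanLDSLoss(nn.Module):
+    def __init__(self, latent_dim: int):
+        super().__init__()
+        self.K = nn.Linear(latent_dim, latent_dim, bias=False)  # Koopman operator
+
+    def forward(self, z: torch.Tensor) -> torch.Tensor:
+        # z: (B, T, D) sequence of pose/shape latents for a gait clip
+        pred_next = self.K(z[:, :-1])            # advance each frame by one step
+        return ((pred_next - z[:, 1:]) ** 2).mean()
+
+loss_fn = KoopmanLDSLoss(latent_dim=72)
+loss = loss_fn(torch.randn(4, 30, 72))           # toy batch of 30-frame sequences
+print(loss.item())
+```
+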
+
+
+
+
+ + ☆ There Is a Digital Art History + + +
+ In this paper, we revisit Johanna Drucker's question, "Is there a digital art +history?" -- posed exactly a decade ago -- in the light of the emergence of +large-scale, transformer-based vision models. While more traditional types of +neural networks have long been part of digital art history, and digital +humanities projects have recently begun to use transformer models, their +epistemic implications and methodological affordances have not yet been +systematically analyzed. We focus our analysis on two main aspects that, +together, seem to suggest a coming paradigm shift towards a "digital" art +history in Drucker's sense. On the one hand, the visual-cultural repertoire +newly encoded in large-scale vision models has an outsized effect on digital +art history. The inclusion of significant numbers of non-photographic images +allows for the extraction and automation of different forms of visual logics. +Large-scale vision models have "seen" large parts of the Western visual canon +mediated by Net visual culture, and they continuously solidify and concretize +this canon through their already widespread application in all aspects of +digital life. On the other hand, based on two technical case studies of +utilizing a contemporary large-scale visual model to investigate basic +questions from the fields of art history and urbanism, we suggest that such +systems require a new critical methodology that takes into account the +epistemic entanglement of a model and its applications. This new methodology +reads its corpora through a neural model's training data, and vice versa: the +visual ideologies of research datasets and training datasets become entangled. + +
+
+
+
+
+ + ☆ Open-set Face Recognition using Ensembles trained on Clustered Data + + +
+ Open-set face recognition describes a scenario where unknown subjects, unseen
+ during the training stage, appear at test time. Not only does it require methods
+ that accurately identify individuals of interest, but it also demands approaches
+ that effectively deal with unfamiliar faces. This work details a scalable
+ open-set face identification approach to galleries composed of hundreds and
+ thousands of subjects. It is composed of clustering and an ensemble of binary
+ learning algorithms that estimates when query face samples belong to the face
+ gallery and then retrieves their correct identity. The approach selects the
+ most suitable gallery subjects and uses the ensemble to improve prediction
+ performance. We carry out experiments on the well-known LFW and YTF benchmarks.
+ Results show that competitive performance can be achieved even when targeting
+ scalability.
+
+
+ comment: [Original paper title: Unconstrained Face Identification using + Ensembles trained on Clustered Data] [2020 IEEE International Joint + Conference on Biometrics (IJCB)] + [https://ieeexplore.ieee.org/document/9304882] +
+
+
+
+
+ + ☆ The Performance of Transferability Metrics does not Translate to Medical + Tasks MICCAI 2023 + + +
+ Transfer learning boosts the performance of medical image analysis by +enabling deep learning (DL) on small datasets through the knowledge acquired +from large ones. As the number of DL architectures explodes, exhaustively +attempting all candidates becomes unfeasible, motivating cheaper alternatives +for choosing them. Transferability scoring methods emerge as an enticing +solution, allowing to efficiently calculate a score that correlates with the +architecture accuracy on any target dataset. However, since transferability +scores have not been evaluated on medical datasets, their use in this context +remains uncertain, preventing them from benefiting practitioners. We fill that +gap in this work, thoroughly evaluating seven transferability scores in three +medical applications, including out-of-distribution scenarios. Despite +promising results in general-purpose datasets, our results show that no +transferability score can reliably and consistently estimate target performance +in medical contexts, inviting further work in that direction. + +
+
+ comment: 10 pages, 3 figures. Accepted at the DART workshop @ MICCAI 2023 +
+
+
+
+
+ + ☆ Interaction-Aware Personalized Vehicle Trajectory Prediction Using + Temporal Graph Neural Networks + + +
+ Accurate prediction of vehicle trajectories is vital for advanced driver +assistance systems and autonomous vehicles. Existing methods mainly rely on +generic trajectory predictions derived from large datasets, overlooking the +personalized driving patterns of individual drivers. To address this gap, we +propose an approach for interaction-aware personalized vehicle trajectory +prediction that incorporates temporal graph neural networks. Our method +utilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to +model the spatio-temporal interactions between target vehicles and their +surrounding traffic. To personalize the predictions, we establish a pipeline +that leverages transfer learning: the model is initially pre-trained on a +large-scale trajectory dataset and then fine-tuned for each driver using their +specific driving data. We employ human-in-the-loop simulation to collect +personalized naturalistic driving trajectories and corresponding surrounding +vehicle trajectories. Experimental results demonstrate the superior performance +of our personalized GCN-LSTM model, particularly for longer prediction +horizons, compared to its generic counterpart. Moreover, the personalized model +outperforms individual models created without pre-training, emphasizing the +significance of pre-training on a large dataset to avoid overfitting. By +incorporating personalization, our approach enhances trajectory prediction +accuracy. + +
+
+
+
+
+ + ☆ UniBrain: Unify Image Reconstruction and Captioning All in One Diffusion + Model from Human Brain Activity + + +
+ Image reconstruction and captioning from brain activity evoked by visual
+ stimuli allow researchers to further understand the connection between the
+ human brain and the visual perception system. While deep generative models have
+ recently been employed in this field, reconstructing realistic captions and
+ images with both low-level details and high semantic fidelity is still a
+ challenging problem. In this work, we propose UniBrain: Unify Image
+ Reconstruction and Captioning All in One Diffusion Model from Human Brain
+ Activity. For the first time, we unify image reconstruction and captioning from
+ visual-evoked functional magnetic resonance imaging (fMRI) through a latent
+ diffusion model termed Versatile Diffusion. Specifically, we transform fMRI
+ voxels into text and image latents for low-level information and guide the
+ backward diffusion process through fMRI-based image and text conditions derived
+ from CLIP to generate realistic captions and images. UniBrain outperforms
+ current methods both qualitatively and quantitatively in terms of image
+ reconstruction and reports image captioning results for the first time on the
+ Natural Scenes Dataset (NSD). Moreover, the ablation experiments and
+ functional region-of-interest (ROI) analysis further exhibit the superiority of
+ UniBrain and provide comprehensive insight for visual-evoked brain decoding.
+
+
+
+
+
+ + ☆ U-Turn Diffusion + + +
+ We present a comprehensive examination of score-based diffusion models of AI +for generating synthetic images. These models hinge upon a dynamic auxiliary +time mechanism driven by stochastic differential equations, wherein the score +function is acquired from input images. Our investigation unveils a criterion +for evaluating efficiency of the score-based diffusion models: the power of the +generative process depends on the ability to de-construct fast correlations +during the reverse/de-noising phase. To improve the quality of the produced +synthetic images, we introduce an approach coined "U-Turn Diffusion". The +U-Turn Diffusion technique starts with the standard forward diffusion process, +albeit with a condensed duration compared to conventional settings. +Subsequently, we execute the standard reverse dynamics, initialized with the +concluding configuration from the forward process. This U-Turn Diffusion +procedure, combining forward, U-turn, and reverse processes, creates a +synthetic image approximating an independent and identically distributed +(i.i.d.) sample from the probability distribution implicitly described via +input samples. To analyze relevant time scales we employ various analytical +tools, including auto-correlation analysis, weighted norm of the score-function +analysis, and Kolmogorov-Smirnov Gaussianity test. The tools guide us to +establishing that the Kernel Intersection Distance, a metric comparing the +quality of synthetic samples with real data samples, is minimized at the +optimal U-turn time. + +
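+
+ The U-Turn procedure can be sketched schematically: run the forward noising dynamics only up to a
+ shortened time, then start the reverse denoising from that exact state. The variance-preserving
+ discretization, the constant noise rate `beta`, and the user-supplied `score_fn` below are
+ assumptions, not the authors' implementation.
+
+```python
+import numpy as np
+
+def u_turn_sample(x0, score_fn, t_u=0.4, n_steps=400, beta=8.0):
+    dt = t_u / n_steps
+    x = x0.copy()
+    # forward (noising) phase, stopped early at the U-turn time t_u
+    for _ in range(n_steps):
+        x += -0.5 * beta * x * dt + np.sqrt(beta * dt) * np.random.randn(*x.shape)
+    # reverse (denoising) phase, initialized at the concluding forward state
+    for i in reversed(range(n_steps)):
+        t = (i + 1) * dt
+        drift = -0.5 * beta * x - beta * score_fn(x, t)
+        x += -drift * dt + np.sqrt(beta * dt) * np.random.randn(*x.shape)
+    return x
+
+# toy check: for standard-normal data the true score is simply -x,
+# so the returned samples should stay approximately standard normal
+sample = u_turn_sample(np.random.randn(1000, 2), score_fn=lambda x, t: -x)
+print(sample.mean(axis=0), sample.std(axis=0))
+```
+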
+
+
+
+
+ + ☆ Semantify: Simplifying the Control of 3D Morphable Models using CLIP + + +
+ We present Semantify: a self-supervised method that utilizes the semantic +power of CLIP language-vision foundation model to simplify the control of 3D +morphable models. Given a parametric model, training data is created by +randomly sampling the model's parameters, creating various shapes and rendering +them. The similarity between the output images and a set of word descriptors is +calculated in CLIP's latent space. Our key idea is first to choose a small set +of semantically meaningful and disentangled descriptors that characterize the +3DMM, and then learn a non-linear mapping from scores across this set to the +parametric coefficients of the given 3DMM. The non-linear mapping is defined by +training a neural network without a human-in-the-loop. We present results on +numerous 3DMMs: body shape models, face shape and expression models, as well as +animal shapes. We demonstrate how our method defines a simple slider interface +for intuitive modeling, and show how the mapping can be used to instantly fit a +3D parametric body shape to in-the-wild images. + +
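+
+ A loose sketch of the described mapping: similarity scores of rendered shapes against a handful of
+ word descriptors (computed in CLIP space) are fed to a small network that regresses
+ morphable-model coefficients. The descriptor list, layer sizes, and the random stand-in for real
+ CLIP scores are placeholders.
+
+```python
+import torch
+import torch.nn as nn
+
+descriptors = ["muscular", "thin", "tall", "broad shoulders", "long legs"]
+
+mapper = nn.Sequential(           # descriptor scores -> morphable-model coefficients
+    nn.Linear(len(descriptors), 64),
+    nn.ReLU(),
+    nn.Linear(64, 10),            # e.g. 10 body-shape coefficients
+)
+
+clip_scores = torch.rand(1, len(descriptors))   # stand-in for CLIP similarities
+betas = mapper(clip_scores)
+print(betas.shape)   # torch.Size([1, 10])
+```
+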
+
+
+
+
+ + ☆ A Unified Query-based Paradigm for Camouflaged Instance Segmentation ACM MM2023 + + +
+ Due to the high similarity between camouflaged instances and the background, +the recently proposed camouflaged instance segmentation (CIS) faces challenges +in accurate localization and instance segmentation. To this end, inspired by +query-based transformers, we propose a unified query-based multi-task learning +framework for camouflaged instance segmentation, termed UQFormer, which builds +a set of mask queries and a set of boundary queries to learn a shared composed +query representation and efficiently integrates global camouflaged object +region and boundary cues, for simultaneous instance segmentation and instance +boundary detection in camouflaged scenarios. Specifically, we design a composed +query learning paradigm that learns a shared representation to capture object +region and boundary features by the cross-attention interaction of mask queries +and boundary queries in the designed multi-scale unified learning transformer +decoder. Then, we present a transformer-based multi-task learning framework for +simultaneous camouflaged instance segmentation and camouflaged instance +boundary detection based on the learned composed query representation, which +also forces the model to learn a strong instance-level query representation. +Notably, our model views the instance segmentation as a query-based direct set +prediction problem, without other post-processing such as non-maximal +suppression. Compared with 14 state-of-the-art approaches, our UQFormer +significantly improves the performance of camouflaged instance segmentation. +Our code will be available at https://github.com/dongbo811/UQFormer. + +
+
+ comment: This paper has been accepted by ACM MM2023 +
+
+
+
+
+ + ☆ PARIS: Part-level Reconstruction and Motion Analysis for Articulated + Objects ICCV 2023 + + +
+ We address the task of simultaneous part-level reconstruction and motion +parameter estimation for articulated objects. Given two sets of multi-view +images of an object in two static articulation states, we decouple the movable +part from the static part and reconstruct shape and appearance while predicting +the motion parameters. To tackle this problem, we present PARIS: a +self-supervised, end-to-end architecture that learns part-level implicit shape +and appearance models and optimizes motion parameters jointly without any 3D +supervision, motion, or semantic annotation. Our experiments show that our +method generalizes better across object categories, and outperforms baselines +and prior work that are given 3D point clouds as input. Our approach improves +reconstruction relative to state-of-the-art baselines with a Chamfer-L1 +distance reduction of 3.94 (45.2%) for objects and 26.79 (84.5%) for parts, and +achieves 5% error rate for motion estimation across 10 object categories. + Video summary at: https://youtu.be/tDSrROPCgUc + +
+
+ comment: Presented at ICCV 2023. Project website: + https://3dlg-hcvc.github.io/paris/ +
+
+
+
+
+ + ☆ DISBELIEVE: Distance Between Client Models is Very Essential for + Effective Local Model Poisoning Attacks MICCAI 2023 + + +
+ Federated learning is a promising direction to tackle the privacy issues +related to sharing patients' sensitive data. Often, federated systems in the +medical image analysis domain assume that the participating local clients are +\textit{honest}. Several studies report mechanisms through which a set of +malicious clients can be introduced that can poison the federated setup, +hampering the performance of the global model. To overcome this, robust +aggregation methods have been proposed that defend against those attacks. We +observe that most of the state-of-the-art robust aggregation methods are +heavily dependent on the distance between the parameters or gradients of +malicious clients and benign clients, which makes them prone to local model +poisoning attacks when the parameters or gradients of malicious and benign +clients are close. Leveraging this, we introduce DISBELIEVE, a local model +poisoning attack that creates malicious parameters or gradients such that their +distance to benign clients' parameters or gradients is low respectively but at +the same time their adverse effect on the global model's performance is high. +Experiments on three publicly available medical image datasets demonstrate the +efficacy of the proposed DISBELIEVE attack as it significantly lowers the +performance of the state-of-the-art \textit{robust aggregation} methods for +medical image analysis. Furthermore, compared to state-of-the-art local model +poisoning attacks, DISBELIEVE attack is also effective on natural images where +we observe a severe drop in classification performance of the global model for +multi-class classification on benchmark dataset CIFAR-10. + +
+
+ comment: Accepted by MICCAI 2023 - DeCaF +
+
+
+
+
+ + ☆ The Devil in the Details: Simple and Effective Optical Flow Synthetic + Data Generation + + +
+ Recent work on dense optical flow has shown significant progress, primarily
+ in a supervised learning manner requiring a large amount of labeled data. Due
+ to the expense of obtaining large-scale real-world data, computer graphics are
+ typically leveraged for constructing datasets. However, there is a common
+ belief that synthetic-to-real domain gaps limit generalization to real scenes.
+ In this paper, we show that the required characteristics in an optical flow
+ dataset are rather simple and present a simpler synthetic data generation
+ method that achieves a certain level of realism with compositions of elementary
+ operations. With 2D motion-based datasets, we systematically analyze the
+ simplest yet critical factors for generating synthetic datasets. Furthermore,
+ we propose a novel method of utilizing occlusion masks in a supervised setting
+ and observe that suppressing gradients on occluded regions serves as a powerful
+ initial state in the curriculum learning sense. The RAFT network initially
+ trained on our dataset outperforms the original RAFT on the two most
+ challenging online benchmarks, MPI Sintel and KITTI 2015.
+
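+
+ The occlusion-mask idea above admits a small sketch: supervise the flow loss only on visible
+ pixels so that gradients on occluded regions are suppressed early in training. The L1 loss form
+ and the mask convention (1 = visible) are assumptions.
+
+```python
+import torch
+
+def masked_flow_loss(pred_flow, gt_flow, occlusion_mask):
+    # occlusion_mask: (B, H, W), 1 where the pixel is visible, 0 where occluded
+    valid = occlusion_mask.unsqueeze(1).float()          # (B, 1, H, W)
+    err = (pred_flow - gt_flow).abs() * valid            # zero gradient on occluded pixels
+    return err.sum() / (valid.sum() * pred_flow.size(1)).clamp_min(1.0)
+
+loss = masked_flow_loss(torch.randn(2, 2, 64, 64),
+                        torch.randn(2, 2, 64, 64),
+                        torch.randint(0, 2, (2, 64, 64)))
+print(loss.item())
+```
+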
+
+
+
+
+ + ♻ ☆ Source-free Domain Adaptive Human Pose Estimation ICCV 2023 + + +
+ Human Pose Estimation (HPE) is widely used in various fields, including +motion analysis, healthcare, and virtual reality. However, the great expenses +of labeled real-world datasets present a significant challenge for HPE. To +overcome this, one approach is to train HPE models on synthetic datasets and +then perform domain adaptation (DA) on real-world data. Unfortunately, existing +DA methods for HPE neglect data privacy and security by using both source and +target data in the adaptation process. To this end, we propose a new task, +named source-free domain adaptive HPE, which aims to address the challenges of +cross-domain learning of HPE without access to source data during the +adaptation process. We further propose a novel framework that consists of three +models: source model, intermediate model, and target model, which explores the +task from both source-protect and target-relevant perspectives. The +source-protect module preserves source information more effectively while +resisting noise, and the target-relevant module reduces the sparsity of spatial +representations by building a novel spatial probability space, and +pose-specific contrastive learning and information maximization are proposed on +the basis of this space. Comprehensive experiments on several domain adaptive +HPE benchmarks show that the proposed method outperforms existing approaches by +a considerable margin. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Developability Approximation for Neural Implicits through Rank + Minimization + + +
+ Developability refers to the process of creating a surface without any +tearing or shearing from a two-dimensional plane. It finds practical +applications in the fabrication industry. An essential characteristic of a +developable 3D surface is its zero Gaussian curvature, which means that either +one or both of the principal curvatures are zero. This paper introduces a +method for reconstructing an approximate developable surface from a neural +implicit surface. The central idea of our method involves incorporating a +regularization term that operates on the second-order derivatives of the neural +implicits, effectively promoting zero Gaussian curvature. Implicit surfaces +offer the advantage of smoother deformation with infinite resolution, +overcoming the high polygonal constraints of state-of-the-art methods using +discrete representations. We draw inspiration from the properties of surface +curvature and employ rank minimization techniques derived from compressed +sensing. Experimental results on both developable and non-developable surfaces, +including those affected by noise, validate the generalizability of our method. + +
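+
+ A toy sketch of a rank-style regularizer on the second-order derivatives of a neural implicit, in
+ the spirit of promoting zero Gaussian curvature. Using the smallest singular value of the autograd
+ Hessian as the penalty is an assumption, not the paper's exact formulation.
+
+```python
+import torch
+
+def hessian_rank_penalty(sdf, points):
+    penalty = 0.0
+    for p in points:                       # per-point Hessian via autograd
+        H = torch.autograd.functional.hessian(sdf, p)
+        s = torch.linalg.svdvals(H)
+        penalty = penalty + s.min()        # push the Hessian toward rank deficiency
+    return penalty / len(points)
+
+# example implicit: a sphere SDF (non-developable, so the penalty is nonzero)
+sdf = lambda x: x.norm() - 1.0
+pts = torch.randn(8, 3)
+print(hessian_rank_penalty(sdf, pts).item())
+```
+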
+
+
+
+
+ + ♻ ☆ SLIC: Large Receptive Field Learning with Self-Conditioned Adaptability + for Learned Image Compression + + +
+ Recently, transformers have been trending as replacements for CNNs in vision tasks,
+ including compression. This trend compels us to question the inherent
+ limitations of CNNs compared to transformers and to explore whether CNNs can be
+ enhanced to achieve the same or even better performance than transformers. We
+ aim to design a pure CNN-based model for compression, as most devices are
+ well optimized for CNNs. In our analysis, we find that the key strengths of
+ transformers lie in their dynamic weights and large receptive fields. To enable
+ CNNs with such properties, we propose a novel transform module with large
+ receptive field learning and self-conditioned adaptability for learned image
+ compression, named SLIC. Specifically, we enlarge the receptive field of
+ depth-wise convolution with suitable complexity and generate the weights
+ according to given conditions. In addition, we also investigate the
+ self-conditioned factor for channels. To prove the effectiveness of our
+ proposed transform module, we equip it with existing entropy models ChARM,
+ SCCTX, and SWAtten and we obtain models SLIC-ChARM, SLIC-SCCTX, and
+ SLIC-SWAtten. Extensive experiments demonstrate that our SLIC-ChARM, SLIC-SCCTX, and
+ SLIC-SWAtten have significant improvements over corresponding baselines and
+ achieve SOTA performances with suitable complexity on 5 test datasets (Kodak,
+ Tecnick, CLIC 20, CLIC 21, JPEGAI). Code will be available at
+ https://github.com/JiangWeibeta/SLIC.
+
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ C2F2NeUS: Cascade Cost Frustum Fusion for High Fidelity and + Generalizable Neural Surface Reconstruction ICCV2023 + + +
+ There is an emerging effort to combine the two popular 3D frameworks using
+ Multi-View Stereo (MVS) and Neural Implicit Surfaces (NIS) with a specific
+ focus on the few-shot / sparse-view setting. In this paper, we introduce a
+ novel integration scheme that combines multi-view stereo with neural signed
+ distance function representations, which potentially overcomes the limitations
+ of both methods. MVS uses per-view depth estimation and cross-view fusion to
+ generate accurate surfaces, while NIS relies on a common coordinate volume.
+ Based on this strategy, we propose to construct per-view cost frustums for finer
+ geometry estimation, and then fuse cross-view frustums and estimate the
+ implicit signed distance functions to tackle artifacts that are due to noise
+ and holes in the produced surface reconstruction. We further apply a cascade
+ frustum fusion strategy to effectively capture global-local information and
+ structural consistency. Finally, we apply cascade sampling and a
+ pseudo-geometric loss to foster stronger integration between the two
+ architectures. Extensive experiments demonstrate that our method reconstructs
+ robust surfaces and outperforms existing state-of-the-art methods.
+
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ Point Cloud Registration for LiDAR and Photogrammetric Data: a Critical + Synthesis and Performance Analysis on Classic and Deep Learning Algorithms + + +
+ Recent advances in computer vision and deep learning have shown promising
+ performance in estimating rigid/similarity transformations between unregistered
+ point clouds of complex objects and scenes. However, their performances are
+ mostly evaluated using a limited number of datasets from a single sensor (e.g.
+ Kinect or RealSense cameras), lacking a comprehensive overview of their
+ applicability in photogrammetric 3D mapping scenarios. In this work, we provide
+ a comprehensive review of the state-of-the-art (SOTA) point cloud registration
+ methods, where we analyze and evaluate these methods using a diverse set of
+ point cloud data from indoor to satellite sources. The quantitative analysis
+ allows for exploring the strengths, applicability, challenges, and future
+ trends of these methods. In contrast to existing analysis works that introduce
+ point cloud registration as a holistic process, our experimental analysis is
+ based on its inherent two-step process to better comprehend these approaches,
+ including feature/keypoint-based initial coarse registration and dense fine
+ registration through cloud-to-cloud (C2C) optimization. More than ten methods,
+ including classic hand-crafted, deep-learning-based feature correspondence, and
+ robust C2C methods were tested. We observed that the success rates of most of
+ the algorithms are below 40% over the datasets we tested, and there is
+ still a large margin of improvement upon existing algorithms concerning 3D
+ sparse correspondence search and the ability to register point clouds with
+ complex geometry and occlusions. With the evaluated statistics on three
+ datasets, we identify the best-performing methods for each step, provide our
+ recommendations, and outline future efforts.
+
+
+ comment: 7 figures +
+
+
+
+
+ + ♻ ☆ Knowledge Restore and Transfer for Multi-label Class-Incremental + Learning + + +
+ Current class-incremental learning research mainly focuses on single-label +classification tasks while multi-label class-incremental learning (MLCIL) with +more practical application scenarios is rarely studied. Although there have +been many anti-forgetting methods to solve the problem of catastrophic +forgetting in class-incremental learning, these methods have difficulty in +solving the MLCIL problem due to label absence and information dilution. In +this paper, we propose a knowledge restore and transfer (KRT) framework for +MLCIL, which includes a dynamic pseudo-label (DPL) module to restore the old +class knowledge and an incremental cross-attention(ICA) module to save +session-specific knowledge and transfer old class knowledge to the new model +sufficiently. Besides, we propose a token loss to jointly optimize the +incremental cross-attention module. Experimental results on MS-COCO and PASCAL +VOC datasets demonstrate the effectiveness of our method for improving +recognition performance and mitigating forgetting on multi-label +class-incremental learning tasks. + +
+
+
+
+
+ + ♻ ☆ Inadequately Pre-trained Models are Better Feature Extractors ICCV'2023 + + +
+ Pre-training has been a popular learning paradigm in the deep learning era,
+ especially in annotation-insufficient scenarios. Better ImageNet pre-trained
+ models have been demonstrated, from the perspective of architecture, by
+ previous research to have better transferability to downstream tasks. However,
+ in this paper, we find that during the same pre-training process, models at
+ middle epochs, which are inadequately pre-trained, can outperform fully trained
+ models when used as feature extractors (FE), while the fine-tuning (FT)
+ performance still grows with the source performance. This reveals that there is
+ no solid positive correlation between top-1 accuracy on ImageNet and the
+ transfer results on target data. Based on the contradictory phenomenon
+ between FE and FT that a better feature extractor does not necessarily fine-tune
+ better, we conduct comprehensive analyses on features before the softmax layer
+ to provide insightful explanations. Our discoveries suggest that, during
+ pre-training, models tend to first learn spectral components corresponding to
+ large singular values, while the residual components contribute more during
+ fine-tuning.
+
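+
+ A quick diagnostic in the spirit of the spectral observation above: decompose a feature matrix
+ with SVD and measure how much energy sits in the leading singular directions at different
+ checkpoints. The random stand-in features are illustrative only.
+
+```python
+import numpy as np
+
+def spectral_energy(features, k=10):
+    # features: (N, D) penultimate-layer activations for N samples
+    s = np.linalg.svd(features - features.mean(0), compute_uv=False)
+    return (s[:k] ** 2).sum() / (s ** 2).sum()   # energy in the top-k directions
+
+early_ckpt_feats = np.random.randn(512, 256)                 # stand-ins for real features
+late_ckpt_feats = np.random.randn(512, 256) @ np.diag(np.linspace(3, 0.1, 256))
+print(spectral_energy(early_ckpt_feats), spectral_energy(late_ckpt_feats))
+```
+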
+
+ comment: Accepted by ICCV'2023 +
+
+
+
+
+ + ♻ ☆ Decoupling Dynamic Monocular Videos for Dynamic View Synthesis + + +
+ The challenge of dynamic view synthesis from dynamic monocular videos, i.e., +synthesizing novel views for free viewpoints given a monocular video of a +dynamic scene captured by a moving camera, mainly lies in accurately modeling +the dynamic objects of a scene using limited 2D frames, each with a varying +timestamp and viewpoint. Existing methods usually require pre-processed 2D +optical flow and depth maps by off-the-shelf methods to supervise the network, +making them suffer from the inaccuracy of the pre-processed supervision and the +ambiguity when lifting the 2D information to 3D. In this paper, we tackle this +challenge in an unsupervised fashion. Specifically, we decouple the motion of +the dynamic objects into object motion and camera motion, respectively +regularized by proposed unsupervised surface consistency and patch-based +multi-view constraints. The former enforces the 3D geometric surfaces of moving +objects to be consistent over time, while the latter regularizes their +appearances to be consistent across different viewpoints. Such a fine-grained +motion formulation can alleviate the learning difficulty for the network, thus +enabling it to produce not only novel views with higher quality but also more +accurate scene flows and depth than existing methods requiring extra +supervision. + +
+
+
+
+
+ + ♻ ☆ TeViS:Translating Text Synopses to Video Storyboards + + +
+ A video storyboard is a roadmap for video creation that consists of
+ shot-by-shot images to visualize key plots in a text synopsis. Creating video
+ storyboards, however, remains challenging, as it not only requires cross-modal
+ association between high-level texts and images but also demands long-term
+ reasoning to make transitions smooth across shots. In this paper, we propose a
+ new task called Text synopsis to Video Storyboard (TeViS), which aims to
+ retrieve an ordered sequence of images as the video storyboard to visualize the
+ text synopsis. We construct a MovieNet-TeViS dataset based on the public
+ MovieNet dataset. It contains 10K text synopses, each paired with keyframes
+ manually selected from corresponding movies by considering both relevance and
+ cinematic coherence. To benchmark the task, we present strong CLIP-based
+ baselines and a novel VQ-Trans. VQ-Trans first encodes text synopsis and images
+ into a joint embedding space and uses vector quantization (VQ) to improve the
+ visual representation. Then, it auto-regressively generates a sequence of
+ visual features for retrieval and ordering. Experimental results demonstrate
+ that VQ-Trans significantly outperforms prior methods and the CLIP-based
+ baselines. Nevertheless, there is still a large gap compared to human
+ performance, suggesting room for promising future work. The code and data are
+ available at: \url{https://ruc-aimind.github.io/projects/TeViS/}
+
+
+ comment: Accepted to ACM Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ FlipNeRF: Flipped Reflection Rays for Few-shot Novel View Synthesis ICCV 2023 + + +
+ Neural Radiance Field (NeRF) has been a mainstream in novel view synthesis +with its remarkable quality of rendered images and simple architecture. +Although NeRF has been developed in various directions improving continuously +its performance, the necessity of a dense set of multi-view images still exists +as a stumbling block to progress for practical application. In this work, we +propose FlipNeRF, a novel regularization method for few-shot novel view +synthesis by utilizing our proposed flipped reflection rays. The flipped +reflection rays are explicitly derived from the input ray directions and +estimated normal vectors, and play a role of effective additional training rays +while enabling to estimate more accurate surface normals and learn the 3D +geometry effectively. Since the surface normal and the scene depth are both +derived from the estimated densities along a ray, the accurate surface normal +leads to more exact depth estimation, which is a key factor for few-shot novel +view synthesis. Furthermore, with our proposed Uncertainty-aware Emptiness Loss +and Bottleneck Feature Consistency Loss, FlipNeRF is able to estimate more +reliable outputs with reducing floating artifacts effectively across the +different scene structures, and enhance the feature-level consistency between +the pair of the rays cast toward the photo-consistent pixels without any +additional feature extractor, respectively. Our FlipNeRF achieves the SOTA +performance on the multiple benchmarks across all the scenarios. + +
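+
+ The flipped reflection ray can be sketched with the standard reflection formula: reflect the input
+ ray direction about the estimated surface normal to obtain an extra training ray. The exact
+ construction in the paper may differ from this minimal version.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def flipped_reflection_ray(ray_dir: torch.Tensor, normal: torch.Tensor) -> torch.Tensor:
+    d = F.normalize(ray_dir, dim=-1)
+    n = F.normalize(normal, dim=-1)
+    return d - 2.0 * (d * n).sum(-1, keepdim=True) * n   # r = d - 2(d.n)n
+
+r = flipped_reflection_ray(torch.tensor([[0.0, 0.0, -1.0]]),
+                           torch.tensor([[0.0, 0.0, 1.0]]))
+print(r)   # tensor([[0., 0., 1.]])
+```
+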
+
+ comment: ICCV 2023. Project Page: https://shawn615.github.io/flipnerf/ +
+
+
+
+
+ + ♻ ☆ RemoteNet: Remote Sensing Image Segmentation Network based on + Global-Local Information + + +
+ Remotely captured images possess an immense scale and object appearance +variability due to the complex scene. It becomes challenging to capture the +underlying attributes in the global and local context for their segmentation. +Existing networks struggle to capture the inherent features due to the +cluttered background. To address these issues, we propose a remote sensing +image segmentation network, RemoteNet, for semantic segmentation of remote +sensing images. We capture the global and local features by leveraging the +benefits of the transformer and convolution mechanisms. RemoteNet is an +encoder-decoder design that uses multi-scale features. We construct an +attention map module to generate channel-wise attention scores for fusing these +features. We construct a global-local transformer block (GLTB) in the decoder +network to support learning robust representations during a decoding phase. +Further, we designed a feature refinement module to refine the fused output of +the shallow stage encoder feature and the deepest GLTB feature of the decoder. +Experimental findings on the two public datasets show the effectiveness of the +proposed RemoteNet. + +
+
+
+
+
+ + ♻ ☆ 2D3D-MATR: 2D-3D Matching Transformer for Detection-free Registration + between Images and Point Clouds ICCV 2023 + + +
+ The commonly adopted detect-then-match approach to registration finds +difficulties in the cross-modality cases due to the incompatible keypoint +detection and inconsistent feature description. We propose, 2D3D-MATR, a +detection-free method for accurate and robust registration between images and +point clouds. Our method adopts a coarse-to-fine pipeline where it first +computes coarse correspondences between downsampled patches of the input image +and the point cloud and then extends them to form dense correspondences between +pixels and points within the patch region. The coarse-level patch matching is +based on transformer which jointly learns global contextual constraints with +self-attention and cross-modality correlations with cross-attention. To resolve +the scale ambiguity in patch matching, we construct a multi-scale pyramid for +each image patch and learn to find for each point patch the best matching image +patch at a proper resolution level. Extensive experiments on two public +benchmarks demonstrate that 2D3D-MATR outperforms the previous state-of-the-art +P2-Net by around $20$ percentage points on inlier ratio and over $10$ points on +registration recall. Our code and models are available at +https://github.com/minhaolee/2D3DMATR. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ InterTracker: Discovering and Tracking General Objects Interacting with + Hands in the Wild IROS 2023 + + +
+ Understanding human interaction with objects is an important research topic
+ for embodied Artificial Intelligence, and identifying the objects that humans
+ are interacting with is a primary problem for interaction understanding.
+ Existing methods rely on frame-based detectors to locate interacting objects.
+ However, this approach is susceptible to heavy occlusion, background clutter,
+ and distracting objects. To address these limitations, in this paper, we propose
+ to leverage spatio-temporal information of hand-object interaction to track
+ interactive objects under these challenging cases. Unlike generic object
+ tracking, which assumes prior knowledge of the object to be tracked, we first
+ utilize the spatial relation between hands and objects to adaptively discover
+ the interacting objects from the scene. Second, the consistency and continuity
+ of the appearance of objects between successive frames are exploited to track
+ the objects. With this tracking formulation, our method also benefits from
+ training on large-scale general object-tracking datasets. We further curate a
+ video-level hand-object interaction dataset for testing and evaluation from
+ 100DOH. The quantitative results demonstrate that our proposed method
+ outperforms the state-of-the-art methods. Specifically, in scenes with
+ continuous interaction with different objects, we achieve an impressive
+ improvement of about 10% as evaluated using the Average Precision (AP) metric.
+ Our qualitative findings also illustrate that our method can produce more
+ continuous trajectories for interacting objects.
+
+
+ comment: IROS 2023 +
+
+
+
+
+ + ♻ ☆ Unlearnable Examples Give a False Sense of Security: Piercing through + Unexploitable Data with Learnable Examples + + +
+ Safeguarding data from unauthorized exploitation is vital for privacy and +security, especially in recent rampant research in security breach such as +adversarial/membership attacks. To this end, \textit{unlearnable examples} +(UEs) have been recently proposed as a compelling protection, by adding +imperceptible perturbation to data so that models trained on them cannot +classify them accurately on original clean distribution. Unfortunately, we find +UEs provide a false sense of security, because they cannot stop unauthorized +users from utilizing other unprotected data to remove the protection, by +turning unlearnable data into learnable again. Motivated by this observation, +we formally define a new threat by introducing \textit{learnable unauthorized +examples} (LEs) which are UEs with their protection removed. The core of this +approach is a novel purification process that projects UEs onto the manifold of +LEs. This is realized by a new joint-conditional diffusion model which denoises +UEs conditioned on the pixel and perceptual similarity between UEs and LEs. +Extensive experiments demonstrate that LE delivers state-of-the-art countering +performance against both supervised UEs and unsupervised UEs in various +scenarios, which is the first generalizable countermeasure to UEs across +supervised learning and unsupervised learning. Our code is available at +\url{https://github.com/jiangw-0/LE_JCDP}. + +
+
+
+
+
+ + ♻ ☆ HumanMAC: Masked Motion Completion for Human Motion Prediction ICCV 2023 + + +
+ Human motion prediction is a classical problem in computer vision and
+ computer graphics, which has a wide range of practical applications. Previous
+ efforts achieve strong empirical performance based on an encoding-decoding
+ style. The methods of this style work by first encoding previous motions into
+ latent representations and then decoding the latent representations into
+ predicted motions. However, in practice, they are still unsatisfactory due to
+ several issues, including complicated loss constraints, cumbersome training
+ processes, and a limited ability to switch between different categories of
+ motion in prediction. In this paper, to address the above issues, we depart
+ from the foregoing style and propose a novel framework from a new perspective.
+ Specifically, our framework works in a masked completion fashion. In the
+ training stage, we learn a motion diffusion model that generates motions from
+ random noise. In the inference stage, with a denoising procedure, we make
+ motion predictions conditioned on observed motions to output more continuous
+ and controllable predictions. The proposed framework enjoys promising
+ algorithmic properties: it needs only one loss in optimization and is trained
+ in an end-to-end manner. Additionally, it switches between different categories
+ of motion effectively, which is significant in realistic tasks, e.g., the
+ animation task. Comprehensive experiments on benchmarks confirm the superiority
+ of the proposed framework. The project page is available at
+ https://lhchen.top/Human-MAC.
+
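+
+ Inference by masked completion can be sketched as follows: at every denoising step the observed
+ frames are overwritten with their (noised) ground-truth values, so the model only completes the
+ masked future. The `denoise_step` callable and the linear noise schedule are placeholders, not the
+ released model.
+
+```python
+import torch
+
+def masked_completion(denoise_step, observed, obs_mask, total_len, steps=50):
+    # observed: (B, T_obs, D) past frames; obs_mask: (B, T_obs, 1), 1 = observed
+    x = torch.randn(observed.size(0), total_len, observed.size(-1))
+    for t in reversed(range(steps)):
+        noise_level = (t + 1) / steps
+        noisy_obs = observed + noise_level * torch.randn_like(observed)
+        # re-inject the (noised) observations before every reverse step
+        x[:, :observed.size(1)] = torch.where(obs_mask.bool(), noisy_obs,
+                                              x[:, :observed.size(1)])
+        x = denoise_step(x, t)                 # one reverse diffusion step
+    return x                                   # completed motion sequence
+
+# dummy usage with a stand-in denoiser that just shrinks the noise
+completed = masked_completion(lambda x, t: 0.98 * x,
+                              observed=torch.zeros(1, 25, 66),
+                              obs_mask=torch.ones(1, 25, 1),
+                              total_len=125)
+print(completed.shape)   # torch.Size([1, 125, 66])
+```
+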
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ CAD-Estate: Large-scale CAD Model Annotation in RGB Videos + + +
+ We propose a method for annotating videos of complex multi-object scenes with +a globally-consistent 3D representation of the objects. We annotate each object +with a CAD model from a database, and place it in the 3D coordinate frame of +the scene with a 9-DoF pose transformation. Our method is semi-automatic and +works on commonly-available RGB videos, without requiring a depth sensor. Many +steps are performed automatically, and the tasks performed by humans are +simple, well-specified, and require only limited reasoning in 3D. This makes +them feasible for crowd-sourcing and has allowed us to construct a large-scale +dataset by annotating real-estate videos from YouTube. Our dataset CAD-Estate +offers 101k instances of 12k unique CAD models placed in the 3D representations +of 20k videos. In comparison to Scan2CAD, the largest existing dataset with CAD +model annotations on real scenes, CAD-Estate has 7x more instances and 4x more +unique CAD models. We showcase the benefits of pre-training a Mask2CAD model on +CAD-Estate for the task of automatic 3D object reconstruction and pose +estimation, demonstrating that it leads to performance improvements on the +popular Scan2CAD benchmark. The dataset is available at +https://github.com/google-research/cad-estate. + +
+
+ comment: Project page: https://github.com/google-research/cad-estate +
+
+
+
+
+ + ♻ ☆ MotionBERT: A Unified Perspective on Learning Human Motion + Representations ICCV 2023 + + +
+ We present a unified perspective on tackling various human-centric video +tasks by learning human motion representations from large-scale and +heterogeneous data resources. Specifically, we propose a pretraining stage in +which a motion encoder is trained to recover the underlying 3D motion from +noisy partial 2D observations. The motion representations acquired in this way +incorporate geometric, kinematic, and physical knowledge about human motion, +which can be easily transferred to multiple downstream tasks. We implement the +motion encoder with a Dual-stream Spatio-temporal Transformer (DSTformer) +neural network. It could capture long-range spatio-temporal relationships among +the skeletal joints comprehensively and adaptively, exemplified by the lowest +3D pose estimation error so far when trained from scratch. Furthermore, our +proposed framework achieves state-of-the-art performance on all three +downstream tasks by simply finetuning the pretrained motion encoder with a +simple regression head (1-2 layers), which demonstrates the versatility of the +learned motion representations. Code and models are available at +https://motionbert.github.io/ + +
+
+ comment: ICCV 2023 Camera Ready +
+
+
+
+
+ + ♻ ☆ Multiscale Attention via Wavelet Neural Operators for Vision + Transformers + + +
+ Transformers have achieved widespread success in computer vision. At their +heart, there is a Self-Attention (SA) mechanism, an inductive bias that +associates each token in the input with every other token through a weighted +basis. The standard SA mechanism has quadratic complexity with the sequence +length, which impedes its utility to long sequences appearing in high +resolution vision. Recently, inspired by operator learning for PDEs, Adaptive +Fourier Neural Operators (AFNO) were introduced for high resolution attention +based on global convolution that is efficiently implemented via FFT. However, +the AFNO global filtering cannot well represent small and moderate scale +structures that commonly appear in natural images. To leverage the +coarse-to-fine scale structures we introduce a Multiscale Wavelet Attention +(MWA) by leveraging wavelet neural operators which incurs linear complexity in +the sequence size. We replace the attention in ViT with MWA and our experiments +with CIFAR and Tiny-ImageNet classification demonstrate significant improvement +over alternative Fourier-based attentions such as AFNO and Global Filter +Network (GFN). + +
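+
+ A toy sketch of wavelet-based token mixing: a one-level 2D Haar transform, learnable per-subband
+ scaling as a stand-in for the mixing, and the inverse transform, with cost linear in the number of
+ tokens. Real wavelet neural operators use learned kernels in the wavelet domain; the diagonal
+ per-subband weights here are a simplification.
+
+```python
+import torch
+import torch.nn as nn
+
+def haar_dwt2(x):                     # x: (B, C, H, W), H and W even
+    p, q = x[..., 0::2, 0::2], x[..., 0::2, 1::2]
+    r, s = x[..., 1::2, 0::2], x[..., 1::2, 1::2]
+    a, h = (p + q + r + s) / 2, (p - q + r - s) / 2
+    v, d = (p + q - r - s) / 2, (p - q - r + s) / 2
+    return a, h, v, d
+
+def haar_idwt2(a, h, v, d):
+    B, C, H, W = a.shape
+    x = a.new_zeros(B, C, 2 * H, 2 * W)
+    x[..., 0::2, 0::2] = (a + h + v + d) / 2
+    x[..., 0::2, 1::2] = (a - h + v - d) / 2
+    x[..., 1::2, 0::2] = (a + h - v - d) / 2
+    x[..., 1::2, 1::2] = (a - h - v + d) / 2
+    return x
+
+class WaveletMixer(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.w = nn.Parameter(torch.ones(4, channels, 1, 1))  # per-subband weights
+
+    def forward(self, x):
+        a, h, v, d = haar_dwt2(x)
+        a, h, v, d = a * self.w[0], h * self.w[1], v * self.w[2], d * self.w[3]
+        return haar_idwt2(a, h, v, d)
+
+print(WaveletMixer(8)(torch.randn(1, 8, 16, 16)).shape)  # torch.Size([1, 8, 16, 16])
+```
+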
+
+
+
+
+ + ♻ ☆ Downstream-agnostic Adversarial Examples ICCV '23 + + +
+ Self-supervised learning usually uses a large amount of unlabeled data to
+pre-train an encoder which can be used as a general-purpose feature extractor,
+such that downstream users only need to perform fine-tuning operations to enjoy
+the benefit of large models. Despite this promising prospect, the security of
+pre-trained encoders has not been thoroughly investigated yet, especially when
+the pre-trained encoder is publicly available for commercial use.
+ In this paper, we propose AdvEncoder, the first framework for generating
+downstream-agnostic universal adversarial examples based on the pre-trained
+encoder. AdvEncoder aims to construct a universal adversarial perturbation or
+patch for a set of natural images that can fool all the downstream tasks
+inheriting the victim pre-trained encoder. Unlike traditional adversarial
+example works, the pre-trained encoder only outputs feature vectors rather than
+classification labels. Therefore, we first exploit the high-frequency component
+information of the image to guide the generation of adversarial examples. Then
+we design a generative attack framework to construct adversarial
+perturbations/patches by learning the distribution of the attack surrogate
+dataset, improving their attack success rates and transferability. Our results
+show that an attacker can successfully attack downstream tasks without knowing
+either the pre-training dataset or the downstream dataset. We also tailor four
+defenses for pre-trained encoders, the results of which further prove the
+attack ability of AdvEncoder.
+
+
+ comment: This paper has been accepted by the International Conference on + Computer Vision (ICCV '23, October 2--6, 2023, Paris, France) +
+
+
+
+
+ + ♻ ☆ NeuralReshaper: Single-image Human-body Retouching with Deep Neural + Networks + + +
+ In this paper, we present NeuralReshaper, a novel method for semantic +reshaping of human bodies in single images using deep generative networks. To +achieve globally coherent reshaping effects, our approach follows a +fit-then-reshape pipeline, which first fits a parametric 3D human model to a +source human image and then reshapes the fitted 3D model with respect to +user-specified semantic attributes. Previous methods rely on image warping to +transfer 3D reshaping effects to the entire image domain and thus often cause +distortions in both foreground and background. In contrast, we resort to +generative adversarial nets conditioned on the source image and a 2D warping +field induced by the reshaped 3D model, to achieve more realistic reshaping +results. Specifically, we separately encode the foreground and background +information in the source image using a two-headed UNet-like generator, and +guide the information flow from the foreground branch to the background branch +via feature space warping. Furthermore, to deal with the lack-of-data problem +that no paired data exist (i.e., the same human bodies in varying shapes), we +introduce a novel self-supervised strategy to train our network. Unlike +previous methods that often require manual efforts to correct undesirable +artifacts caused by incorrect body-to-image fitting, our method is fully +automatic. Extensive experiments on both indoor and outdoor datasets +demonstrate the superiority of our method over previous approaches. + +
+
+
+
+
+ + ♻ ☆ Rethinking Mobile Block for Efficient Attention-based Models + + +
+ This paper focuses on developing modern, efficient, lightweight models for
+dense predictions while trading off parameters, FLOPs, and performance. The
+Inverted Residual Block (IRB) serves as the infrastructure for lightweight
+CNNs, but no counterpart has been recognized by attention-based studies. This
+work rethinks lightweight infrastructure from the efficient IRB and effective
+components of the Transformer from a unified perspective, extending the
+CNN-based IRB to attention-based models and abstracting a one-residual Meta
+Mobile Block (MMB) for lightweight model design. Following a simple but
+effective design criterion, we deduce a modern Inverted Residual Mobile Block
+(iRMB) and build a ResNet-like Efficient MOdel (EMO) using only iRMBs for
+downstream tasks. Extensive experiments on the ImageNet-1K, COCO2017, and
+ADE20K benchmarks demonstrate the superiority of our EMO over state-of-the-art
+methods: EMO-1M/2M/5M achieve 71.5, 75.1, and 78.4 Top-1 accuracy, surpassing
+equal-order CNN- and attention-based models while balancing parameters,
+efficiency, and accuracy well, and running 2.8-4.0x faster than EdgeNeXt on an
+iPhone 14.
+
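+ For reference, a minimal PyTorch sketch of the classic Inverted Residual Block
+that the paper takes as the CNN-side starting point (this is the generic IRB,
+not the paper's iRMB, which additionally folds in attention-style components):
+
+    import torch
+    import torch.nn as nn
+
+    class InvertedResidual(nn.Module):
+        """Expand -> depthwise conv -> project, with a single residual."""
+        def __init__(self, dim, expansion=4):
+            super().__init__()
+            hidden = dim * expansion
+            self.block = nn.Sequential(
+                nn.Conv2d(dim, hidden, 1, bias=False),              # expand
+                nn.BatchNorm2d(hidden), nn.SiLU(),
+                nn.Conv2d(hidden, hidden, 3, padding=1,
+                          groups=hidden, bias=False),               # depthwise
+                nn.BatchNorm2d(hidden), nn.SiLU(),
+                nn.Conv2d(hidden, dim, 1, bias=False),              # project
+                nn.BatchNorm2d(dim),
+            )
+
+        def forward(self, x):
+            return x + self.block(x)
+
+    y = InvertedResidual(32)(torch.randn(1, 32, 56, 56))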
+
+
+
+
+ + ♻ ☆ WaterScenes: A Multi-Task 4D Radar-Camera Fusion Dataset and Benchmark + for Autonomous Driving on Water Surfaces + + +
+ Autonomous driving on water surfaces plays an essential role in executing
+hazardous and time-consuming missions, such as maritime surveillance, survivor
+rescue, environmental monitoring, hydrographic mapping and waste cleaning. This
+work presents WaterScenes, the first multi-task 4D radar-camera fusion dataset
+for autonomous driving on water surfaces. Equipped with a 4D radar and a
+monocular camera, our Unmanned Surface Vehicle (USV) proffers all-weather
+solutions for discerning object-related information, including color, shape,
+texture, range, velocity, azimuth, and elevation. Focusing on typical static
+and dynamic objects on water surfaces, we label the camera images and radar
+point clouds at pixel-level and point-level, respectively. In addition to basic
+perception tasks, such as object detection, instance segmentation and semantic
+segmentation, we also provide annotations for free-space segmentation and
+waterline segmentation. Leveraging the multi-task and multi-modal data, we
+conduct benchmark experiments on the uni-modality of radar and camera, as well
+as the fused modalities. Experimental results demonstrate that 4D radar-camera
+fusion can considerably improve the accuracy and robustness of perception on
+water surfaces, especially in adverse lighting and weather conditions. The
+WaterScenes dataset is public at https://waterscenes.github.io.
+
+
+
+
+
+ + ♻ ☆ FusionAD: Multi-modality Fusion for Prediction and Planning Tasks of + Autonomous Driving + + +
+ Building a multi-modality, multi-task neural network toward accurate and
+robust performance is a de-facto standard in the perception tasks of autonomous
+driving. However, leveraging data from multiple sensors to jointly optimize the
+prediction and planning tasks remains largely unexplored. In this paper, we
+present FusionAD, to the best of our knowledge the first unified framework that
+fuses information from the two most critical sensors, camera and LiDAR, and
+goes beyond the perception task. Concretely, we first build a transformer-based
+multi-modality fusion network to effectively produce fusion-based features. In
+contrast to the camera-based end-to-end method UniAD, we then establish
+fusion-aided modality-aware prediction and status-aware planning modules,
+dubbed FMSPnP, that take advantage of the multi-modality features. We conduct
+extensive experiments on the commonly used nuScenes benchmark; our FusionAD
+achieves state-of-the-art performance, surpassing baselines by 15% on average
+on perception tasks such as detection and tracking and by 10% on occupancy
+prediction accuracy, reducing the prediction error in ADE score from 0.708 to
+0.389 and the collision rate from 0.31% to only 0.12%.
+
+
+
+
+
+ + ♻ ☆ Decision-BADGE: Decision-based Adversarial Batch Attack with Directional + Gradient Estimation + + +
+ The susceptibility of deep neural networks (DNNs) to adversarial examples has
+prompted an increase in the deployment of adversarial attacks. Image-agnostic
+universal adversarial perturbations (UAPs) are much more threatening, but many
+limitations exist to implementing UAPs in real-world scenarios where only
+binary decisions are returned. In this research, we propose Decision-BADGE, a
+novel method to craft universal adversarial perturbations for executing
+decision-based black-box attacks. To optimize the perturbation with decisions,
+we addressed two challenges, namely the magnitude and the direction of the
+gradient. First, we use a batch loss, based on differences from the
+ground-truth distributions and on decisions accumulated over batches, to
+determine the magnitude of the gradient. This magnitude is applied in the
+direction given by revised simultaneous perturbation stochastic approximation
+(SPSA) to update the perturbation. This simple yet efficient method can be
+easily extended to score-based attacks as well as targeted attacks.
+Experimental validation across multiple victim models demonstrates that
+Decision-BADGE outperforms existing attack methods, including image-specific
+and score-based attacks. In particular, our proposed method shows a superior
+success rate with less training time. The research also shows that
+Decision-BADGE can successfully deceive unseen victim models and accurately
+target specific classes.
+
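+ To make the SPSA-style estimation concrete, a hedged sketch of estimating an
+update direction for a universal perturbation from hard-label decisions only
+(all names are hypothetical; this is not the authors' implementation):
+
+    import numpy as np
+
+    def spsa_direction(decision_fn, images, labels, delta, c=0.01, samples=32):
+        """Probe the decision function with random +/-1 masks and average the
+        finite differences of the batch error rate, SPSA-style."""
+        grad = np.zeros_like(delta)
+        for _ in range(samples):
+            probe = np.random.choice([-1.0, 1.0], size=delta.shape)
+            err_hi = np.mean(decision_fn(images + delta + c * probe) != labels)
+            err_lo = np.mean(decision_fn(images + delta - c * probe) != labels)
+            grad += (err_hi - err_lo) / (2 * c) * probe  # +/-1 mask is its own inverse
+        return grad / samples   # the attacker ascends this direction to raise the error rate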
+
+ comment: 9 pages (7 pages except for references), 4 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ On the Trustworthiness Landscape of State-of-the-art Generative Models: + A Comprehensive Survey + + +
+ Diffusion models and large language models have emerged as leading-edge
+generative models and have sparked a revolutionary impact on various aspects of
+human life. However, the practical implementation of these models has also
+exposed inherent risks, highlighting their dual nature and raising concerns
+regarding their trustworthiness. Despite the abundance of literature on this
+subject, a comprehensive survey specifically delving into the intersection of
+large-scale generative models and their trustworthiness remains largely absent.
+To bridge this gap, this paper investigates both the long-standing and emerging
+threats associated with these models across four fundamental dimensions:
+privacy, security, fairness, and responsibility. In this way, we construct an
+extensive map outlining the trustworthiness of these models, while also
+providing practical recommendations and identifying future directions. These
+efforts are crucial for promoting the trustworthy deployment of these models,
+ultimately benefiting society as a whole.
+
+
+ comment: draft version +
+
+
+
+
+ + ♻ ☆ Lifelong-MonoDepth: Lifelong Learning for Multi-Domain Monocular Metric + Depth Estimation + + +
+ With the rapid advancements in autonomous driving and robot navigation, there +is a growing demand for lifelong learning models capable of estimating metric +(absolute) depth. Lifelong learning approaches potentially offer significant +cost savings in terms of model training, data storage, and collection. However, +the quality of RGB images and depth maps is sensor-dependent, and depth maps in +the real world exhibit domain-specific characteristics, leading to variations +in depth ranges. These challenges limit existing methods to lifelong learning +scenarios with small domain gaps and relative depth map estimation. To +facilitate lifelong metric depth learning, we identify three crucial technical +challenges that require attention: i) developing a model capable of addressing +the depth scale variation through scale-aware depth learning, ii) devising an +effective learning strategy to handle significant domain gaps, and iii) +creating an automated solution for domain-aware depth inference in practical +applications. Based on the aforementioned considerations, in this paper, we +present i) a lightweight multi-head framework that effectively tackles the +depth scale imbalance, ii) an uncertainty-aware lifelong learning solution that +adeptly handles significant domain gaps, and iii) an online domain-specific +predictor selection method for real-time inference. Through extensive numerical +studies, we show that the proposed method can achieve good efficiency, +stability, and plasticity, leading the benchmarks by 8% to 15%. + +
+
+
+
+
+ + ♻ ☆ Logistic-Normal Likelihoods for Heteroscedastic Label Noise + + +
+ A natural way of estimating heteroscedastic label noise in regression is to +model the observed (potentially noisy) target as a sample from a normal +distribution, whose parameters can be learned by minimizing the negative +log-likelihood. This formulation has desirable loss attenuation properties, as +it reduces the contribution of high-error examples. Intuitively, this behavior +can improve robustness against label noise by reducing overfitting. We propose +an extension of this simple and probabilistic approach to classification that +has the same desirable loss attenuation properties. Furthermore, we discuss and +address some practical challenges of this extension. We evaluate the +effectiveness of the method by measuring its robustness against label noise in +classification. We perform enlightening experiments exploring the inner +workings of the method, including sensitivity to hyperparameters, ablation +studies, and other insightful analyses. + +
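+ The regression formulation referenced above can be written down compactly; a
+minimal sketch of a heteroscedastic Gaussian negative log-likelihood head (the
+paper's contribution is the classification extension, which is not shown here):
+
+    import torch
+    import torch.nn as nn
+
+    class HeteroscedasticHead(nn.Module):
+        """Predict a mean and a log-variance per example; minimising the
+        Gaussian NLL down-weights (attenuates) high-error, noisy targets."""
+        def __init__(self, in_dim):
+            super().__init__()
+            self.mu = nn.Linear(in_dim, 1)
+            self.log_var = nn.Linear(in_dim, 1)
+
+        def forward(self, features, target):
+            mu, log_var = self.mu(features), self.log_var(features)
+            # 0.5 * [ (y - mu)^2 / sigma^2 + log sigma^2 ], constant dropped
+            nll = 0.5 * ((target - mu) ** 2 * torch.exp(-log_var) + log_var)
+            return nll.mean()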
+
+
+
+
+ + ♻ ☆ Towards Unified Text-based Person Retrieval: A Large-scale + Multi-Attribute and Language Search Benchmark + + +
+ In this paper, we introduce a large Multi-Attribute and Language Search
+dataset for text-based person retrieval, called MALS, and explore the
+feasibility of jointly pre-training on both attribute recognition and
+image-text matching tasks. In particular, MALS contains 1,510,330 image-text
+pairs, which is about 37.5 times larger than the prevailing CUHK-PEDES, and all
+images are annotated with 27 attributes. Considering the privacy concerns and
+annotation costs, we leverage off-the-shelf diffusion models to generate the
+dataset. To verify the feasibility of learning from the generated data, we
+develop a new joint Attribute Prompt Learning and Text Matching Learning (APTM)
+framework, considering the shared knowledge between attributes and text. As the
+name implies, APTM contains an attribute prompt learning stream and a text
+matching learning stream. (1) The attribute prompt learning leverages attribute
+prompts for image-attribute alignment, which enhances the text matching
+learning. (2) The text matching learning facilitates representation learning on
+fine-grained details and, in turn, boosts the attribute prompt learning.
+Extensive experiments validate the effectiveness of pre-training on MALS,
+achieving state-of-the-art retrieval performance via APTM on three challenging
+real-world benchmarks. In particular, APTM achieves consistent improvements of
++6.96%, +7.68%, and +16.95% Recall@1 accuracy on the CUHK-PEDES, ICFG-PEDES,
+and RSTPReid datasets, respectively, by a clear margin.
+
+
+
+
+
+ + ♻ ☆ Chaos to Order: A Label Propagation Perspective on Source-Free Domain + Adaptation ACM MM2023 + + +
+ Source-free domain adaptation (SFDA), where only a pre-trained source model
+is used to adapt to the target distribution, is a more general approach to
+achieving domain adaptation in the real world. However, it can be challenging
+to capture the inherent structure of the target features accurately due to the
+lack of supervised information on the target domain. By analyzing the
+clustering performance of the target features, we show that they still contain
+core features related to discriminative attributes but lack the collation of
+semantic information. Inspired by this insight, we present Chaos to Order
+(CtO), a novel approach for SFDA that strives to constrain semantic credibility
+and propagate label information among target subpopulations. CtO divides the
+target data into inner and outlier samples based on an adaptive threshold over
+the learning state, customizing the learning strategy to best fit the data
+properties. Specifically, inner samples are utilized for learning the
+intra-class structure thanks to their relatively well-clustered properties. The
+low-density outlier samples are regularized by input consistency to achieve
+high accuracy with respect to the ground-truth labels. In CtO, by employing
+different learning strategies to propagate the labels from the inner local
+samples to the outlier instances, the global samples are clustered from chaos
+to order. We further adaptively regulate the neighborhood affinity of the inner
+samples to constrain the local semantic credibility. In theoretical and
+empirical analyses, we demonstrate that our algorithm not only propagates from
+inner to outlier samples but also prevents local clustering from forming
+spurious clusters. Empirical evidence demonstrates that CtO outperforms the
+state of the art on three public benchmarks: Office-31, Office-Home, and VisDA.
+
+
+ comment: Accepted by ACM MM2023 +
+
+
+
+
+ + ♻ ☆ Towards Deeply Unified Depth-aware Panoptic Segmentation with + Bi-directional Guidance Learning ICCV 2023 + + +
+ Depth-aware panoptic segmentation is an emerging topic in computer vision +which combines semantic and geometric understanding for more robust scene +interpretation. Recent works pursue unified frameworks to tackle this challenge +but mostly still treat it as two individual learning tasks, which limits their +potential for exploring cross-domain information. We propose a deeply unified +framework for depth-aware panoptic segmentation, which performs joint +segmentation and depth estimation both in a per-segment manner with identical +object queries. To narrow the gap between the two tasks, we further design a +geometric query enhancement method, which is able to integrate scene geometry +into object queries using latent representations. In addition, we propose a +bi-directional guidance learning approach to facilitate cross-task feature +learning by taking advantage of their mutual relations. Our method sets the new +state of the art for depth-aware panoptic segmentation on both Cityscapes-DVPS +and SemKITTI-DVPS datasets. Moreover, our guidance learning approach is shown +to deliver performance improvement even under incomplete supervision labels. + +
+
+ comment: to be published in ICCV 2023 +
+
+
+
+
+ + ♻ ☆ A jet tagging algorithm of graph network with HaarPooling message + passing + + +
+ Recently, graph neural network (GNN) methods have been applied to problems in
+high energy physics (HEP) and have shown great potential for quark-gluon
+tagging with graph representations of jet events. In this paper, we introduce a
+GNN approach combined with a HaarPooling operation to analyze the events,
+called the HaarPooling Message Passing neural network (HMPNet). In HMPNet,
+HaarPooling not only extracts graph features, but also embeds additional
+information obtained by k-means clustering of different particle features. We
+construct HaarPooling from five different feature sets: absolute energy
+$\log E$, transverse momentum $\log p_T$, relative coordinates
+$(\Delta\eta,\Delta\phi)$, and the mixed sets $(\log E, \log p_T)$ and
+$(\log E, \log p_T, \Delta\eta, \Delta\phi)$. The results show that an
+appropriate selection of information for HaarPooling enhances the accuracy of
+quark-gluon tagging: adding the extra $\log p_T$ information to HMPNet
+outperforms all the other choices, whereas adding the relative coordinates
+$(\Delta\eta,\Delta\phi)$ is not very effective. This implies that adding
+effective particle features via HaarPooling achieves much better results than a
+pure message passing neural network (MPNN) alone, demonstrating a significant
+improvement in feature extraction from the pooling process. Finally, we compare
+the HMPNet study, ordered by $p_T$, with other studies and show that HMPNet is
+also a good choice of GNN algorithm for jet tagging.
+
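+ The clustering step that feeds HaarPooling can be illustrated with a small
+sketch: cluster jet constituents by selected kinematic features and append the
+assignments as extra node features (a hypothetical helper, not the HMPNet code):
+
+    import numpy as np
+    from sklearn.cluster import KMeans
+
+    def kmeans_node_features(log_e, log_pt, n_clusters=8):
+        """Cluster particles by (log E, log pT) and return one-hot cluster
+        assignments to append to the per-particle node features."""
+        feats = np.stack([log_e, log_pt], axis=1)
+        labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(feats)
+        return np.eye(n_clusters)[labels]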
+
+
+
+
+ + ♻ ☆ MAFW: A Large-scale, Multi-modal, Compound Affective Database for + Dynamic Facial Expression Recognition in the Wild ACM MM'22 + + +
+ Dynamic facial expression recognition (FER) databases provide important data +support for affective computing and applications. However, most FER databases +are annotated with several basic mutually exclusive emotional categories and +contain only one modality, e.g., videos. The monotonous labels and modality +cannot accurately imitate human emotions and fulfill applications in the real +world. In this paper, we propose MAFW, a large-scale multi-modal compound +affective database with 10,045 video-audio clips in the wild. Each clip is +annotated with a compound emotional category and a couple of sentences that +describe the subjects' affective behaviors in the clip. For the compound +emotion annotation, each clip is categorized into one or more of the 11 +widely-used emotions, i.e., anger, disgust, fear, happiness, neutral, sadness, +surprise, contempt, anxiety, helplessness, and disappointment. To ensure high +quality of the labels, we filter out the unreliable annotations by an +Expectation Maximization (EM) algorithm, and then obtain 11 single-label +emotion categories and 32 multi-label emotion categories. To the best of our +knowledge, MAFW is the first in-the-wild multi-modal database annotated with +compound emotion annotations and emotion-related captions. Additionally, we +also propose a novel Transformer-based expression snippet feature learning +method to recognize the compound emotions leveraging the expression-change +relations among different emotions and modalities. Extensive experiments on +MAFW database show the advantages of the proposed method over other +state-of-the-art methods for both uni- and multi-modal FER. Our MAFW database +is publicly available from https://mafw-database.github.io/MAFW. + +
+
+ comment: This paper has been accepted by ACM MM'22 +
+
+
+
+
+ + ♻ ☆ Aerial-Ground Person Re-ID ICME2023 + + +
+ Person re-ID matches persons across multiple non-overlapping cameras. Despite
+the increasing deployment of airborne platforms in surveillance, existing
+person re-ID benchmarks focus on ground-ground matching, with very limited
+effort on aerial-aerial matching. We propose a new benchmark dataset, AG-ReID,
+which performs person re-ID matching in a new setting: across aerial and ground
+cameras. Our dataset contains 21,983 images of 388 identities and 15 soft
+attributes for each identity. The data was collected by a UAV flying at
+altitudes between 15 and 45 meters and a ground-based CCTV camera on a
+university campus. Our dataset presents a novel elevated-viewpoint challenge
+for person re-ID due to the significant difference in person appearance across
+these cameras. We propose an explainable algorithm to guide the person re-ID
+model's training with soft attributes to address this challenge. Experiments
+demonstrate the efficacy of our method on the aerial-ground person re-ID task.
+The dataset will be published and the baseline codes will be open-sourced at
+https://github.com/huynguyen792/AG-ReID to facilitate research in this area.
+
+
+ comment: Published on IEEE International Conference on Multimedia and Expo + 2023 (ICME2023) +
+
+
+
+
+ + ♻ ☆ A General Implicit Framework for Fast NeRF Composition and Rendering + + +
+ A variety of Neural Radiance Fields (NeRF) methods have recently achieved
+remarkable success in high render speed. However, current accelerating methods
+are specialized and incompatible with various implicit methods, preventing
+real-time composition over various types of NeRF works. Because NeRF relies on
+sampling along rays, it is possible to provide general guidance for
+acceleration. To that end, we propose a general implicit pipeline for composing
+NeRF objects quickly. Our method enables the casting of dynamic shadows within
+or between objects using analytical light sources while allowing multiple NeRF
+objects to be seamlessly placed and rendered together with any arbitrary rigid
+transformations. Mainly, our work introduces a new surface representation known
+as Neural Depth Fields (NeDF) that quickly determines the spatial relationship
+between objects by allowing direct intersection computation between rays and
+implicit surfaces. It leverages an intersection neural network to query NeRF
+for acceleration instead of depending on an explicit spatial structure. Our
+proposed method is the first to enable both the progressive and interactive
+composition of NeRF objects. Additionally, it also serves as a previewing
+plugin for a range of existing NeRF works.
+
+
+ comment: 7 pages for main content +
+
+
+
+
+ + ♻ ☆ An Embarrassingly Simple Backdoor Attack on Self-supervised Learning ICCV '23 + + +
+ As a new paradigm in machine learning, self-supervised learning (SSL) is
+capable of learning high-quality representations of complex data without
+relying on labels. In addition to eliminating the need for labeled data,
+research has found that SSL improves the adversarial robustness over supervised
+learning since lacking labels makes it more challenging for adversaries to
+manipulate model predictions. However, the extent to which this robustness
+superiority generalizes to other types of attacks remains an open question.
+ We explore this question in the context of backdoor attacks. Specifically, we
+design and evaluate CTRL, an embarrassingly simple yet highly effective
+self-supervised backdoor attack. By only polluting a tiny fraction of training
+data (<= 1%) with indistinguishable poisoning samples, CTRL causes any
+trigger-embedded input to be misclassified to the adversary's designated class
+with a high probability (>= 99%) at inference time. Our findings suggest that
+SSL and supervised learning are comparably vulnerable to backdoor attacks. More
+importantly, through the lens of CTRL, we study the inherent vulnerability of
+SSL to backdoor attacks. With both empirical and analytical evidence, we reveal
+that the representation invariance property of SSL, which benefits adversarial
+robustness, may also be the very reason making SSL highly susceptible to
+backdoor attacks. Our findings also imply that the existing defenses against
+supervised backdoor attacks are not easily retrofitted to the unique
+vulnerability of SSL.
+
+
+ comment: The 2023 International Conference on Computer Vision (ICCV '23) +
+
+
+
+
+ + ♻ ☆ Agent-Controller Representations: Principled Offline RL with Rich + Exogenous Information ICML 2023 + + +
+ Learning to control an agent from data collected offline in a rich
+pixel-based visual observation space is vital for real-world applications of
+reinforcement learning (RL). A major challenge in this setting is the presence
+of input information that is hard to model and irrelevant to controlling the
+agent. This problem has been approached by the theoretical RL community through
+the lens of exogenous information, i.e., any control-irrelevant information
+contained in observations. For example, a robot navigating in busy streets
+needs to ignore irrelevant information, such as other people walking in the
+background, textures of objects, or birds in the sky. In this paper, we focus
+on the setting with visually detailed exogenous information, and introduce new
+offline RL benchmarks offering the ability to study this problem. We find that
+contemporary representation learning techniques can fail on datasets where the
+noise is a complex and time-dependent process, which is prevalent in practical
+applications. To address this, we propose to use multi-step inverse models,
+which have seen a great deal of interest in the RL theory community, to learn
+Agent-Controller Representations for Offline-RL (ACRO). Despite being simple
+and requiring no reward, we show theoretically and empirically that the
+representation created by this objective greatly outperforms baselines.
+
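+ A minimal sketch of the multi-step inverse objective mentioned above: predict
+the action taken at time t from encodings of the observations at t and t+k, so
+that only control-relevant features need to be retained (names are hypothetical
+and the details differ from the full ACRO objective):
+
+    import torch
+    import torch.nn as nn
+
+    class MultiStepInverse(nn.Module):
+        def __init__(self, encoder, feat_dim, n_actions):
+            super().__init__()
+            self.encoder = encoder
+            self.head = nn.Linear(2 * feat_dim, n_actions)
+
+        def loss(self, obs_t, obs_tk, action_t):
+            # concatenate encodings of o_t and o_{t+k}, classify the action a_t
+            z = torch.cat([self.encoder(obs_t), self.encoder(obs_tk)], dim=-1)
+            return nn.functional.cross_entropy(self.head(z), action_t)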
+
+ comment: ICML 2023 +
+
+
+
+
+ + ♻ ☆ SegPGD: An Effective and Efficient Adversarial Attack for Evaluating and + Boosting Segmentation Robustness + + +
+ Deep neural network-based image classifiers are vulnerable to adversarial
+perturbations: they can be easily fooled by adding small, artificial, and
+imperceptible perturbations to input images. As one of the most effective
+defense strategies, adversarial training was proposed to address the
+vulnerability of classification models, where adversarial examples are created
+and injected into the training data during training. The attack and defense of
+classification models have been intensively studied in past years. Semantic
+segmentation, as an extension of classification, has also received great
+attention recently. Recent work shows that a large number of attack iterations
+is required to create effective adversarial examples that fool segmentation
+models. This observation makes both robustness evaluation and adversarial
+training on segmentation models challenging. In this work, we propose an
+effective and efficient segmentation attack method, dubbed SegPGD. Besides, we
+provide a convergence analysis to show that the proposed SegPGD can create more
+effective adversarial examples than PGD under the same number of attack
+iterations. Furthermore, we propose to apply our SegPGD as the underlying
+attack method for segmentation adversarial training. Since SegPGD can create
+more effective adversarial examples, adversarial training with our SegPGD can
+boost the robustness of segmentation models. Our proposals are also verified
+with experiments on popular segmentation model architectures and standard
+segmentation datasets.
+
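+ For context, a plain PGD loop against a segmentation model looks as follows;
+SegPGD as described above additionally re-weights correctly versus wrongly
+classified pixels across iterations, which this sketch deliberately omits:
+
+    import torch
+    import torch.nn.functional as F
+
+    def pgd_segmentation(model, images, masks, eps=8/255, alpha=2/255, steps=20):
+        """L_inf PGD with per-pixel cross-entropy on (B, C, H, W) logits."""
+        adv = images.clone().detach()
+        for _ in range(steps):
+            adv.requires_grad_(True)
+            loss = F.cross_entropy(model(adv), masks)
+            grad = torch.autograd.grad(loss, adv)[0]
+            adv = adv.detach() + alpha * grad.sign()
+            adv = images + torch.clamp(adv - images, -eps, eps)  # project to eps-ball
+            adv = adv.clamp(0, 1)
+        return adv.detach()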
+
+
+
+
+ + ♻ ☆ OmniLabel: A Challenging Benchmark for Language-Based Object Detection ICCV 2023 + + +
+ Language-based object detection is a promising direction towards building a +natural interface to describe objects in images that goes far beyond plain +category names. While recent methods show great progress in that direction, +proper evaluation is lacking. With OmniLabel, we propose a novel task +definition, dataset, and evaluation metric. The task subsumes standard- and +open-vocabulary detection as well as referring expressions. With more than 28K +unique object descriptions on over 25K images, OmniLabel provides a challenging +benchmark with diverse and complex object descriptions in a naturally +open-vocabulary setting. Moreover, a key differentiation to existing benchmarks +is that our object descriptions can refer to one, multiple or even no object, +hence, providing negative examples in free-form text. The proposed evaluation +handles the large label space and judges performance via a modified average +precision metric, which we validate by evaluating strong language-based +baselines. OmniLabel indeed provides a challenging test bed for future research +on language-based detection. + +
+
+ comment: ICCV 2023 Oral - Visit our project website at + https://www.omnilabel.org +
+
+
+
+
+ + ♻ ☆ Self-supervised Pseudo Multi-class Pre-training for Unsupervised Anomaly + Detection and Segmentation in Medical Images + + +
+ Unsupervised anomaly detection (UAD) methods are trained with normal (or +healthy) images only, but during testing, they are able to classify normal and +abnormal (or disease) images. UAD is an important medical image analysis (MIA) +method to be applied in disease screening problems because the training sets +available for those problems usually contain only normal images. However, the +exclusive reliance on normal images may result in the learning of ineffective +low-dimensional image representations that are not sensitive enough to detect +and segment unseen abnormal lesions of varying size, appearance, and shape. +Pre-training UAD methods with self-supervised learning, based on computer +vision techniques, can mitigate this challenge, but they are sub-optimal +because they do not explore domain knowledge for designing the pretext tasks, +and their contrastive learning losses do not try to cluster the normal training +images, which may result in a sparse distribution of normal images that is +ineffective for anomaly detection. In this paper, we propose a new +self-supervised pre-training method for MIA UAD applications, named Pseudo +Multi-class Strong Augmentation via Contrastive Learning (PMSACL). PMSACL +consists of a novel optimisation method that contrasts a normal image class +from multiple pseudo classes of synthesised abnormal images, with each class +enforced to form a dense cluster in the feature space. In the experiments, we +show that our PMSACL pre-training improves the accuracy of SOTA UAD methods on +many MIA benchmarks using colonoscopy, fundus screening and Covid-19 Chest +X-ray datasets. The code is made publicly available via +https://github.com/tianyu0207/PMSACL. + +
+
+ comment: Accepted to Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ MotionDeltaCNN: Sparse CNN Inference of Frame Differences in Moving + Camera Videos + + +
+ Convolutional neural network inference on video input is computationally +expensive and requires high memory bandwidth. Recently, DeltaCNN managed to +reduce the cost by only processing pixels with significant updates over the +previous frame. However, DeltaCNN relies on static camera input. Moving cameras +add new challenges in how to fuse newly unveiled image regions with already +processed regions efficiently to minimize the update rate - without increasing +memory overhead and without knowing the camera extrinsics of future frames. In +this work, we propose MotionDeltaCNN, a sparse CNN inference framework that +supports moving cameras. We introduce spherical buffers and padded convolutions +to enable seamless fusion of newly unveiled regions and previously processed +regions -- without increasing memory footprint. Our evaluation shows that we +outperform DeltaCNN by up to 90% for moving camera videos. + +
+
+
+
+
+ + ♻ ☆ Efficient Computation Sharing for Multi-Task Visual Scene Understanding ICCV 2023 + + +
+ Solving multiple visual tasks using individual models can be +resource-intensive, while multi-task learning can conserve resources by sharing +knowledge across different tasks. Despite the benefits of multi-task learning, +such techniques can struggle with balancing the loss for each task, leading to +potential performance degradation. We present a novel computation- and +parameter-sharing framework that balances efficiency and accuracy to perform +multiple visual tasks utilizing individually-trained single-task transformers. +Our method is motivated by transfer learning schemes to reduce computational +and parameter storage costs while maintaining the desired performance. Our +approach involves splitting the tasks into a base task and the other sub-tasks, +and sharing a significant portion of activations and parameters/weights between +the base and sub-tasks to decrease inter-task redundancies and enhance +knowledge sharing. The evaluation conducted on NYUD-v2 and PASCAL-context +datasets shows that our method is superior to the state-of-the-art +transformer-based multi-task learning techniques with higher accuracy and +reduced computational resources. Moreover, our method is extended to video +stream inputs, further reducing computational costs by efficiently sharing +information across the temporal domain as well as the task domain. Our codes +and models will be publicly available. + +
+
+ comment: Camera-Ready version. Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ DIME-FM: DIstilling Multimodal and Efficient Foundation Models ICCV 2023 + + +
+ Large Vision-Language Foundation Models (VLFM), such as CLIP, ALIGN and +Florence, are trained on large-scale datasets of image-caption pairs and +achieve superior transferability and robustness on downstream tasks, but they +are difficult to use in many practical applications due to their large size, +high latency and fixed architectures. Unfortunately, recent work shows training +a small custom VLFM for resource-limited applications is currently very +difficult using public and smaller-scale data. In this paper, we introduce a +new distillation mechanism (DIME-FM) that allows us to transfer the knowledge +contained in large VLFMs to smaller, customized foundation models using a +relatively small amount of inexpensive, unpaired images and sentences. We +transfer the knowledge from the pre-trained CLIP-ViTL/14 model to a ViT-B/32 +model, with only 40M public images and 28.4M unpaired public sentences. The +resulting model "Distill-ViT-B/32" rivals the CLIP-ViT-B/32 model pre-trained +on its private WiT dataset (400M image-text pairs): Distill-ViT-B/32 achieves +similar results in terms of zero-shot and linear-probing performance on both +ImageNet and the ELEVATER (20 image classification tasks) benchmarks. It also +displays comparable robustness when evaluated on five datasets with natural +distribution shifts from ImageNet. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Vision Backbone Enhancement via Multi-Stage Cross-Scale Attention + + +
+ Convolutional neural networks (CNNs) and vision transformers (ViTs) have +achieved remarkable success in various vision tasks. However, many +architectures do not consider interactions between feature maps from different +stages and scales, which may limit their performance. In this work, we propose +a simple add-on attention module to overcome these limitations via multi-stage +and cross-scale interactions. Specifically, the proposed Multi-Stage +Cross-Scale Attention (MSCSA) module takes feature maps from different stages +to enable multi-stage interactions and achieves cross-scale interactions by +computing self-attention at different scales based on the multi-stage feature +maps. Our experiments on several downstream tasks show that MSCSA provides a +significant performance boost with modest additional FLOPs and runtime. + +
+
+
+
+
+ + ♻ ☆ Continual Domain Adaptation on Aerial Images under Gradually Degrading + Weather + + +
+ Domain adaptation (DA) strives to mitigate the domain gap between the source
+domain where a model is trained and the target domain where the model is
+deployed. When a deep learning model is deployed on an aerial platform, it may
+face gradually degrading weather conditions during operation, leading to
+widening domain gaps between the training data and the encountered evaluation
+data. We synthesize two such gradually worsening weather conditions on real
+images from two existing aerial imagery datasets, generating a total of four
+benchmark datasets. Under the continual, or test-time, adaptation setting, we
+evaluate three DA models on our datasets: a baseline standard DA model and two
+continual DA models. In this setting, the models can access only one small
+portion, or one batch, of the target data at a time, and adaptation takes place
+continually over only one epoch of the data. The combination of the constraints
+of continual adaptation and gradually deteriorating weather conditions provides
+a practical DA scenario for aerial deployment. Among the evaluated models, we
+consider both convolutional and transformer architectures for comparison. We
+discover stability issues during adaptation for existing buffer-fed continual
+DA methods, and offer gradient normalization as a simple solution to curb
+training instability.
+
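+ The gradient-normalization fix mentioned above can take a very simple form; a
+sketch under the assumption that the whole gradient is rescaled to unit L2 norm
+before each adaptation step (the paper's exact variant may differ):
+
+    import torch
+
+    def normalized_step(model, loss, optimizer, eps=1e-12):
+        """Backpropagate, rescale the global gradient to unit norm, then step."""
+        optimizer.zero_grad()
+        loss.backward()
+        total = torch.sqrt(sum((p.grad ** 2).sum()
+                               for p in model.parameters() if p.grad is not None))
+        for p in model.parameters():
+            if p.grad is not None:
+                p.grad.div_(total + eps)
+        optimizer.step()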
+
+
+
+
+ + ♻ ☆ MonoNeRF: Learning a Generalizable Dynamic Radiance Field from Monocular + Videos ICCV 2023 + + +
+ In this paper, we target the problem of learning a generalizable dynamic
+radiance field from monocular videos. Different from most existing NeRF methods
+that are based on multiple views, monocular videos contain only one view at
+each timestamp, thereby suffering from ambiguity along the view direction when
+estimating point features and scene flows. Previous studies such as DynNeRF
+disambiguate point features by positional encoding, which is not transferable
+and severely limits the generalization ability. As a result, these methods have
+to train one independent model for each scene and suffer from heavy
+computational costs when applied to the increasing number of monocular videos
+in real-world applications. To address this, we propose MonoNeRF to
+simultaneously learn point features and scene flows with point trajectory and
+feature correspondence constraints across frames. More specifically, we learn
+an implicit velocity field to estimate point trajectories from temporal
+features with a Neural ODE, followed by a flow-based feature aggregation module
+to obtain spatial features along the point trajectory. We jointly optimize
+temporal and spatial features in an end-to-end manner. Experiments show that
+our MonoNeRF is able to learn from multiple scenes and support new applications
+such as scene editing, unseen frame synthesis, and fast novel scene adaptation.
+Codes are available at https://github.com/tianfr/MonoNeRF.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+
+
+
+ + Information Retrieval 26 + +
+
+
+ + ☆ Cross-Attribute Matrix Factorization Model with Shared User Embedding + + +
+ Over the past few years, deep learning has firmly established its prowess
+across various domains, including computer vision, speech recognition, and
+natural language processing. Motivated by this outstanding success, researchers
+have been directing their efforts towards applying deep learning techniques to
+recommender systems. Neural Collaborative Filtering (NCF) and Neural Matrix
+Factorization (NeuMF) replace the traditional inner product in matrix
+factorization with a neural architecture capable of learning complex and
+data-driven functions. While these models effectively capture user-item
+interactions, they overlook the specific attributes of both users and items.
+This can lead to robustness issues, especially for items and users that belong
+to the "long tail". Such challenges are commonly recognized in recommender
+systems as part of the cold-start problem. A direct and intuitive approach to
+address this issue is to leverage the features and attributes of the items and
+users themselves. In this paper, we introduce a refined NeuMF model that
+considers not only the interactions between users and items but also those
+across the associated attributes. Moreover, our proposed architecture features
+a shared user embedding that integrates seamlessly with the user-specific
+embeddings, improving robustness and effectively addressing the cold-start
+problem. Rigorous experiments on both the MovieLens and Pinterest datasets
+demonstrate the superiority of our Cross-Attribute Matrix Factorization model,
+particularly in scenarios characterized by higher dataset sparsity.
+
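+ A toy sketch of the idea of feeding attribute embeddings alongside ID
+embeddings into a NeuMF-style scorer (hypothetical layer sizes and names; the
+paper's shared-embedding design is more involved):
+
+    import torch
+    import torch.nn as nn
+
+    class AttributeNeuMF(nn.Module):
+        """Score user-item pairs from ID embeddings plus attribute embeddings,
+        so cold users/items still receive informative inputs."""
+        def __init__(self, n_users, n_items, n_uattr, n_iattr, dim=32):
+            super().__init__()
+            self.user = nn.Embedding(n_users, dim)
+            self.item = nn.Embedding(n_items, dim)
+            self.uattr = nn.Embedding(n_uattr, dim)   # e.g. age bucket
+            self.iattr = nn.Embedding(n_iattr, dim)   # e.g. category
+            self.mlp = nn.Sequential(nn.Linear(4 * dim, dim), nn.ReLU(),
+                                     nn.Linear(dim, 1))
+
+        def forward(self, u, i, ua, ia):
+            x = torch.cat([self.user(u), self.item(i),
+                           self.uattr(ua), self.iattr(ia)], dim=-1)
+            return torch.sigmoid(self.mlp(x)).squeeze(-1)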
+
+
+
+
+ + ☆ EasyEdit: An Easy-to-use Knowledge Editing Framework for Large Language + Models + + +
+ Large Language Models (LLMs) usually suffer from knowledge cutoff or fallacy
+issues, which means they are unaware of unseen events or generate text with
+incorrect facts owing to outdated or noisy data. To this end, many knowledge
+editing approaches for LLMs have emerged, aiming to subtly inject or edit
+updated knowledge or adjust undesired behavior while minimizing the impact on
+unrelated inputs. Nevertheless, due to significant differences among the
+various knowledge editing methods and variations in task setups, there is no
+standard implementation framework available for the community, which hinders
+practitioners from applying knowledge editing to applications. To address these
+issues, we propose EasyEdit, an easy-to-use knowledge editing framework for
+LLMs. It supports various cutting-edge knowledge editing approaches and can be
+readily applied to many well-known LLMs such as T5, GPT-J, and LlaMA.
+Empirically, we report knowledge editing results on LlaMA-2 with EasyEdit,
+demonstrating that knowledge editing surpasses traditional fine-tuning in terms
+of reliability and generalization. We have released the source code on GitHub
+at https://github.com/zjunlp/EasyEdit, along with Google Colab tutorials and
+comprehensive documentation for beginners to get started. Besides, we present
+an online system for real-time knowledge editing, and a demo video at
+http://knowlm.zjukg.cn/easyedit.mp4.
+
+
+ comment: The project website is https://github.com/zjunlp/EasyEdit +
+
+
+
+
+ + ☆ MM-GEF: Multi-modal representation meet collaborative filtering + + +
+ In modern e-commerce, item content features in various modalities offer
+accurate yet comprehensive information to recommender systems. The majority of
+previous work either focuses on learning effective item representations while
+modelling user-item interactions, or explores item-item relationships by
+analysing multi-modal features. Those methods, however, fail to incorporate the
+collaborative item-user-item relationships into the multi-modal feature-based
+item structure. In this work, we propose a graph-based item structure
+enhancement method, MM-GEF: Multi-Modal recommendation with Graph Early-Fusion,
+which effectively combines the latent item structure underlying multi-modal
+contents with collaborative signals. Instead of processing the content features
+of different modalities separately, we show that the early fusion of
+multi-modal features provides a significant improvement. MM-GEF learns refined
+item representations by injecting structural information obtained from both
+multi-modal and collaborative signals. Through extensive experiments on four
+publicly available datasets, we demonstrate systematic improvements of our
+method over state-of-the-art multi-modal recommendation methods.
+
+
+
+
+
+ + ☆ gSASRec: Reducing Overconfidence in Sequential Recommendation Trained + with Negative Sampling RecSys 2023 + + +
+ A large catalogue size is one of the central challenges in training
+recommendation models: with a large number of items, it is memory- and
+computationally-inefficient to compute scores for all items during training,
+forcing these models to deploy negative sampling. However, negative sampling
+increases the proportion of positive interactions in the training data, and
+therefore models trained with negative sampling tend to overestimate the
+probabilities of positive interactions, a phenomenon we call overconfidence.
+While the absolute values of the predicted scores or probabilities are not
+important for the ranking of retrieved recommendations, overconfident models
+may fail to estimate nuanced differences in the top-ranked items, resulting in
+degraded performance. In this paper, we show that overconfidence explains why
+the popular SASRec model underperforms when compared to BERT4Rec. This is
+contrary to the BERT4Rec authors' explanation that the difference in
+performance is due to the bi-directional attention mechanism. To mitigate
+overconfidence, we propose a novel Generalised Binary Cross-Entropy loss
+function (gBCE) and theoretically prove that it can mitigate overconfidence. We
+further propose the gSASRec model, an improvement over SASRec that deploys an
+increased number of negatives and the gBCE loss. We show through detailed
+experiments on three datasets that gSASRec does not exhibit the overconfidence
+problem. As a result, gSASRec can outperform BERT4Rec (e.g. +9.47% NDCG on the
+MovieLens-1M dataset), while requiring less training time (e.g. -73% training
+time on MovieLens-1M). Moreover, in contrast to BERT4Rec, gSASRec is suitable
+for large datasets that contain more than 1 million items.
+
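+ One plausible form of such a generalised loss, assuming the generalisation
+applies an exponent beta to the positive probability (a hedged sketch, not
+necessarily the exact gBCE definition from the paper; beta = 1 recovers the
+standard binary cross-entropy):
+
+    import torch
+    import torch.nn.functional as F
+
+    def gbce_loss(pos_logits, neg_logits, beta):
+        """-log sigma(s+)^beta for positives, -log(1 - sigma(s-)) for sampled
+        negatives; the exponent beta controls the calibration of the positive
+        probability under negative sampling."""
+        pos = -beta * F.logsigmoid(pos_logits)
+        neg = -F.logsigmoid(-neg_logits)    # log(1 - sigmoid(x)) = logsigmoid(-x)
+        return pos.mean() + neg.mean()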
+
+ comment: Accepted at ACM RecSys 2023 +
+
+
+
+
+ + ☆ Natural Language is All a Graph Needs + + +
+ The emergence of large-scale pre-trained language models, such as ChatGPT,
+has revolutionized various research fields in artificial intelligence.
+Transformer-based large language models (LLMs) have gradually replaced CNNs
+and RNNs, unifying the fields of computer vision and natural language
+processing. Compared with data that exists relatively independently, such as
+images, videos, or texts, a graph is a type of data that contains rich
+structural and relational information. Meanwhile, natural language, as one of
+the most expressive mediums, excels at describing complex structures. However,
+existing work on incorporating graph learning problems into the generative
+language modeling framework remains very limited. As the importance of language
+models continues to grow, it becomes essential to explore whether LLMs can also
+replace GNNs as the foundation model for graphs. In this paper, we propose
+InstructGLM (Instruction-finetuned Graph Language Model), systematically design
+highly scalable prompts based on natural language instructions, and use natural
+language to describe the geometric structure and node features of the graph for
+instruction-tuning an LLM to perform learning and inference on graphs in a
+generative manner. Our method exceeds all competitive GNN baselines on the
+ogbn-arxiv, Cora, and PubMed datasets, which demonstrates the effectiveness of
+our method and sheds light on generative language models replacing GNNs as the
+foundation model for graph machine learning.
+
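+ As a sketch of what describing a graph in natural language can look like, a
+hypothetical prompt builder for a node-classification instruction (the wording
+and format are illustrative, not the templates used in the paper):
+
+    def node_to_prompt(node_id, features, neighbors, task="node classification"):
+        """Verbalize a node's features and 1-hop structure for an
+        instruction-tuned language model."""
+        neigh = ", ".join(f"node {n}" for n in neighbors) or "no neighbours"
+        return (f"Task: {task}. Node {node_id} has features: {features}. "
+                f"It is connected to: {neigh}. "
+                f"Question: which category does node {node_id} belong to?")
+
+    print(node_to_prompt(42, "title='Graph attention networks'", [7, 13, 99]))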
+
+ comment: 21 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ Large Language Models for Information Retrieval: A Survey + + +
+ As a primary means of information acquisition, information retrieval (IR) +systems, such as search engines, have integrated themselves into our daily +lives. These systems also serve as components of dialogue, question-answering, +and recommender systems. The trajectory of IR has evolved dynamically from its +origins in term-based methods to its integration with advanced neural models. +While the neural models excel at capturing complex contextual signals and +semantic nuances, thereby reshaping the IR landscape, they still face +challenges such as data scarcity, interpretability, and the generation of +contextually plausible yet potentially inaccurate responses. This evolution +requires a combination of both traditional methods (such as term-based sparse +retrieval methods with rapid response) and modern neural architectures (such as +language models with powerful language understanding capacity). Meanwhile, the +emergence of large language models (LLMs), typified by ChatGPT and GPT-4, has +revolutionized natural language processing due to their remarkable language +understanding, generation, generalization, and reasoning abilities. +Consequently, recent research has sought to leverage LLMs to improve IR +systems. Given the rapid evolution of this research trajectory, it is necessary +to consolidate existing methodologies and provide nuanced insights through a +comprehensive overview. In this survey, we delve into the confluence of LLMs +and IR systems, including crucial aspects such as query rewriters, retrievers, +rerankers, and readers. Additionally, we explore promising directions within +this expanding field. + +
+
+
+
+
+ + ☆ UIPC-MF: User-Item Prototype Connection Matrix Factorization for + Explainable Collaborative Filtering + + +
+ Recommending items to potentially interested users has been an important +commercial task that faces two main challenges: accuracy and explainability. +While most collaborative filtering models rely on statistical computations on a +large scale of interaction data between users and items and can achieve high +performance, they often lack clear explanatory power. We propose UIPC-MF, a +prototype-based matrix factorization method for explainable collaborative +filtering recommendations. In UIPC-MF, both users and items are associated with +sets of prototypes, capturing general collaborative attributes. To enhance +explainability, UIPC-MF learns connection weights that reflect the associative +relations between user and item prototypes for recommendations. UIPC-MF +outperforms other prototype-based baseline methods in terms of Hit Ratio and +Normalized Discounted Cumulative Gain on three datasets, while also providing +better transparency. + +
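+ A hedged sketch of the prototype-connection idea (the layer names, softmax
+matching, and bilinear form are illustrative choices, not necessarily the exact
+UIPC-MF formulation):
+
+    import torch
+    import torch.nn as nn
+
+    class PrototypeScorer(nn.Module):
+        """Match users and items to prototype sets and score a pair through a
+        learned user-prototype/item-prototype connection matrix, which can be
+        inspected directly to explain a recommendation."""
+        def __init__(self, n_users, n_items, dim=32, n_proto=16):
+            super().__init__()
+            self.user = nn.Embedding(n_users, dim)
+            self.item = nn.Embedding(n_items, dim)
+            self.user_proto = nn.Parameter(torch.randn(n_proto, dim))
+            self.item_proto = nn.Parameter(torch.randn(n_proto, dim))
+            self.connect = nn.Parameter(torch.zeros(n_proto, n_proto))
+
+        def forward(self, u, i):
+            su = torch.softmax(self.user(u) @ self.user_proto.T, dim=-1)  # (B, P)
+            si = torch.softmax(self.item(i) @ self.item_proto.T, dim=-1)  # (B, P)
+            return torch.einsum('bp,pq,bq->b', su, self.connect, si)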
+
+
+
+
+ + ☆ The Scientometrics and Reciprocality Underlying Co-Authorship Panels in + Google Scholar Profiles + + +
+ Online academic profiles are used by scholars to reflect a desired image to
+their online audience. In Google Scholar, scholars can select a subset of
+co-authors for presentation in a central location on their profile using a
+social feature called the Co-authorship panel. In this work, we examine whether
+scientometrics and reciprocality can explain the observed selections. To this
+end, we scrape and thoroughly analyze a novel set of 120,000 Google Scholar
+profiles, ranging across four disciplines and various academic institutions.
+Our results suggest that scholars tend to favor co-authors with higher
+scientometrics over others for inclusion in their co-authorship panels.
+Interestingly, as one's own scientometrics are higher, the tendency to include
+co-authors with high scientometrics diminishes. Furthermore, we find that
+reciprocality is central to explaining scholars' selections.
+
+
+
+
+
+ + ☆ Discrete Conditional Diffusion for Reranking in Recommendation + + +
+ Reranking plays a crucial role in modern multi-stage recommender systems by
+rearranging the initial ranking list to model the interplay between items.
+Considering the inherent challenges of reranking, such as the combinatorial
+search space, some previous studies have adopted the evaluator-generator
+paradigm, with a generator producing feasible sequences and an evaluator
+selecting the best one based on estimated listwise utility. Inspired by the
+remarkable success of diffusion generative models, this paper explores the
+potential of diffusion models for generating high-quality sequences in
+reranking. However, we argue that it is nontrivial to use diffusion models as
+the generator in the context of recommendation. Firstly, diffusion models
+primarily operate in continuous data spaces, differing from the discrete data
+space of item permutations. Secondly, the recommendation task is different from
+conventional generation tasks, as the purpose of recommender systems is to
+fulfill user interests. Lastly, real-life recommender systems require
+efficiency, posing challenges for the inference of diffusion models. To
+overcome these challenges, we propose a novel Discrete Conditional Diffusion
+Reranking (DCDR) framework for recommendation. DCDR extends traditional
+diffusion models by introducing a discrete forward process with tractable
+posteriors, which adds noise to item sequences through step-wise discrete
+operations (e.g., swapping). Additionally, DCDR incorporates a conditional
+reverse process that generates item sequences conditioned on expected user
+responses. Extensive offline experiments conducted on public datasets
+demonstrate that DCDR outperforms state-of-the-art reranking methods.
+Furthermore, DCDR has been deployed in a real-world video app with over 300
+million daily active users, significantly enhancing online recommendation
+quality.
+
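+ The step-wise discrete corruption can be pictured with a tiny example: corrupt
+an item list by a number of random adjacent swaps that grows with the diffusion
+step (purely illustrative; DCDR's forward process is constructed to have
+tractable posteriors, which this sketch does not provide):
+
+    import random
+
+    def forward_swap_noise(sequence, step, max_steps):
+        """Apply step-dependent random adjacent swaps to an item permutation."""
+        seq = list(sequence)
+        n_swaps = max(1, round(len(seq) * step / max_steps))
+        for _ in range(n_swaps):
+            j = random.randrange(len(seq) - 1)
+            seq[j], seq[j + 1] = seq[j + 1], seq[j]
+        return seq
+
+    print(forward_swap_noise([101, 202, 303, 404, 505], step=3, max_steps=10))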
+
+
+
+
+ + ☆ AutoAssign+: Automatic Shared Embedding Assignment in Streaming + Recommendation + + +
+ In the domain of streaming recommender systems, conventional methods for +addressing new user IDs or item IDs typically involve assigning initial ID +embeddings randomly. However, this practice results in two practical +challenges: (i) Items or users with limited interactive data may yield +suboptimal prediction performance. (ii) Embedding new IDs or low-frequency IDs +necessitates consistently expanding the embedding table, leading to unnecessary +memory consumption. In light of these concerns, we introduce a reinforcement +learning-driven framework, namely AutoAssign+, that facilitates Automatic +Shared Embedding Assignment Plus. To be specific, AutoAssign+ utilizes an +Identity Agent as an actor network, which plays a dual role: (i) Representing +low-frequency IDs field-wise with a small set of shared embeddings to enhance +the embedding initialization, and (ii) Dynamically determining which ID +features should be retained or eliminated in the embedding table. The policy of +the agent is optimized with the guidance of a critic network. To evaluate the +effectiveness of our approach, we perform extensive experiments on three +commonly used benchmark datasets. Our experiment results demonstrate that +AutoAssign+ is capable of significantly enhancing recommendation performance by +mitigating the cold-start problem. Furthermore, our framework yields a +reduction in memory usage of approximately 20-30%, verifying its practical +effectiveness and efficiency for streaming recommender systems. + +
+
+
+
+
+ + ☆ Bridging Offline-Online Evaluation with a Time-dependent and Popularity + Bias-free Offline Metric for Recommenders KDD + + +
+ The evaluation of recommendation systems is a complex task. The offline and
+online evaluation metrics for recommender systems are ambiguous in their true
+objectives. The majority of recently published papers benchmark their methods
+using an ill-posed offline evaluation methodology that often fails to predict
+true online performance. Because of this, the impact that academic research has
+on the industry is reduced. The aim of our research is to investigate and
+compare the online performance of offline evaluation metrics. We show that
+penalizing popular items and considering the time of transactions during the
+evaluation significantly improves our ability to choose the best recommendation
+model for a live recommender system. Our results, averaged over five large
+real-world live datasets procured from recommenders, aim to help the academic
+community to better understand offline evaluation and optimization criteria
+that are more relevant for real applications of recommender systems.
+
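+ In the spirit of the two corrections described above (a popularity penalty and
+transaction-time awareness), a toy offline score might look as follows; this is
+an illustrative construction, not the metric evaluated in the paper:
+
+    import numpy as np
+
+    def debiased_hit_score(recommended, purchased, item_popularity,
+                           t_event, t_now, half_life_days=30.0):
+        """Count a hit more when the item is unpopular and the transaction is
+        recent (exponential time decay)."""
+        score = 0.0
+        for item in recommended:
+            if item in purchased:
+                novelty = -np.log(item_popularity.get(item, 1e-6))  # rarer -> larger
+                recency = 0.5 ** ((t_now - t_event[item]) / half_life_days)
+                score += novelty * recency
+        return score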
+
+ comment: Accepted to evalRS 2023@KDD +
+
+
+
+
+ + ☆ AutoSeqRec: Autoencoder for Efficient Sequential Recommendation CIKM 2023 + + +
+ Sequential recommendation demonstrates the capability to recommend items by +modeling the sequential behavior of users. Traditional methods typically treat +users as sequences of items, overlooking the collaborative relationships among +them. Graph-based methods incorporate collaborative information by utilizing +the user-item interaction graph. However, these methods sometimes face +challenges in terms of time complexity and computational efficiency. To address +these limitations, this paper presents AutoSeqRec, an incremental +recommendation model specifically designed for sequential recommendation tasks. +AutoSeqRec is based on autoencoders and consists of an encoder and three +decoders within the autoencoder architecture. These components consider both +the user-item interaction matrix and the rows and columns of the item +transition matrix. The reconstruction of the user-item interaction matrix +captures user long-term preferences through collaborative filtering. In +addition, the rows and columns of the item transition matrix represent the item +out-degree and in-degree hopping behavior, which allows for modeling the user's +short-term interests. When making incremental recommendations, only the input +matrices need to be updated, without the need to update parameters, which makes +AutoSeqRec very efficient. Comprehensive evaluations demonstrate that +AutoSeqRec outperforms existing methods in terms of accuracy, while showcasing +its robustness and efficiency. + +
+
+ comment: 10 pages, accepted by CIKM 2023 +
+
+
+
+
+ + ☆ A Survey on Point-of-Interest Recommendations Leveraging Heterogeneous + Data + + +
+ Tourism is an important application domain for recommender systems. In this +domain, recommender systems are for example tasked with providing personalized +recommendations for transportation, accommodation, points-of-interest (POIs), +or tourism services. Among these tasks, in particular the problem of +recommending POIs that are of likely interest to individual tourists has gained +growing attention in recent years. Providing POI recommendations to tourists +\emph{during their trip} can however be especially challenging due to the +variability of the users' context. With the rapid development of the Web and +today's multitude of online services, vast amounts of data from various sources +have become available, and these heterogeneous data sources represent a huge +potential to better address the challenges of in-trip POI recommendation +problems. In this work, we provide a comprehensive survey of published research +on POI recommendation between 2017 and 2022 from the perspective of +heterogeneous data sources. Specifically, we investigate which types of data +are used in the literature and which technical approaches and evaluation +methods are predominant. Among other aspects, we find that today's research +works often focus on a narrow range of data sources, leaving great potential +for future works that better utilize heterogeneous data sources and diverse +data types for improved in-trip recommendations. + +
+
+ comment: 35 pages, 19 figures, submitted to Information Technology & Tourism + (ITT) +
+
+
+
+
+ + ☆ Improving ICD-based semantic similarity by accounting for varying + degrees of comorbidity + + +
+ Finding similar patients is a common objective in precision medicine,
+ facilitating treatment outcome assessment and clinical decision support.
+ Choosing widely-available patient features and appropriate mathematical methods
+ for similarity calculations is crucial. International Statistical
+ Classification of Diseases and Related Health Problems (ICD) codes are used
+ worldwide to encode diseases and are available for nearly all patients.
+ Aggregated as sets consisting of primary and secondary diagnoses, they can
+ display a degree of comorbidity and reveal comorbidity patterns. It is possible
+ to compute the similarity of patients based on their ICD codes by using
+ semantic similarity algorithms. These algorithms have traditionally been
+ evaluated using a single-term, expert-rated data set.
+  However, real-world patient data often display varying degrees of documented
+ comorbidities that might impair algorithm performance. To account for this, we
+ present a scale term that considers documented comorbidity-variance. In this
+ work, we compared the performance of 80 combinations of established algorithms
+ in terms of semantic similarity based on ICD-code sets. The sets have been
+ extracted from patients with a C25.X (pancreatic cancer) primary diagnosis and
+ provide a variety of different combinations of ICD-codes. Using our scale term,
+ we obtained the best results with a combination of level-based information
+ content, Leacock & Chodorow concept similarity and bipartite graph matching for
+ the set similarities, reaching a correlation of 0.75 with our expert's ground
+ truth. Our results highlight the importance of accounting for comorbidity
+ variance while demonstrating how well current semantic similarity algorithms
+ perform.
+
+
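+ The set-level combination the authors report (pairwise concept similarity plus
+ bipartite graph matching) can be sketched as below; the concept-similarity
+ function is a caller-supplied placeholder standing in for, e.g., a rescaled
+ Leacock & Chodorow score, and the normalization choice is an assumption:
+
+ ```python
+ import numpy as np
+ from scipy.optimize import linear_sum_assignment
+
+ def set_similarity(codes_a, codes_b, concept_sim):
+     """Bipartite-matching similarity between two ICD code sets.
+
+     `concept_sim(c1, c2)` is any pairwise concept similarity in [0, 1];
+     here it is a placeholder supplied by the caller.
+     """
+     codes_a, codes_b = list(codes_a), list(codes_b)
+     sim = np.array([[concept_sim(a, b) for b in codes_b] for a in codes_a])
+     rows, cols = linear_sum_assignment(-sim)          # maximize total similarity
+     matched = sim[rows, cols].sum()
+     return matched / max(len(codes_a), len(codes_b))  # normalize by set size
+
+ # Toy example with a trivial similarity: 1.0 for identical codes, else 0.3.
+ toy_sim = lambda a, b: 1.0 if a == b else 0.3
+ print(set_similarity({"C25.0", "E11.9"}, {"C25.1", "E11.9", "I10"}, toy_sim))
+ ```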
+
+ comment: 11 pages, 6 figures, 1 table +
+
+
+
+
+ + ☆ Context-Aware Service Recommendation System for the Social Internet of + Things + + +
+ The Social Internet of Things (SIoT) enables interconnected smart devices to +share data and services, opening up opportunities for personalized service +recommendations. However, existing research often overlooks crucial aspects +that can enhance the accuracy and relevance of recommendations in the SIoT +context. Specifically, existing techniques tend to consider the extraction of +social relationships between devices and neglect the contextual presentation of +service reviews. This study aims to address these gaps by exploring the +contextual representation of each device-service pair. Firstly, we propose a +latent features combination technique that can capture latent feature +interactions, by aggregating the device-device relationships within the SIoT. +Then, we leverage Factorization Machines to model higher-order feature +interactions specific to each SIoT device-service pair to accomplish accurate +rating prediction. Finally, we propose a service recommendation framework for +SIoT based on review aggregation and feature learning processes. The +experimental evaluation demonstrates the framework's effectiveness in improving +service recommendation accuracy and relevance. + +
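+ For reference, the second-order Factorization Machine prediction that the
+ framework relies on can be written compactly as follows (a generic FM sketch;
+ the device-service feature construction itself is not shown, and the parameter
+ values are random stand-ins for learned ones):
+
+ ```python
+ import numpy as np
+
+ def fm_predict(x, w0, w, V):
+     """Second-order factorization machine prediction.
+
+     x : (d,) feature vector for one device-service pair
+     w0: global bias, w: (d,) linear weights, V: (d, k) factor matrix.
+     Uses the O(d*k) identity:
+     sum_{i<j} <v_i, v_j> x_i x_j
+       = 0.5 * sum_f [ (sum_i V[i,f] x_i)^2 - sum_i V[i,f]^2 x_i^2 ]
+     """
+     linear = w0 + w @ x
+     interactions = 0.5 * np.sum((V.T @ x) ** 2 - (V ** 2).T @ (x ** 2))
+     return linear + interactions
+
+ # Toy usage with random parameters.
+ rng = np.random.default_rng(0)
+ d, k = 8, 4
+ print(fm_predict(rng.random(d), 0.1, rng.normal(size=d), rng.normal(size=(d, k))))
+ ```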
+
+
+
+
+ + ☆ Knowledge Prompt-tuning for Sequential Recommendation + + +
+ Pre-trained language models (PLMs), which can be utilized to extract general
+ knowledge, have demonstrated strong performance in sequential recommendation
+ (SR). However, existing methods still lack domain knowledge and struggle
+ to capture users' fine-grained preferences. Meanwhile, many traditional SR
+ methods address this issue by integrating side information, but they suffer
+ from information loss. To summarize, we believe that a good recommendation
+ system should utilize both general and domain knowledge simultaneously.
+ Therefore, we introduce an external knowledge base and propose Knowledge
+ Prompt-tuning for Sequential Recommendation (\textbf{KP4SR}). Specifically, we
+ construct a set of relationship templates and transform a structured knowledge
+ graph (KG) into knowledge prompts to solve the problem of the semantic gap.
+ However, knowledge prompts disrupt the original data structure and introduce a
+ significant amount of noise. We further construct a knowledge tree and propose
+ a knowledge tree mask, which restores the data structure in a mask matrix form,
+ thus mitigating the noise problem. We evaluate KP4SR on three real-world
+ datasets, and experimental results show that our approach outperforms
+ state-of-the-art methods on multiple evaluation metrics. Specifically, compared
+ with PLM-based methods, our method improves NDCG@5 and HR@5 by
+ \textcolor{red}{40.65\%} and \textcolor{red}{36.42\%} on the books dataset,
+ \textcolor{red}{11.17\%} and \textcolor{red}{11.47\%} on the music dataset, and
+ \textcolor{red}{22.17\%} and \textcolor{red}{19.14\%} on the movies dataset,
+ respectively. Our code is publicly available at the link:
+ \href{https://github.com/zhaijianyang/KP4SR}{\textcolor{blue}{https://github.com/zhaijianyang/KP4SR}.}
+
+
+
+
+
+
+
+     ☆ HyperBandit: Contextual Bandit with Hypernetwork for Time-Varying User
+   Preferences in Streaming Recommendation
+
+
+ In real-world streaming recommender systems, user preferences often +dynamically change over time (e.g., a user may have different preferences +during weekdays and weekends). Existing bandit-based streaming recommendation +models only consider time as a timestamp, without explicitly modeling the +relationship between time variables and time-varying user preferences. This +leads to recommendation models that cannot quickly adapt to dynamic scenarios. +To address this issue, we propose a contextual bandit approach using +hypernetwork, called HyperBandit, which takes time features as input and +dynamically adjusts the recommendation model for time-varying user preferences. +Specifically, HyperBandit maintains a neural network capable of generating the +parameters for estimating time-varying rewards, taking into account the +correlation between time features and user preferences. Using the estimated +time-varying rewards, a bandit policy is employed to make online +recommendations by learning the latent item contexts. To meet the real-time +requirements in streaming recommendation scenarios, we have verified the +existence of a low-rank structure in the parameter matrix and utilize low-rank +factorization for efficient training. Theoretically, we demonstrate a sublinear +regret upper bound against the best policy. Extensive experiments on real-world +datasets show that the proposed HyperBandit consistently outperforms the +state-of-the-art baselines in terms of accumulated rewards. + +
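+ The core idea of a hypernetwork that maps time features to the parameters of a
+ reward estimator can be sketched as below (layer sizes and the linear reward
+ form are illustrative assumptions, not the paper's architecture):
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class TimeHypernet(nn.Module):
+     """Maps time features (e.g., hour-of-day / day-of-week encodings) to the
+     weights of a linear reward estimator over item context vectors."""
+
+     def __init__(self, time_dim, item_dim, hidden=32):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(time_dim, hidden), nn.ReLU(), nn.Linear(hidden, item_dim)
+         )
+
+     def forward(self, time_feat, item_contexts):
+         theta = self.net(time_feat)      # time-varying linear weights
+         return item_contexts @ theta     # estimated reward per candidate item
+
+ # Toy usage: score 10 candidate items given a 6-dim time feature vector.
+ model = TimeHypernet(time_dim=6, item_dim=16)
+ rewards = model(torch.randn(6), torch.randn(10, 16))
+ print(rewards.shape)  # torch.Size([10])
+ ```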
+
+
+
+
+ + ♻ ☆ NECE: Narrative Event Chain Extraction Toolkit + + +
+ To understand a narrative, it is essential to comprehend the temporal event
+ flows, especially those associated with main characters; however, this can be
+ challenging with lengthy and unstructured narrative texts. To address this, we
+ introduce NECE, an open-access, document-level toolkit that automatically
+ extracts and aligns narrative events in the temporal order of their occurrence.
+ Through extensive evaluations, we show the high quality of the NECE toolkit and
+ demonstrate its downstream application in analyzing narrative bias regarding
+ gender. We also openly discuss the shortcomings of the current approach and the
+ potential of leveraging generative models in future works. Lastly, the NECE
+ toolkit includes both a Python library and a user-friendly web interface, which
+ offer equal access to professionals and lay audiences alike, to visualize
+ event chains, obtain narrative flows, or study narrative bias.
+
+
+
+
+
+
+ + ♻ ☆ Deconfounded Causal Collaborative Filtering + + +
+ Recommender systems may be confounded by various types of confounding factors +(also called confounders) that may lead to inaccurate recommendations and +sacrificed recommendation performance. Current approaches to solving the +problem usually design each specific model for each specific confounder. +However, real-world systems may include a huge number of confounders and thus +designing each specific model for each specific confounder could be +unrealistic. More importantly, except for those ``explicit confounders'' that +experts can manually identify and process such as item's position in the +ranking list, there are also many ``latent confounders'' that are beyond the +imagination of experts. For example, users' rating on a song may depend on +their current mood or the current weather, and users' preference on ice creams +may depend on the air temperature. Such latent confounders may be unobservable +in the recorded training data. To solve the problem, we propose Deconfounded +Causal Collaborative Filtering (DCCF). We first frame user behaviors with +unobserved confounders into a causal graph, and then we design a front-door +adjustment model carefully fused with machine learning to deconfound the +influence of unobserved confounders. Experiments on real-world datasets show +that our method is able to deconfound unobserved confounders to achieve better +recommendation performance. + +
+
+ comment: Accepted by the ACM Transactions on Recommender Systems (TORS) +
+
+
+
+
+ + ♻ ☆ Causal Collaborative Filtering SIGIR + + +
+ Many of the traditional recommendation algorithms are designed based on the
+ fundamental idea of mining or learning correlative patterns from data to
+ estimate the user-item correlative preference. However, pure correlative
+ learning may lead to Simpson's paradox in predictions, and thus results in
+ sacrificed recommendation performance. Simpson's paradox is a well-known
+ statistical phenomenon that causes confusion in statistical conclusions;
+ ignoring the paradox may result in inaccurate decisions. Fortunately, causal
+ and counterfactual modeling can help us think beyond the observational
+ data for user modeling and personalization so as to tackle such issues. In this
+ paper, we propose Causal Collaborative Filtering (CCF) -- a general framework
+ for modeling causality in collaborative filtering and recommendation. We
+ provide a unified causal view of CF and mathematically show that many of the
+ traditional CF algorithms are actually special cases of CCF under simplified
+ causal graphs. We then propose a conditional intervention approach for
+ $do$-operations so that we can estimate the user-item causal preference based
+ on the observational data. Finally, we further propose a general counterfactual
+ constrained learning framework for estimating the user-item preferences.
+ Experiments are conducted on two types of real-world datasets -- traditional
+ and randomized trial data -- and results show that our framework can improve
+ the recommendation performance and reduce the Simpson's paradox problem of many
+ CF algorithms.
+
+
+
+ comment: Accepted by the 2023 ACM SIGIR International Conference on Theory of + Information Retrieval +
+
+
+
+
+ + ♻ ☆ Framework to Automatically Determine the Quality of Open Data Catalogs + + +
+ Data catalogs play a crucial role in modern data-driven organizations by
+ facilitating the discovery, understanding, and utilization of diverse data
+ assets. However, ensuring their quality and reliability is complex, especially
+ in open and large-scale data environments. This paper proposes a framework to
+ automatically determine the quality of open data catalogs, addressing the need
+ for efficient and reliable quality assessment mechanisms. Our framework can
+ analyze various core quality dimensions, such as accuracy, completeness,
+ consistency, scalability, and timeliness; it offers several alternatives for
+ assessing compatibility and similarity across such catalogs, as well as an
+ implementation of a set of non-core quality dimensions such as provenance,
+ readability, and licensing. The goal is to empower data-driven organizations to
+ make informed decisions based on trustworthy and well-curated data assets. The
+ source code that illustrates our approach can be downloaded from
+ https://www.github.com/jorge-martinez-gil/dataq/.
+
+
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ STUDY: Socially Aware Temporally Causal Decoder Recommender Systems + + +
+ Recommender systems are widely used to help people find items that are +tailored to their interests. These interests are often influenced by social +networks, making it important to use social network information effectively in +recommender systems. This is especially true for demographic groups with +interests that differ from the majority. This paper introduces STUDY, a +Socially-aware Temporally caUsal Decoder recommender sYstem. STUDY introduces a +new socially-aware recommender system architecture that is significantly more +efficient to learn and train than existing methods. STUDY performs joint +inference over socially connected groups in a single forward pass of a modified +transformer decoder network. We demonstrate the benefits of STUDY in the +recommendation of books for students who are dyslexic, or struggling readers. +Dyslexic students often have difficulty engaging with reading material, making +it critical to recommend books that are tailored to their interests. We worked +with our non-profit partner Learning Ally to evaluate STUDY on a dataset of +struggling readers. STUDY was able to generate recommendations that more +accurately predicted student engagement, when compared with existing methods. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Beyond Semantics: Learning a Behavior Augmented Relevance Model with + Self-supervised Learning CIKM2023 + + +
+ Relevance modeling aims to locate desirable items for corresponding queries, +which is crucial for search engines to ensure user experience. Although most +conventional approaches address this problem by assessing the semantic +similarity between the query and item, pure semantic matching is not +everything. In reality, auxiliary query-item interactions extracted from user +historical behavior data of the search log could provide hints to reveal users' +search intents further. Drawing inspiration from this, we devise a novel +Behavior Augmented Relevance Learning model for Alipay Search (BARL-ASe) that +leverages neighbor queries of target item and neighbor items of target query to +complement target query-item semantic matching. Specifically, our model builds +multi-level co-attention for distilling coarse-grained and fine-grained +semantic representations from both neighbor and target views. The model +subsequently employs neighbor-target self-supervised learning to improve the +accuracy and robustness of BARL-ASe by strengthening representation and logit +learning. Furthermore, we discuss how to deal with the long-tail query-item +matching of the mini apps search scenario of Alipay practically. Experiments on +real-world industry data and online A/B testing demonstrate our proposal +achieves promising performance with low latency. + +
+
+ comment: Accepted by CIKM2023 +
+
+
+
+
+ + ♻ ☆ Online Distillation-enhanced Multi-modal Transformer for Sequential + Recommendation ACM MM 2023 + + +
+ Multi-modal recommendation systems, which integrate diverse types of
+ information, have gained widespread attention in recent years. However,
+ compared to traditional collaborative filtering-based multi-modal
+ recommendation systems, research on multi-modal sequential recommendation is
+ still in its nascent stages. Unlike traditional sequential recommendation
+ models that solely rely on item identifier (ID) information and focus on
+ network structure design, multi-modal recommendation models need to emphasize
+ item representation learning and the fusion of heterogeneous data sources. This
+ paper investigates the impact of item representation learning on downstream
+ recommendation tasks and examines the disparities in information fusion at
+ different stages. Empirical experiments are conducted to demonstrate the need
+ to design a framework suitable for collaborative learning and fusion of diverse
+ information. Based on this, we propose a new model-agnostic framework for
+ multi-modal sequential recommendation tasks, called Online
+ Distillation-enhanced Multi-modal Transformer (ODMT), to enhance feature
+ interaction and mutual learning among multi-source input (ID, text, and image),
+ while avoiding conflicts among different features during training, thereby
+ improving recommendation accuracy. To be specific, we first introduce an
+ ID-aware Multi-modal Transformer module in the item representation learning
+ stage to facilitate information interaction among different features. Secondly,
+ we employ an online distillation training strategy in the prediction
+ optimization stage to make multi-source data learn from each other and improve
+ prediction robustness. Experimental results on a stream media recommendation
+ dataset and three e-commerce recommendation datasets demonstrate the
+ effectiveness of the two proposed modules, yielding an improvement of
+ approximately 10% in performance compared to baseline models.
+
+
+
+ comment: 11 pages, 7 figures, accepted in ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ KuaiSAR: A Unified Search And Recommendation Dataset CIKM 2023 + + +
+ The confluence of Search and Recommendation (S&R) services is vital to online
+ services, including e-commerce and video platforms. The integration of S&R
+ modeling is a highly intuitive approach adopted by industry practitioners.
+ However, there is a noticeable lack of research conducted in this area within
+ academia, primarily due to the absence of publicly available datasets.
+ Consequently, a substantial gap has emerged between academia and industry
+ regarding research endeavors in joint optimization using user behavior data
+ from both S&R services. To bridge this gap, we introduce the first large-scale,
+ real-world dataset KuaiSAR of integrated Search And Recommendation behaviors
+ collected from Kuaishou, a leading short-video app in China with over 350
+ million daily active users. Previous research in this field has predominantly
+ employed publicly available semi-synthetic and simulated datasets with
+ artificially fabricated search behaviors. Distinct from previous datasets,
+ KuaiSAR contains genuine user behaviors, including the occurrence of each
+ interaction within either the search or the recommendation service, and the
+ users' transitions between the two services. This work aids joint modeling of
+ S&R and the use of search data for recommender systems (and of recommendation
+ data for search engines). Furthermore, due to the various feedback labels
+ associated with user-video interactions, KuaiSAR also supports a broad range of
+ tasks, including intent recommendation, multi-task learning, and modeling of
+ long sequential multi-behavioral patterns. We believe this dataset will serve
+ as a catalyst for innovative research and bridge the gap between academia and
+ industry in understanding the S&R services in practical, real-world
+ applications.
+
+
+
+ comment: CIKM 2023 resource track +
+
+
+
+
+ + ♻ ☆ Multi-domain Recommendation with Embedding Disentangling and Domain + Alignment CIKM'23 + + +
+ Multi-domain recommendation (MDR) aims to provide recommendations for +different domains (e.g., types of products) with overlapping users/items and is +common for platforms such as Amazon, Facebook, and LinkedIn that host multiple +services. Existing MDR models face two challenges: First, it is difficult to +disentangle knowledge that generalizes across domains (e.g., a user likes cheap +items) and knowledge specific to a single domain (e.g., a user likes blue +clothing but not blue cars). Second, they have limited ability to transfer +knowledge across domains with small overlaps. We propose a new MDR method named +EDDA with two key components, i.e., embedding disentangling recommender and +domain alignment, to tackle the two challenges respectively. In particular, the +embedding disentangling recommender separates both the model and embedding for +the inter-domain part and the intra-domain part, while most existing MDR +methods only focus on model-level disentangling. The domain alignment leverages +random walks from graph processing to identify similar user/item pairs from +different domains and encourages similar user/item pairs to have similar +embeddings, enhancing knowledge transfer. We compare EDDA with 12 +state-of-the-art baselines on 3 real datasets. The results show that EDDA +consistently outperforms the baselines on all datasets and domains. All +datasets and codes are available at https://github.com/Stevenn9981/EDDA. + +
+
+ comment: Accepted by CIKM'23 as a Long paper +
+
+
+
+
+
+
+
+ + Machine Learning 122 + +
+
+
+ + ☆ DiffSED: Sound Event Detection with Denoising Diffusion + + +
+ Sound Event Detection (SED) aims to predict the temporal boundaries of all
+ the events of interest and their class labels, given an unconstrained audio
+ sample. Taking either the split-and-classify (i.e., frame-level) strategy or
+ the more principled event-level modeling approach, all existing methods
+ consider the SED problem from the discriminative learning perspective. In this
+ work, we reformulate the SED problem by taking a generative learning
+ perspective. Specifically, we aim to generate sound temporal boundaries from
+ noisy proposals in a denoising diffusion process, conditioned on a target audio
+ sample. During training, our model learns to reverse the noising process by
+ converting noisy latent queries to their ground-truth versions in the elegant
+ Transformer decoder framework. Doing so enables the model to generate accurate
+ event boundaries from even noisy queries during inference. Extensive
+ experiments on the Urban-SED and EPIC-Sounds datasets demonstrate that our
+ model significantly outperforms existing alternatives, with 40+% faster
+ convergence in training.
+
+
+
+
+
+
+ + ☆ The Devil is in the Errors: Leveraging Large Language Models for + Fine-grained Machine Translation Evaluation + + +
+ Automatic evaluation of machine translation (MT) is a critical tool driving +the rapid iterative development of MT systems. While considerable progress has +been made on estimating a single scalar quality score, current metrics lack the +informativeness of more detailed schemes that annotate individual errors, such +as Multidimensional Quality Metrics (MQM). In this paper, we help fill this gap +by proposing AutoMQM, a prompting technique which leverages the reasoning and +in-context learning capabilities of large language models (LLMs) and asks them +to identify and categorize errors in translations. We start by evaluating +recent LLMs, such as PaLM and PaLM-2, through simple score prediction +prompting, and we study the impact of labeled data through in-context learning +and finetuning. We then evaluate AutoMQM with PaLM-2 models, and we find that +it improves performance compared to just prompting for scores (with +particularly large gains for larger models) while providing interpretability +through error spans that align with human annotations. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Cross-Attribute Matrix Factorization Model with Shared User Embedding + + +
+ Over the past few years, deep learning has firmly established its prowess
+ across various domains, including computer vision, speech recognition, and
+ natural language processing. Motivated by its outstanding success, researchers
+ have been directing their efforts towards applying deep learning techniques to
+ recommender systems. Neural collaborative filtering (NCF) and Neural Matrix
+ Factorization (NeuMF) refresh the traditional inner product in matrix
+ factorization with a neural architecture capable of learning complex and
+ data-driven functions. While these models effectively capture user-item
+ interactions, they overlook the specific attributes of both users and items.
+ This can lead to robustness issues, especially for items and users that belong
+ to the "long tail". Such challenges are commonly recognized in recommender
+ systems as a part of the cold-start problem. A direct and intuitive approach to
+ address this issue is to leverage the features and attributes of the items
+ and users themselves. In this paper, we introduce a refined NeuMF model that
+ considers not only the interaction between users and items, but also the
+ interactions across associated attributes. Moreover, our proposed architecture
+ features a shared user embedding that integrates seamlessly with the individual
+ user embeddings to improve robustness and effectively address the cold-start
+ problem. Rigorous experiments on both the Movielens and Pinterest datasets
+ demonstrate the superiority of our Cross-Attribute Matrix Factorization model,
+ particularly in scenarios characterized by higher dataset sparsity.
+
+
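+ A rough sketch of the described idea, scoring user-item pairs with attribute
+ crosses and falling back to a shared embedding for cold users, could look like
+ the following (the dimensions, the set of crossed terms, and the cold-user
+ switch are assumptions for illustration, not the paper's exact architecture):
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class CrossAttributeMF(nn.Module):
+     """MF-style scorer that also crosses user/item attributes and uses a
+     shared embedding as a fallback for cold-start users."""
+
+     def __init__(self, n_users, n_items, n_user_attrs, n_item_attrs, dim=32):
+         super().__init__()
+         self.user_emb = nn.Embedding(n_users, dim)
+         self.item_emb = nn.Embedding(n_items, dim)
+         self.user_attr_emb = nn.Embedding(n_user_attrs, dim)
+         self.item_attr_emb = nn.Embedding(n_item_attrs, dim)
+         self.shared_user = nn.Parameter(torch.zeros(dim))  # shared cold-start embedding
+
+     def forward(self, user, item, user_attr, item_attr, is_cold_user):
+         u = torch.where(is_cold_user.unsqueeze(-1),
+                         self.shared_user.expand(user.shape[0], -1),
+                         self.user_emb(user))
+         i = self.item_emb(item)
+         ua, ia = self.user_attr_emb(user_attr), self.item_attr_emb(item_attr)
+         # user-item, user-itemAttr and userAttr-item interaction terms
+         return (u * i).sum(-1) + (u * ia).sum(-1) + (ua * i).sum(-1)
+
+ model = CrossAttributeMF(100, 200, 5, 7)
+ score = model(torch.tensor([3]), torch.tensor([10]), torch.tensor([1]),
+               torch.tensor([2]), torch.tensor([False]))
+ print(score)
+ ```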
+
+
+
+
+ + ☆ Data-Efficient Energy-Aware Participant Selection for UAV-Enabled + Federated Learning + + +
+ Unmanned aerial vehicle (UAV)-enabled edge federated learning (FL) has +sparked a rise in research interest as a result of the massive and +heterogeneous data collected by UAVs, as well as the privacy concerns related +to UAV data transmissions to edge servers. However, due to the redundancy of +UAV collected data, e.g., imaging data, and non-rigorous FL participant +selection, the convergence time of the FL learning process and bias of the FL +model may increase. Consequently, we investigate in this paper the problem of +selecting UAV participants for edge FL, aiming to improve the FL model's +accuracy, under UAV constraints of energy consumption, communication quality, +and local datasets' heterogeneity. We propose a novel UAV participant selection +scheme, called data-efficient energy-aware participant selection strategy +(DEEPS), which consists of selecting the best FL participant in each sub-region +based on the structural similarity index measure (SSIM) average score of its +local dataset and its power consumption profile. Through experiments, we +demonstrate that the proposed selection scheme is superior to the benchmark +random selection method, in terms of model accuracy, training time, and UAV +energy consumption. + +
+
+
+
+
+ + ☆ Dialogue for Prompting: a Policy-Gradient-Based Discrete Prompt + Optimization for Few-shot Learning + + +
+ The prompt-based paradigm for pre-trained language models (PLMs) has succeeded
+ substantially in few-shot natural language processing (NLP) tasks. However,
+ prior discrete prompt optimization methods require expert knowledge to design
+ the base prompt set and identify high-quality prompts, which is costly,
+ inefficient, and subjective. Meanwhile, existing continuous prompt optimization
+ methods improve performance by learning the ideal prompts through the
+ gradient information of PLMs, but their high computational cost and low
+ readability and generalizability are often concerning. To address the research
+ gap, we propose a Dialogue-comprised Policy-gradient-based Discrete Prompt
+ Optimization ($DP_2O$) method. We first design a multi-round dialogue alignment
+ strategy, based on GPT-4, for generating a readable prompt set. Furthermore, we
+ propose an efficient prompt screening metric to identify high-quality prompts
+ with linear complexity. Finally, we construct a reinforcement learning (RL)
+ framework based on policy gradients to match the prompts to inputs optimally.
+ By training a policy network with only 0.67% of the PLM parameter size on the
+ tasks in the few-shot setting, $DP_2O$ outperforms the state-of-the-art (SOTA)
+ method by 1.52% in accuracy on average on four open-source datasets. Moreover,
+ subsequent experiments also demonstrate that $DP_2O$ has good universality,
+ robustness, and generalization ability.
+
+
+
+
+
+
+ + ☆ EasyEdit: An Easy-to-use Knowledge Editing Framework for Large Language + Models + + +
+ Large Language Models (LLMs) usually suffer from knowledge cutoff or fallacy
+ issues, which means they are unaware of unseen events or generate text with
+ incorrect facts owing to outdated/noisy data. To this end, many knowledge
+ editing approaches for LLMs have emerged -- aiming to subtly inject/edit
+ updated knowledge or adjust undesired behavior while minimizing the impact on
+ unrelated inputs. Nevertheless, due to significant differences among various
+ knowledge editing methods and the variations in task setups, there is no
+ standard implementation framework available for the community, which hinders
+ practitioners from applying knowledge editing to applications. To address these
+ issues, we propose EasyEdit, an easy-to-use knowledge editing framework for
+ LLMs. It supports various cutting-edge knowledge editing approaches and can be
+ readily applied to many well-known LLMs such as T5, GPT-J, LlaMA, etc.
+ Empirically, we report the knowledge editing results on LlaMA-2 with EasyEdit,
+ demonstrating that knowledge editing surpasses traditional fine-tuning in terms
+ of reliability and generalization. We have released the source code on GitHub
+ at https://github.com/zjunlp/EasyEdit, along with Google Colab tutorials and
+ comprehensive documentation for beginners to get started. Besides, we present
+ an online system for real-time knowledge editing, and a demo video at
+ http://knowlm.zjukg.cn/easyedit.mp4.
+
+
+
+ comment: The project website is https://github.com/zjunlp/EasyEdit +
+
+
+
+
+ + ☆ LCE -- An Augmented Combination of Bagging and Boosting in Python + + +
+ lcensemble is a high-performing, scalable and user-friendly Python package +for the general tasks of classification and regression. The package implements +Local Cascade Ensemble (LCE), a machine learning method that further enhances +the prediction performance of the current state-of-the-art methods Random +Forest and XGBoost. LCE combines their strengths and adopts a complementary +diversification approach to obtain a better generalizing predictor. The package +is compatible with scikit-learn, therefore it can interact with scikit-learn +pipelines and model selection tools. It is distributed under the Apache 2.0 +license, and its source code is available at +https://github.com/LocalCascadeEnsemble/LCE. + +
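+ Because the package advertises scikit-learn compatibility, usage should follow
+ the standard estimator API; note that the import path and class name below are
+ assumptions based on the package description rather than verified
+ documentation:
+
+ ```python
+ # Assumed import path and estimator name; the package is scikit-learn
+ # compatible, so fit/predict and cross_val_score apply either way.
+ from lce import LCEClassifier
+ from sklearn.datasets import load_iris
+ from sklearn.model_selection import cross_val_score
+
+ X, y = load_iris(return_X_y=True)
+ clf = LCEClassifier()                       # default hyperparameters
+ print(cross_val_score(clf, X, y, cv=3).mean())
+ ```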
+
+
+
+
+ + ☆ Can we Agree? On the Rashōmon Effect and the Reliability of Post-Hoc + Explainable AI + + +
+ The Rash\=omon effect poses challenges for deriving reliable knowledge from +machine learning models. This study examined the influence of sample size on +explanations from models in a Rash\=omon set using SHAP. Experiments on 5 +public datasets showed that explanations gradually converged as the sample size +increased. Explanations from <128 samples exhibited high variability, limiting +reliable knowledge extraction. However, agreement between models improved with +more data, allowing for consensus. Bagging ensembles often had higher +agreement. The results provide guidance on sufficient data to trust +explanations. Variability at low samples suggests that conclusions may be +unreliable without validation. Further work is needed with more model types, +data domains, and explanation methods. Testing convergence in neural networks +and with model-specific explanation methods would be impactful. The approaches +explored here point towards principled techniques for eliciting knowledge from +ambiguous models. + +
+
+ comment: 13 pages, 6 figures and 6 tables +
+
+
+
+
+ + ☆ A Unifying Generator Loss Function for Generative Adversarial Networks + + +
+ A unifying $\alpha$-parametrized generator loss function is introduced for a +dual-objective generative adversarial network (GAN), which uses a canonical (or +classical) discriminator loss function such as the one in the original GAN +(VanillaGAN) system. The generator loss function is based on a symmetric class +probability estimation type function, $\mathcal{L}_\alpha$, and the resulting +GAN system is termed $\mathcal{L}_\alpha$-GAN. Under an optimal discriminator, +it is shown that the generator's optimization problem consists of minimizing a +Jensen-$f_\alpha$-divergence, a natural generalization of the Jensen-Shannon +divergence, where $f_\alpha$ is a convex function expressed in terms of the +loss function $\mathcal{L}_\alpha$. It is also demonstrated that this +$\mathcal{L}_\alpha$-GAN problem recovers as special cases a number of GAN +problems in the literature, including VanillaGAN, Least Squares GAN (LSGAN), +Least $k$th order GAN (L$k$GAN) and the recently introduced +$(\alpha_D,\alpha_G)$-GAN with $\alpha_D=1$. Finally, experimental results are +conducted on three datasets, MNIST, CIFAR-10, and Stacked MNIST to illustrate +the performance of various examples of the $\mathcal{L}_\alpha$-GAN system. + +
+
+ comment: 31 pages, 4 figures, 12 tables +
+
+
+
+
+ + ☆ Distance Matters For Improving Performance Estimation Under Covariate + Shift ICCV + + +
+ Performance estimation under covariate shift is a crucial component of safe +AI model deployment, especially for sensitive use-cases. Recently, several +solutions were proposed to tackle this problem, most leveraging model +predictions or softmax confidence to derive accuracy estimates. However, under +dataset shifts, confidence scores may become ill-calibrated if samples are too +far from the training distribution. In this work, we show that taking into +account distances of test samples to their expected training distribution can +significantly improve performance estimation under covariate shift. Precisely, +we introduce a "distance-check" to flag samples that lie too far from the +expected distribution, to avoid relying on their untrustworthy model outputs in +the accuracy estimation step. We demonstrate the effectiveness of this method +on 13 image classification tasks, across a wide-range of natural and synthetic +distribution shifts and hundreds of models, with a median relative MAE +improvement of 27% over the best baseline across all tasks, and SOTA +performance on 10 out of 13 tasks. Our code is publicly available at +https://github.com/melanibe/distance_matters_performance_estimation. + +
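+ The "distance-check" idea can be sketched in a few lines (a simplified
+ illustration, not the paper's exact procedure; the nearest-neighbor distance
+ and the quantile threshold are assumptions):
+
+ ```python
+ import numpy as np
+ from sklearn.neighbors import NearestNeighbors
+
+ def distance_checked_accuracy_estimate(train_feats, test_feats, test_conf,
+                                        quantile=0.99):
+     """Average-confidence accuracy estimate that ignores test samples lying
+     too far from the training feature distribution."""
+     nn = NearestNeighbors(n_neighbors=1).fit(train_feats)
+     train_d, _ = nn.kneighbors(train_feats, n_neighbors=2)  # col 0 is self-distance
+     threshold = np.quantile(train_d[:, 1], quantile)
+     test_d, _ = nn.kneighbors(test_feats, n_neighbors=1)
+     in_dist = test_d[:, 0] <= threshold
+     # return the estimate on trusted samples and the fraction that was flagged
+     return test_conf[in_dist].mean(), (~in_dist).mean()
+
+ # est_acc, flagged_frac = distance_checked_accuracy_estimate(F_train, F_test, conf_test)
+ ```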
+
+ comment: Accepted to ICCV Workshop on Uncertainty Quantification for Computer + Vision 2023 +
+
+
+
+
+ + ☆ AudioFormer: Audio Transformer learns audio feature representations from + discrete acoustic codes + + +
+ We propose a method named AudioFormer, which learns audio feature +representations through the acquisition of discrete acoustic codes and +subsequently fine-tunes them for audio classification tasks. Initially, we +introduce a novel perspective by considering the audio classification task as a +form of natural language understanding (NLU). Leveraging an existing neural +audio codec model, we generate discrete acoustic codes and utilize them to +train a masked language model (MLM), thereby obtaining audio feature +representations. Furthermore, we pioneer the integration of a +\textbf{M}ulti-\textbf{P}ositive sample \textbf{C}ontrastive (MPC) learning +approach. This method enables the learning of joint representations among +multiple discrete acoustic codes within the same audio input. In our +experiments, we treat discrete acoustic codes as textual data and train a +masked language model using a cloze-like methodology, ultimately deriving +high-quality audio representations. Notably, the MPC learning technique +effectively captures collaborative representations among distinct positive +samples. Our research outcomes demonstrate that AudioFormer attains +significantly improved performance compared to prevailing monomodal audio +classification models across multiple datasets, and even outperforms +audio-visual multimodal classification models on select datasets. Specifically, +our approach achieves remarkable results on datasets including AudioSet (2M, +20K), and FSD50K, with performance scores of 53.9, 45.1, and 65.6, +respectively. We have openly shared both the code and models: +\url{https://github.com/LZH-0225/AudioFormer.git}. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Automated Ensemble-Based Segmentation of Pediatric Brain Tumors: A Novel + Approach Using the CBTN-CONNECT-ASNR-MICCAI BraTS-PEDs 2023 Challenge Data + + +
+ Brain tumors remain a critical global health challenge, necessitating
+ advancements in diagnostic techniques and treatment methodologies. In response
+ to the growing need for age-specific segmentation models, particularly for
+ pediatric patients, this study explores the deployment of deep learning
+ techniques using magnetic resonance imaging (MRI) modalities. By introducing a
+ novel ensemble approach using ONet and modified versions of UNet, coupled with
+ innovative loss functions, this study achieves a precise segmentation model for
+ the BraTS-PEDs 2023 Challenge. Data augmentation, including both single and
+ composite transformations, ensures model robustness and accuracy across
+ different scanning protocols. The ensemble strategy, integrating the ONet and
+ UNet models, shows greater effectiveness in capturing specific features and
+ modeling diverse aspects of the MRI images, resulting in lesion-wise Dice
+ scores of 0.52, 0.72 and 0.78 for the enhancing tumor, tumor core and whole
+ tumor labels, respectively. Visual comparisons further confirm the superiority
+ of the ensemble method in accurate tumor region coverage. The results indicate
+ that this advanced ensemble approach, building upon the unique strengths of
+ individual models, offers promising prospects for enhanced diagnostic accuracy
+ and effective treatment planning for pediatric brain tumors.
+
+
+
+ comment: 3 Figs, 3 Tables +
+
+
+
+
+ + ☆ Unified Data-Free Compression: Pruning and Quantization without + Fine-Tuning ICCV2023 + + +
+ Structured pruning and quantization are promising approaches for reducing the
+ inference time and memory footprint of neural networks. However, most existing
+ methods require the original training dataset to fine-tune the model. This not
+ only incurs heavy resource consumption but is also impossible for
+ applications with sensitive or proprietary data due to privacy and security
+ concerns. Therefore, a few data-free methods have been proposed to address this
+ problem, but they perform data-free pruning and quantization separately, which
+ does not explore the complementarity of pruning and quantization. In this
+ paper, we propose a novel framework named Unified Data-Free Compression (UDFC),
+ which performs pruning and quantization simultaneously without any data or
+ fine-tuning process. Specifically, UDFC starts with the assumption that the
+ partial information of a damaged (e.g., pruned or quantized) channel can be
+ preserved by a linear combination of other channels, and then derives the
+ reconstruction form from the assumption to restore the information loss due to
+ compression. Finally, we formulate the reconstruction error between the
+ original network and its compressed network, and theoretically deduce the
+ closed-form solution. We evaluate UDFC on the large-scale image
+ classification task and obtain significant improvements over various network
+ architectures and compression methods. For example, we achieve a 20.54%
+ accuracy improvement on the ImageNet dataset compared to the SOTA method with
+ 30% pruning ratio and 6-bit quantization on ResNet-34.
+
+
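+ The underlying assumption, that a damaged channel's information can be
+ preserved by a linear combination of other channels, can be illustrated with a
+ toy least-squares version (ignoring nonlinearities and quantization; this is an
+ illustration of the idea, not the paper's closed-form solution):
+
+ ```python
+ import numpy as np
+
+ def prune_with_compensation(W_prev, W_next, pruned_idx):
+     """Toy, data-free channel-pruning compensation.
+
+     W_prev: (channels, fan_in)  filters producing each channel
+     W_next: (out, channels)     weights consuming those channels
+     The pruned channel's filter is approximated as a linear combination of the
+     remaining filters; the coefficients are folded into W_next so the pruned
+     channel's contribution is (approximately) preserved, assuming linearity.
+     """
+     kept = [i for i in range(W_prev.shape[0]) if i != pruned_idx]
+     A = W_prev[kept].T                               # (fan_in, channels - 1)
+     b = W_prev[pruned_idx]                           # (fan_in,)
+     coeffs, *_ = np.linalg.lstsq(A, b, rcond=None)   # b ≈ A @ coeffs
+     W_next_new = W_next[:, kept] + np.outer(W_next[:, pruned_idx], coeffs)
+     return W_prev[kept], W_next_new
+
+ # Toy usage with random weights standing in for a trained network.
+ rng = np.random.default_rng(0)
+ prev, nxt = prune_with_compensation(rng.normal(size=(8, 16)),
+                                     rng.normal(size=(4, 8)), pruned_idx=3)
+ print(prev.shape, nxt.shape)  # (7, 16) (4, 7)
+ ```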
+
+ comment: ICCV2023 +
+
+
+
+
+ + ☆ Algorithms for the Training of Neural Support Vector Machines + + +
+ Neural support vector machines (NSVMs) allow for the incorporation of domain +knowledge in the design of the model architecture. In this article we introduce +a set of training algorithms for NSVMs that leverage the Pegasos algorithm and +provide a proof of concept by solving a set of standard machine learning tasks. + +
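+ For readers unfamiliar with Pegasos, it is a stochastic sub-gradient solver for
+ the SVM objective; a compact linear-SVM sketch follows (the NSVM-specific
+ neural feature map is omitted, and the toy data are illustrative):
+
+ ```python
+ import numpy as np
+
+ def pegasos(X, y, lam=0.01, epochs=20, seed=0):
+     """Pegasos: stochastic sub-gradient descent for the hinge-loss SVM.
+
+     X: (n, d) features, y: (n,) labels in {-1, +1}. Returns weight vector w.
+     """
+     rng = np.random.default_rng(seed)
+     n, d = X.shape
+     w = np.zeros(d)
+     t = 0
+     for _ in range(epochs):
+         for i in rng.permutation(n):
+             t += 1
+             eta = 1.0 / (lam * t)
+             if y[i] * (w @ X[i]) < 1:                 # margin violated
+                 w = (1 - eta * lam) * w + eta * y[i] * X[i]
+             else:
+                 w = (1 - eta * lam) * w
+     return w
+
+ # Toy usage: two Gaussian blobs labeled -1 / +1.
+ rng = np.random.default_rng(1)
+ y = np.where(rng.random(200) < 0.5, -1, 1)
+ X = rng.normal(size=(200, 2)) + 1.5 * y[:, None]
+ w = pegasos(X, y)
+ print((np.sign(X @ w) == y).mean())
+ ```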
+
+ comment: 19 pages, 0 figures +
+
+
+
+
+ + ☆ Neural Categorical Priors for Physics-Based Character Control + + +
+ Recent advances in learning reusable motion priors have demonstrated their +effectiveness in generating naturalistic behaviors. In this paper, we propose a +new learning framework in this paradigm for controlling physics-based +characters with significantly improved motion quality and diversity over +existing state-of-the-art methods. The proposed method uses reinforcement +learning (RL) to initially track and imitate life-like movements from +unstructured motion clips using the discrete information bottleneck, as adopted +in the Vector Quantized Variational AutoEncoder (VQ-VAE). This structure +compresses the most relevant information from the motion clips into a compact +yet informative latent space, i.e., a discrete space over vector quantized +codes. By sampling codes in the space from a trained categorical prior +distribution, high-quality life-like behaviors can be generated, similar to the +usage of VQ-VAE in computer vision. Although this prior distribution can be +trained with the supervision of the encoder's output, it follows the original +motion clip distribution in the dataset and could lead to imbalanced behaviors +in our setting. To address the issue, we further propose a technique named +prior shifting to adjust the prior distribution using curiosity-driven RL. The +outcome distribution is demonstrated to offer sufficient behavioral diversity +and significantly facilitates upper-level policy learning for downstream tasks. +We conduct comprehensive experiments using humanoid characters on two +challenging downstream tasks, sword-shield striking and two-player boxing game. +Our results demonstrate that the proposed framework is capable of controlling +the character to perform considerably high-quality movements in terms of +behavioral strategies, diversity, and realism. Videos, codes, and data are +available at https://tencent-roboticsx.github.io/NCP/. + +
+
+
+
+
+ + ☆ Explaining Black-Box Models through Counterfactuals + + +
+ We present CounterfactualExplanations.jl: a package for generating +Counterfactual Explanations (CE) and Algorithmic Recourse (AR) for black-box +models in Julia. CE explain how inputs into a model need to change to yield +specific model predictions. Explanations that involve realistic and actionable +changes can be used to provide AR: a set of proposed actions for individuals to +change an undesirable outcome for the better. In this article, we discuss the +usefulness of CE for Explainable Artificial Intelligence and demonstrate the +functionality of our package. The package is straightforward to use and +designed with a focus on customization and extensibility. We envision it to one +day be the go-to place for explaining arbitrary predictive models in Julia +through a diverse suite of counterfactual generators. + +
+
+ comment: 13 pages, 9 figures, originally published in The Proceedings of the + JuliaCon Conferences (JCON) +
+
+
+
+
+ + ☆ gSASRec: Reducing Overconfidence in Sequential Recommendation Trained + with Negative Sampling RecSys 2023 + + +
+ A large catalogue size is one of the central challenges in training
+ recommendation models: a large number of items makes it memory- and
+ computationally inefficient to compute scores for all items during training,
+ forcing these models to deploy negative sampling. However, negative sampling
+ increases the proportion of positive interactions in the training data, and
+ therefore models trained with negative sampling tend to overestimate the
+ probabilities of positive interactions, a phenomenon we call overconfidence.
+ While the absolute values of the predicted scores or probabilities are not
+ important for the ranking of retrieved recommendations, overconfident models
+ may fail to estimate nuanced differences in the top-ranked items, resulting in
+ degraded performance. In this paper, we show that overconfidence explains why
+ the popular SASRec model underperforms when compared to BERT4Rec. This is
+ contrary to the BERT4Rec authors' explanation that the difference in
+ performance is due to the bi-directional attention mechanism. To mitigate
+ overconfidence, we propose a novel Generalised Binary Cross-Entropy Loss
+ function (gBCE) and theoretically prove that it can mitigate overconfidence. We
+ further propose the gSASRec model, an improvement over SASRec that deploys an
+ increased number of negatives and the gBCE loss. We show through detailed
+ experiments on three datasets that gSASRec does not exhibit the overconfidence
+ problem. As a result, gSASRec can outperform BERT4Rec (e.g. +9.47% NDCG on the
+ MovieLens-1M dataset), while requiring less training time (e.g. -73% training
+ time on MovieLens-1M). Moreover, in contrast to BERT4Rec, gSASRec is suitable
+ for large datasets that contain more than 1 million items.
+
+
+
+ comment: Accepted at ACM RecSys 2023 +
+
+
+
+
+ + ☆ Efficient Learning of Quantum States Prepared With Few Non-Clifford + Gates II: Single-Copy Measurements + + +
+ Recent work has shown that $n$-qubit quantum states output by circuits with +at most $t$ single-qubit non-Clifford gates can be learned to trace distance +$\epsilon$ using $\mathsf{poly}(n,2^t,1/\epsilon)$ time and samples. All prior +algorithms achieving this runtime use entangled measurements across two copies +of the input state. In this work, we give a similarly efficient algorithm that +learns the same class of states using only single-copy measurements. + +
+
+ comment: 22 pages. arXiv admin note: text overlap with arXiv:2305.13409 +
+
+
+
+
+ + ☆ PitchNet: A Fully Convolutional Neural Network for Pitch Estimation + + +
+ In the domain of music and sound processing, pitch extraction plays a pivotal +role. This research introduces "PitchNet", a convolutional neural network +tailored for pitch extraction from the human singing voice, including acapella +performances. Integrating autocorrelation with deep learning techniques, +PitchNet aims to optimize the accuracy of pitch detection. Evaluation across +datasets comprising synthetic sounds, opera recordings, and time-stretched +vowels demonstrates its efficacy. This work paves the way for enhanced pitch +extraction in both music and voice settings. + +
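+ The classical autocorrelation building block that PitchNet integrates with a
+ CNN can be sketched as follows (frame length and search range are illustrative
+ choices; this is not the PitchNet model itself):
+
+ ```python
+ import numpy as np
+
+ def autocorr_pitch(frame, sr, fmin=65.0, fmax=1000.0):
+     """Estimate f0 of a mono audio frame from the autocorrelation peak."""
+     frame = frame - frame.mean()
+     ac = np.correlate(frame, frame, mode="full")[len(frame) - 1:]
+     lag_min, lag_max = int(sr / fmax), int(sr / fmin)
+     lag = lag_min + np.argmax(ac[lag_min:lag_max])
+     return sr / lag
+
+ # Toy check on a synthetic 220 Hz sine sampled at 16 kHz.
+ sr = 16000
+ t = np.arange(2048) / sr
+ print(autocorr_pitch(np.sin(2 * np.pi * 220 * t), sr))  # ≈ 220 Hz
+ ```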
+
+
+
+
+ + ☆ Pairing interacting protein sequences using masked language modeling + + +
+ Predicting which proteins interact together from amino-acid sequences is an +important task. We develop a method to pair interacting protein sequences which +leverages the power of protein language models trained on multiple sequence +alignments, such as MSA Transformer and the EvoFormer module of AlphaFold. We +formulate the problem of pairing interacting partners among the paralogs of two +protein families in a differentiable way. We introduce a method called DiffPALM +that solves it by exploiting the ability of MSA Transformer to fill in masked +amino acids in multiple sequence alignments using the surrounding context. MSA +Transformer encodes coevolution between functionally or structurally coupled +amino acids. We show that it captures inter-chain coevolution, while it was +trained on single-chain data, which means that it can be used +out-of-distribution. Relying on MSA Transformer without fine-tuning, DiffPALM +outperforms existing coevolution-based pairing methods on difficult benchmarks +of shallow multiple sequence alignments extracted from ubiquitous prokaryotic +protein datasets. It also outperforms an alternative method based on a +state-of-the-art protein language model trained on single sequences. Paired +alignments of interacting protein sequences are a crucial ingredient of +supervised deep learning methods to predict the three-dimensional structure of +protein complexes. DiffPALM substantially improves the structure prediction of +some eukaryotic protein complexes by AlphaFold-Multimer, without significantly +deteriorating any of those we tested. It also achieves competitive performance +with using orthology-based pairing. + +
+
+ comment: 33 pages, 14 figures, 2 tables +
+
+
+
+
+ + ☆ Natural Language is All a Graph Needs + + +
+ The emergence of large-scale pre-trained language models, such as ChatGPT,
+ has revolutionized various research fields in artificial intelligence.
+ Transformer-based large language models (LLMs) have gradually replaced CNNs
+ and RNNs to unify the fields of computer vision and natural language
+ processing. Compared with data that exist relatively independently, such as
+ images, videos, or text, graphs are a type of data that contains rich
+ structural and relational information. Meanwhile, natural language, as one of
+ the most expressive mediums, excels in describing complex structures. However,
+ existing work on incorporating graph learning problems into the generative
+ language modeling framework remains very limited. As the importance of language
+ models continues to grow, it becomes essential to explore whether LLMs can also
+ replace GNNs as the foundational model for graphs. In this paper, we propose
+ InstructGLM (Instruction-finetuned Graph Language Model), systematically design
+ highly scalable prompts based on natural language instructions, and use natural
+ language to describe the geometric structure and node features of the graph for
+ instruction-tuning an LLM to perform learning and inference on graphs in a
+ generative manner. Our method exceeds all competitive GNN baselines on the
+ ogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of
+ our method and sheds light on generative language models replacing GNNs as the
+ foundation model for graph machine learning.
+
+
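+ The general recipe of describing a node's neighborhood in natural language for
+ instruction tuning can be illustrated as below (the template wording is an
+ assumption, not InstructGLM's actual prompt design):
+
+ ```python
+ def node_to_instruction(node, features, edges, task="node classification"):
+     """Render a node and its 1-hop neighborhood as a natural-language prompt."""
+     neighbors = sorted({v for u, v in edges if u == node} |
+                        {u for u, v in edges if v == node})
+     neighbor_txt = ", ".join(f"node {n} (features: {features[n]})" for n in neighbors)
+     return (f"You are given a graph. Node {node} has features: {features[node]}. "
+             f"It is connected to {neighbor_txt}. "
+             f"Task: {task}. Which category does node {node} belong to?")
+
+ features = {0: "deep learning, citation", 1: "graph neural networks", 2: "databases"}
+ edges = [(0, 1), (2, 0)]
+ print(node_to_instruction(0, features, edges))
+ ```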
+
+ comment: 21 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ A Time-aware tensor decomposition for tracking evolving patterns + + +
+ Time-evolving data sets can often be arranged as a higher-order tensor with +one of the modes being the time mode. While tensor factorizations have been +successfully used to capture the underlying patterns in such higher-order data +sets, the temporal aspect is often ignored, allowing for the reordering of time +points. In recent studies, temporal regularizers are incorporated in the time +mode to tackle this issue. Nevertheless, existing approaches still do not allow +underlying patterns to change in time (e.g., spatial changes in the brain, +contextual changes in topics). In this paper, we propose temporal PARAFAC2 +(tPARAFAC2): a PARAFAC2-based tensor factorization method with temporal +regularization to extract gradually evolving patterns from temporal data. +Through extensive experiments on synthetic data, we demonstrate that tPARAFAC2 +can capture the underlying evolving patterns accurately performing better than +PARAFAC2 and coupled matrix factorization with temporal smoothness +regularization. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Active Bird2Vec: Towards End-to-End Bird Sound Monitoring with + Transformers ECAI2023 + + +
+ We propose a shift towards end-to-end learning in bird sound monitoring by +combining self-supervised (SSL) and deep active learning (DAL). Leveraging +transformer models, we aim to bypass traditional spectrogram conversions, +enabling direct raw audio processing. ActiveBird2Vec is set to generate +high-quality bird sound representations through SSL, potentially accelerating +the assessment of environmental changes and decision-making processes for wind +farms. Additionally, we seek to utilize the wide variety of bird vocalizations +through DAL, reducing the reliance on extensively labeled datasets by human +experts. We plan to curate a comprehensive set of tasks through Huggingface +Datasets, enhancing future comparability and reproducibility of bioacoustic +research. A comparative analysis between various transformer models will be +conducted to evaluate their proficiency in bird sound recognition tasks. We aim +to accelerate the progression of avian bioacoustic research and contribute to +more effective conservation strategies. + +
+
+ comment: Accepted @AI4S ECAI2023. This is the author's version of the work +
+
+
+
+
+ + ☆ Neural radiance fields in the industrial and robotics domain: + applications, research opportunities and use cases + + +
+ The proliferation of technologies, such as extended reality (XR), has +increased the demand for high-quality three-dimensional (3D) graphical +representations. Industrial 3D applications encompass computer-aided design +(CAD), finite element analysis (FEA), scanning, and robotics. However, current +methods employed for industrial 3D representations suffer from high +implementation costs and reliance on manual human input for accurate 3D +modeling. To address these challenges, neural radiance fields (NeRFs) have +emerged as a promising approach for learning 3D scene representations based on +provided training 2D images. Despite a growing interest in NeRFs, their +potential applications in various industrial subdomains are still unexplored. +In this paper, we deliver a comprehensive examination of NeRF industrial +applications while also providing direction for future research endeavors. We +also present a series of proof-of-concept experiments that demonstrate the +potential of NeRFs in the industrial domain. These experiments include +NeRF-based video compression techniques and using NeRFs for 3D motion +estimation in the context of collision avoidance. In the video compression +experiment, our results show compression savings up to 48\% and 74\% for +resolutions of 1920x1080 and 300x168, respectively. The motion estimation +experiment used a 3D animation of a robotic arm to train Dynamic-NeRF (D-NeRF) +and achieved an average disparity map PSNR of 23 dB and an SSIM of 0.97. The +code for our experiments is publicly available at +https://github.com/Maftej/iisnerf . + +
+
+
+
+
+ + ☆ iSTFTNet2: Faster and More Lightweight iSTFT-Based Neural Vocoder Using + 1D-2D CNN + + +
+ The inverse short-time Fourier transform network (iSTFTNet) has garnered
+attention owing to its fast, lightweight, and high-fidelity speech synthesis.
+It obtains these characteristics using a fast and lightweight 1D CNN as the
+backbone and replacing some neural processes with iSTFT. Because it is
+difficult for a 1D CNN to model high-dimensional spectrograms, the frequency
+dimension is reduced via temporal upsampling. However, this strategy
+compromises the potential to enhance the speed. Therefore, we propose
+iSTFTNet2, an improved variant of iSTFTNet with a 1D-2D CNN that employs 1D and
+2D CNNs to model temporal and spectrogram structures, respectively. We designed
+a 2D CNN that performs frequency upsampling after conversion in a few-frequency
+space. This design facilitates the modeling of high-dimensional spectrograms
+without compromising the speed. The results demonstrated that iSTFTNet2 made
+iSTFTNet faster and more lightweight with comparable speech quality. Audio
+samples are available at
+https://www.kecl.ntt.co.jp/people/kaneko.takuhiro/projects/istftnet2/.
+
+
+
+ comment: Accepted to Interspeech 2023. Project page: + https://www.kecl.ntt.co.jp/people/kaneko.takuhiro/projects/istftnet2/ +
+
+
+
+
+ + ☆ #InsTag: Instruction Tagging for Diversity and Complexity Analysis + + +
+ Foundation language models obtain the instruction-following ability through +supervised fine-tuning (SFT). Diversity and complexity are considered critical +factors of a successful SFT dataset, while their definitions remain obscure and +lack quantitative analyses. In this work, we propose InsTag, an open-set +fine-grained tagger, to tag samples within SFT datasets based on semantics and +intentions and define instruction diversity and complexity regarding tags. We +obtain 6.6K tags to describe comprehensive user queries. Then we analyze +popular open-sourced SFT datasets and find that the model ability grows with +more diverse and complex data. Based on this observation, we propose a data +selector based on InsTag to select 6K diverse and complex samples from +open-source datasets and fine-tune models on InsTag-selected data. The +resulting models, TagLM, outperform open-source models based on considerably +larger SFT data evaluated by MT-Bench, echoing the importance of query +diversity and complexity. We open-source InsTag in +https://github.com/OFA-Sys/InsTag. + +
+
+
+
+
+ + ☆ Machine Unlearning: Solutions and Challenges + + +
+ Machine learning models may inadvertently memorize sensitive, unauthorized, +or malicious data, posing risks of privacy violations, security breaches, and +performance deterioration. To address these issues, machine unlearning has +emerged as a critical technique to selectively remove specific training data +points' influence on trained models. This paper provides a comprehensive +taxonomy and analysis of machine unlearning research. We categorize existing +research into exact unlearning that algorithmically removes data influence +entirely and approximate unlearning that efficiently minimizes influence +through limited parameter updates. By reviewing the state-of-the-art solutions, +we critically discuss their advantages and limitations. Furthermore, we propose +future directions to advance machine unlearning and establish it as an +essential capability for trustworthy and adaptive machine learning. This paper +provides researchers with a roadmap of open problems, encouraging impactful +contributions to address real-world needs for selective data removal. + +
+
+
+
+
+ + ☆ Diagnosis of Scalp Disorders using Machine Learning and Deep Learning + Approach -- A Review + + +
+ The morbidity of scalp diseases is minuscule compared to other diseases, but
+the impact on the patient's life is enormous. It is common for people to
+experience scalp problems that include Dandruff, Psoriasis, Tinea-Capitis,
+Alopecia and Atopic-Dermatitis. According to WHO research, approximately 70% of
+adults have problems with their scalp. Descriptive research has demonstrated
+that hair quality is impaired by an impaired scalp, but these impacts are
+reversible with early diagnosis and treatment. Deep Learning advances have
+demonstrated the effectiveness of CNNs paired with FCNs in diagnosing scalp and
+skin disorders. In one proposed Deep-Learning-based scalp inspection and
+diagnosis system, an imaging microscope and a trained model are combined with
+an app that classifies scalp disorders accurately with an average precision of
+97.41%-99.09%. Another study dealt with classifying Psoriasis using a CNN with
+an accuracy of 82.9%. In another study, an ML-based algorithm accurately
+classified healthy scalp and alopecia areata with 91.4% and 88.9% accuracy
+using SVM and KNN algorithms, respectively. Using deep learning models to
+diagnose scalp-related diseases has improved due to advancements in computation
+capabilities and computer vision, but there remains a wide horizon for further
+improvements.
+
+
+
+
+
+
+ + ☆ Fourier neural operator for learning solutions to macroscopic traffic + flow models: Application to the forward and inverse problems + + +
+ Deep learning methods are emerging as popular computational tools for solving +forward and inverse problems in traffic flow. In this paper, we study a neural +operator framework for learning solutions to nonlinear hyperbolic partial +differential equations with applications in macroscopic traffic flow models. In +this framework, an operator is trained to map heterogeneous and sparse traffic +input data to the complete macroscopic traffic state in a supervised learning +setting. We chose a physics-informed Fourier neural operator ($\pi$-FNO) as the +operator, where an additional physics loss based on a discrete conservation law +regularizes the problem during training to improve the shock predictions. We +also propose to use training data generated from random piecewise constant +input data to systematically capture the shock and rarefied solutions. From +experiments using the LWR traffic flow model, we found superior accuracy in +predicting the density dynamics of a ring-road network and urban signalized +road. We also found that the operator can be trained using simple traffic +density dynamics, e.g., consisting of $2-3$ vehicle queues and $1-2$ traffic +signal cycles, and it can predict density dynamics for heterogeneous vehicle +queue distributions and multiple traffic signal cycles $(\geq 2)$ with an +acceptable error. The extrapolation error grew sub-linearly with input +complexity for a proper choice of the model architecture and training data. +Adding a physics regularizer aided in learning long-term traffic density +dynamics, especially for problems with periodic boundary data. + +
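+ A hedged sketch of how a discrete conservation-law penalty of this kind might look for a
+predicted density field (illustrative only; the Greenshields flux and the grid handling are
+assumptions, not the paper's exact physics loss):
+
+import torch
+
+def greenshields_flux(rho, v_max=1.0, rho_max=1.0):
+    # Classic LWR flux q(rho) = v_max * rho * (1 - rho / rho_max).
+    return v_max * rho * (1.0 - rho / rho_max)
+
+def conservation_residual(rho, dx, dt, flux=greenshields_flux):
+    # rho: (T, X) predicted density on a space-time grid. The residual of
+    # d(rho)/dt + d(q(rho))/dx = 0 is computed with forward differences and
+    # its mean square is added to the data loss as a physics regularizer.
+    drho_dt = (rho[1:, :-1] - rho[:-1, :-1]) / dt
+    dq_dx = (flux(rho[:-1, 1:]) - flux(rho[:-1, :-1])) / dx
+    return (drho_dt + dq_dx).pow(2).mean()
+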
+
+
+
+
+ + ☆ UIPC-MF: User-Item Prototype Connection Matrix Factorization for + Explainable Collaborative Filtering + + +
+ Recommending items to potentially interested users has been an important +commercial task that faces two main challenges: accuracy and explainability. +While most collaborative filtering models rely on statistical computations on a +large scale of interaction data between users and items and can achieve high +performance, they often lack clear explanatory power. We propose UIPC-MF, a +prototype-based matrix factorization method for explainable collaborative +filtering recommendations. In UIPC-MF, both users and items are associated with +sets of prototypes, capturing general collaborative attributes. To enhance +explainability, UIPC-MF learns connection weights that reflect the associative +relations between user and item prototypes for recommendations. UIPC-MF +outperforms other prototype-based baseline methods in terms of Hit Ratio and +Normalized Discounted Cumulative Gain on three datasets, while also providing +better transparency. + +
+
+
+
+
+ + ☆ No Regularization is Needed: An Efficient and Effective Model for + Incomplete Label Distribution Learning + + +
+ Label Distribution Learning (LDL) assigns soft labels, a.k.a. degrees, to a
+sample. In reality, it is always laborious to obtain complete degrees, giving
+birth to Incomplete LDL (InLDL). However, InLDL often suffers from performance
+degeneration. To remedy it, existing methods need one or more explicit
+regularizations, leading to burdensome parameter tuning and extra computation.
+We argue that the label distribution itself may provide a useful prior; when
+used appropriately, the InLDL problem can be solved without any explicit
+regularization. In this paper, we offer a rational alternative to use such a
+prior. Our intuition is that large degrees are likely to receive more
+attention, small ones are easily overlooked, and missing degrees are completely
+neglected in InLDL. To learn an accurate label distribution, it is crucial not
+to ignore the small observed degrees but to give them properly large weights,
+while gradually increasing the weights of the missing degrees. To this end, we
+first define a weighted empirical risk and derive upper bounds between the
+expected risk and the weighted empirical risk, which reveals in principle that
+weighting plays an implicit regularization role. Then, by using the prior of
+degrees, we design a weighting scheme and verify its effectiveness. To sum up,
+our model has four advantages: it is 1) model-selection free, as no explicit
+regularization is imposed; 2) equipped with a closed-form solution
+(sub-problem) and easy to implement (a few lines of code); 3) of linear
+computational complexity in the number of samples, thus scalable to large
+datasets; 4) competitive with the state of the art even without any explicit
+regularization.
+
+
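+ A minimal sketch of such a weighting scheme (hypothetical; the exact weights and ramp-up used
+in the paper may differ):
+
+import numpy as np
+
+def inldl_weights(degrees, mask, epoch, max_epoch, eps=1e-6):
+    # degrees: (n, c) observed label-distribution degrees; mask: 1 where a
+    # degree is observed, 0 where it is missing. Small observed degrees get
+    # relatively large weights so they are not overlooked, while the weights
+    # on missing degrees ramp up gradually over training.
+    w_obs = mask / (degrees + eps)
+    w_obs = w_obs / (w_obs.max() + eps)
+    w_miss = (1.0 - mask) * (epoch / max_epoch)
+    return w_obs + w_miss
+
+def weighted_empirical_risk(pred, degrees, weights):
+    # Weighted squared error between predicted and observed degrees.
+    return np.mean(weights * (pred - degrees) ** 2)
+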
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Bayesian Flow Networks + + +
+ This paper introduces Bayesian Flow Networks (BFNs), a new class of +generative model in which the parameters of a set of independent distributions +are modified with Bayesian inference in the light of noisy data samples, then +passed as input to a neural network that outputs a second, interdependent +distribution. Starting from a simple prior and iteratively updating the two +distributions yields a generative procedure similar to the reverse process of +diffusion models; however it is conceptually simpler in that no forward process +is required. Discrete and continuous-time loss functions are derived for +continuous, discretised and discrete data, along with sample generation +procedures. Notably, the network inputs for discrete data lie on the +probability simplex, and are therefore natively differentiable, paving the way +for gradient-based sample guidance and few-step generation in discrete domains +such as language modelling. The loss function directly optimises data +compression and places no restrictions on the network architecture. In our +experiments BFNs achieve competitive log-likelihoods for image modelling on +dynamically binarized MNIST and CIFAR-10, and outperform all known discrete +diffusion models on the text8 character-level language modelling task. + +
+
+
+
+
+ + ☆ S3IM: Stochastic Structural SIMilarity and Its Unreasonable + Effectiveness for Neural Fields ICCV 2023 + + +
+ Recently, Neural Radiance Field (NeRF) has shown great success in rendering
+novel-view images of a given scene by learning an implicit representation with
+only posed RGB images. NeRF and relevant neural field methods (e.g., neural
+surface representation) typically optimize a point-wise loss and make
+point-wise predictions, where one data point corresponds to one pixel.
+Unfortunately, this line of research has failed to use the collective
+supervision of distant pixels, although it is known that pixels in an image or
+scene can provide rich structural information. To the best of our knowledge, we
+are the first to design a nonlocal multiplex training paradigm for NeRF and
+relevant neural field methods via a novel Stochastic Structural SIMilarity
+(S3IM) loss that processes multiple data points as a whole set instead of
+processing multiple inputs independently. Our extensive experiments demonstrate
+the unreasonable effectiveness of S3IM in improving NeRF and neural surface
+representation for nearly free. The improvements in quality metrics can be
+particularly significant for relatively difficult tasks: e.g., the test MSE
+loss unexpectedly drops by more than 90% for TensoRF and DVGO over eight novel
+view synthesis tasks; a 198% F-score gain and a 64% Chamfer $L_{1}$ distance
+reduction for NeuS over eight surface reconstruction tasks. Moreover, S3IM is
+consistently robust even with sparse inputs, corrupted images, and dynamic
+scenes.
+
+
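+ A rough sketch of the multiplex idea (assumptions: per-ray RGB predictions, an external ssim
+function over image batches, and the patch/repeat sizes; not the released code):
+
+import torch
+
+def s3im_loss(pred, target, ssim, patch=32, repeats=4):
+    # pred, target: (N, 3) per-ray colours sampled from the training batch.
+    # Rays are randomly regrouped into pseudo-patches so that an SSIM-style
+    # structural loss couples distant pixels, instead of supervising each
+    # pixel independently with a point-wise loss.
+    n = pred.shape[0]
+    losses = []
+    for _ in range(repeats):
+        idx = torch.randperm(n)[: patch * patch]
+        p = pred[idx].t().reshape(1, 3, patch, patch)
+        t = target[idx].t().reshape(1, 3, patch, patch)
+        losses.append(1.0 - ssim(p, t))
+    return torch.stack(losses).mean()
+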
+
+ comment: ICCV 2023 main conference. Code: https://github.com/Madaoer/S3IM. 14 + pages, 5 figures, 17 tables +
+
+
+
+
+ + ☆ Learning to Optimize LSM-trees: Towards A Reinforcement Learning based + Key-Value Store for Dynamic Workloads + + +
+ LSM-trees are widely adopted as the storage backend of key-value stores. +However, optimizing the system performance under dynamic workloads has not been +sufficiently studied or evaluated in previous work. To fill the gap, we present +RusKey, a key-value store with the following new features: (1) RusKey is a +first attempt to orchestrate LSM-tree structures online to enable robust +performance under the context of dynamic workloads; (2) RusKey is the first +study to use Reinforcement Learning (RL) to guide LSM-tree transformations; (3) +RusKey includes a new LSM-tree design, named FLSM-tree, for an efficient +transition between different compaction policies -- the bottleneck of dynamic +key-value stores. We justify the superiority of the new design with theoretical +analysis; (4) RusKey requires no prior workload knowledge for system +adjustment, in contrast to state-of-the-art techniques. Experiments show that +RusKey exhibits strong performance robustness in diverse workloads, achieving +up to 4x better end-to-end performance than the RocksDB system under various +settings. + +
+
+ comment: 25 pages, 13 figures +
+
+
+
+
+ + ☆ Greedy online change point detection SP 2023 + + +
+ Standard online change point detection (CPD) methods tend to have large false +discovery rates as their detections are sensitive to outliers. To overcome this +drawback, we propose Greedy Online Change Point Detection (GOCPD), a +computationally appealing method which finds change points by maximizing the +probability of the data coming from the (temporal) concatenation of two +independent models. We show that, for time series with a single change point, +this objective is unimodal and thus CPD can be accelerated via ternary search +with logarithmic complexity. We demonstrate the effectiveness of GOCPD on +synthetic data and validate our findings on real-world univariate and +multivariate settings. + +
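+ Since the objective is unimodal for a single change point, the maximising split can be located
+with a ternary search; a small illustrative sketch (the score callable is an assumption standing
+in for the two-model log-likelihood):
+
+def ternary_search_argmax(score, lo, hi):
+    # score(t): joint likelihood of modelling x[:t] and x[t:] with two
+    # independent models. Unimodality lets us discard a third of the
+    # candidate range per iteration, giving O(log n) score evaluations.
+    while hi - lo > 2:
+        m1 = lo + (hi - lo) // 3
+        m2 = hi - (hi - lo) // 3
+        if score(m1) < score(m2):
+            lo = m1 + 1
+        else:
+            hi = m2 - 1
+    return max(range(lo, hi + 1), key=score)
+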
+
+ comment: Accepted at IEEE MLSP 2023 +
+
+
+
+
+ + ☆ Deep convolutional neural networks for cyclic sensor data + + +
+ Predictive maintenance plays a critical role in ensuring the uninterrupted +operation of industrial systems and mitigating the potential risks associated +with system failures. This study focuses on sensor-based condition monitoring +and explores the application of deep learning techniques using a hydraulic +system testbed dataset. Our investigation involves comparing the performance of +three models: a baseline model employing conventional methods, a single CNN +model with early sensor fusion, and a two-lane CNN model (2L-CNN) with late +sensor fusion. The baseline model achieves an impressive test error rate of 1% +by employing late sensor fusion, where feature extraction is performed +individually for each sensor. However, the CNN model encounters challenges due +to the diverse sensor characteristics, resulting in an error rate of 20.5%. To +further investigate this issue, we conduct separate training for each sensor +and observe variations in accuracy. Additionally, we evaluate the performance +of the 2L-CNN model, which demonstrates significant improvement by reducing the +error rate by 33% when considering the combination of the least and most +optimal sensors. This study underscores the importance of effectively +addressing the complexities posed by multi-sensor systems in sensor-based +condition monitoring. + +
+
+ comment: 4 pages, 3 figures, submitted to the IEEE Sensors Conference +
+
+
+
+
+ + ☆ pNNCLR: Stochastic Pseudo Neighborhoods for Contrastive Learning based + Unsupervised Representation Learning Problems + + +
+ Nearest neighbor (NN) sampling provides more semantic variations than +pre-defined transformations for self-supervised learning (SSL) based image +recognition problems. However, its performance is restricted by the quality of +the support set, which holds positive samples for the contrastive loss. In this +work, we show that the quality of the support set plays a crucial role in any +nearest neighbor based method for SSL. We then provide a refined baseline +(pNNCLR) to the nearest neighbor based SSL approach (NNCLR). To this end, we +introduce pseudo nearest neighbors (pNN) to control the quality of the support +set, wherein, rather than sampling the nearest neighbors, we sample in the +vicinity of hard nearest neighbors by varying the magnitude of the resultant +vector and employing a stochastic sampling strategy to improve the performance. +Additionally, to stabilize the effects of uncertainty in NN-based learning, we +employ a smooth-weight-update approach for training the proposed network. +Evaluation of the proposed method on multiple public image recognition and +medical image recognition datasets shows that it performs up to 8 percent +better than the baseline nearest neighbor method, and is comparable to other +previously proposed SSL methods. + +
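+ An illustrative sketch of the pseudo-nearest-neighbour sampling (not the released
+implementation; the top-k size and noise scale are assumptions):
+
+import torch
+import torch.nn.functional as F
+
+def pseudo_nearest_neighbor(query, support, k=5, noise=0.1):
+    # query: (D,) embedding; support: (M, D) support set.
+    # Rather than always taking the single nearest neighbour, stochastically
+    # pick one of the k hardest neighbours and perturb its magnitude, so the
+    # positive lies in the vicinity of a hard nearest neighbour.
+    sims = F.cosine_similarity(support, query.unsqueeze(0), dim=1)
+    pick = support[sims.topk(k).indices[torch.randint(k, (1,)).item()]]
+    scale = 1.0 + noise * torch.randn(1).item()
+    return pick * scale
+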
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ Routing Recovery for UAV Networks with Deliberate Attacks: A + Reinforcement Learning based Approach + + +
+ Unmanned aerial vehicle (UAV) networks have become popular in recent years
+due to their wide range of applications. In a UAV network, routing is
+significantly affected by the distributed network topology, leading to the
+issue that UAVs are vulnerable to deliberate damage. Hence, this paper focuses
+on routing planning and recovery for UAV networks under attack. In detail, a
+deliberate attack model based on the importance of nodes is designed to
+represent enemy attacks. Then, a node importance ranking mechanism is
+presented, considering the degree of nodes and link importance. However, it is
+intractable to handle the routing problem by traditional methods for UAV
+networks, since link connections change with UAV availability. Hence, an
+intelligent algorithm based on reinforcement learning is proposed to recover
+the routing path when UAVs are attacked. Simulations are conducted and
+numerical results verify that the proposed mechanism performs better than other
+reference methods.
+
+
+
+ comment: IEEE GLOBECOM 2023, 6 pages, 4 figures +
+
+
+
+
+ + ☆ AutoAssign+: Automatic Shared Embedding Assignment in Streaming + Recommendation + + +
+ In the domain of streaming recommender systems, conventional methods for +addressing new user IDs or item IDs typically involve assigning initial ID +embeddings randomly. However, this practice results in two practical +challenges: (i) Items or users with limited interactive data may yield +suboptimal prediction performance. (ii) Embedding new IDs or low-frequency IDs +necessitates consistently expanding the embedding table, leading to unnecessary +memory consumption. In light of these concerns, we introduce a reinforcement +learning-driven framework, namely AutoAssign+, that facilitates Automatic +Shared Embedding Assignment Plus. To be specific, AutoAssign+ utilizes an +Identity Agent as an actor network, which plays a dual role: (i) Representing +low-frequency IDs field-wise with a small set of shared embeddings to enhance +the embedding initialization, and (ii) Dynamically determining which ID +features should be retained or eliminated in the embedding table. The policy of +the agent is optimized with the guidance of a critic network. To evaluate the +effectiveness of our approach, we perform extensive experiments on three +commonly used benchmark datasets. Our experiment results demonstrate that +AutoAssign+ is capable of significantly enhancing recommendation performance by +mitigating the cold-start problem. Furthermore, our framework yields a +reduction in memory usage of approximately 20-30%, verifying its practical +effectiveness and efficiency for streaming recommender systems. + +
+
+
+
+
+ + ☆ Graph Structural Residuals: A Learning Approach to Diagnosis + + +
+ Traditional model-based diagnosis relies on constructing explicit system +models, a process that can be laborious and expertise-demanding. In this paper, +we propose a novel framework that combines concepts of model-based diagnosis +with deep graph structure learning. This data-driven approach leverages data to +learn the system's underlying structure and provide dynamic observations, +represented by two distinct graph adjacency matrices. Our work facilitates a +seamless integration of graph structure learning with model-based diagnosis by +making three main contributions: (i) redefining the constructs of system +representation, observations, and faults (ii) introducing two distinct versions +of a self-supervised graph structure learning model architecture and (iii) +demonstrating the potential of our data-driven diagnostic method through +experiments on a system of coupled oscillators. + +
+
+
+
+
+ + ☆ Search to Fine-tune Pre-trained Graph Neural Networks for Graph-level + Tasks + + +
+ Recently, graph neural networks (GNNs) have shown unprecedented success in
+many graph-related tasks. However, GNNs face the label scarcity issue as other
+neural networks do. Thus, recent efforts try to pre-train GNNs on a large-scale
+unlabeled graph and adapt the knowledge from the unlabeled graph to the target
+downstream task. The adaptation is generally achieved by fine-tuning the
+pre-trained GNNs with a limited number of labeled data. Despite the importance
+of fine-tuning, current GNN pre-training works often ignore designing a good
+fine-tuning strategy to better leverage transferred knowledge and improve the
+performance on downstream tasks. Only a few works have started to investigate
+better fine-tuning strategies for pre-trained GNNs, but their designs either
+make strong assumptions or overlook the data-aware issue for various downstream
+datasets. Therefore, in this paper we aim to design a better fine-tuning
+strategy for pre-trained GNNs to improve model performance. Given a pre-trained
+GNN, we propose to search to fine-tune pre-trained graph neural networks for
+graph-level tasks (S2PGNN), which adaptively designs a suitable fine-tuning
+framework for the given labeled data on the downstream task. To ensure the
+improvement brought by searching the fine-tuning strategy, we carefully
+summarize a proper search space of fine-tuning frameworks that is suitable for
+GNNs. The empirical studies show that S2PGNN can be implemented on top of 10
+well-known pre-trained GNNs and consistently improves their performance.
+Besides, S2PGNN achieves better performance than existing fine-tuning
+strategies within and outside the GNN area. Our code is publicly available at
+\url{https://anonymous.4open.science/r/code_icde2024-A9CB/}.
+
+
+
+
+
+
+ + ☆ Data-Driven Allocation of Preventive Care With Application to Diabetes + Mellitus Type II + + +
+ Problem Definition. Increasing costs of healthcare highlight the importance +of effective disease prevention. However, decision models for allocating +preventive care are lacking. + Methodology/Results. In this paper, we develop a data-driven decision model +for determining a cost-effective allocation of preventive treatments to +patients at risk. Specifically, we combine counterfactual inference, machine +learning, and optimization techniques to build a scalable decision model that +can exploit high-dimensional medical data, such as the data found in modern +electronic health records. Our decision model is evaluated based on electronic +health records from 89,191 prediabetic patients. We compare the allocation of +preventive treatments (metformin) prescribed by our data-driven decision model +with that of current practice. We find that if our approach is applied to the +U.S. population, it can yield annual savings of $1.1 billion. Finally, we +analyze the cost-effectiveness under varying budget levels. + Managerial Implications. Our work supports decision-making in health +management, with the goal of achieving effective disease prevention at lower +costs. Importantly, our decision model is generic and can thus be used for +effective allocation of preventive care for other preventable diseases. + +
+
+ comment: Accepted by Manufacturing & Service Operations Management +
+
+
+
+
+ + ☆ CEmb-SAM: Segment Anything Model with Condition Embedding for Joint + Learning from Heterogeneous Datasets + + +
+ Automated segmentation of ultrasound images can assist medical experts with
+diagnostic and therapeutic procedures. Although using the common modality of
+ultrasound, one typically needs separate datasets in order to segment, for
+example, different anatomical structures or lesions with different levels of
+malignancy. In this paper, we consider the problem of jointly learning from
+heterogeneous datasets so that the model can improve generalization abilities
+by leveraging the inherent variability among datasets. We merge the
+heterogeneous datasets into one dataset and refer to each component dataset as
+a subgroup. We propose to train a single segmentation model so that the model
+can adapt to each subgroup. For robust segmentation, we leverage the recently
+proposed Segment Anything Model (SAM) in order to incorporate subgroup
+information into the model. We propose SAM with Condition Embedding block
+(CEmb-SAM), which encodes subgroup conditions and combines them with image
+embeddings from SAM. The conditional embedding block effectively adapts SAM to
+each image subgroup by incorporating dataset properties through learnable
+parameters for normalization. Experiments show that CEmb-SAM outperforms the
+baseline methods on ultrasound image segmentation for peripheral nerves and
+breast cancer. The experiments highlight the effectiveness of CEmb-SAM in
+learning from heterogeneous datasets in medical image segmentation tasks.
+
+
+
+
+
+
+ + ☆ Channel-Wise Contrastive Learning for Learning with Noisy Labels + + +
+ In real-world datasets, noisy labels are pervasive. The challenge of learning +with noisy labels (LNL) is to train a classifier that discerns the actual +classes from given instances. For this, the model must identify features +indicative of the authentic labels. While research indicates that genuine label +information is embedded in the learned features of even inaccurately labeled +data, it's often intertwined with noise, complicating its direct application. +Addressing this, we introduce channel-wise contrastive learning (CWCL). This +method distinguishes authentic label information from noise by undertaking +contrastive learning across diverse channels. Unlike conventional instance-wise +contrastive learning (IWCL), CWCL tends to yield more nuanced and resilient +features aligned with the authentic labels. Our strategy is twofold: firstly, +using CWCL to extract pertinent features to identify cleanly labeled samples, +and secondly, progressively fine-tuning using these samples. Evaluations on +several benchmark datasets validate our method's superiority over existing +approaches. + +
+
+
+
+
+ + ☆ Knowing Where to Focus: Event-aware Transformer for Video Grounding ICCV 2023 + + +
+ Recent DETR-based video grounding models have made the model directly predict +moment timestamps without any hand-crafted components, such as a pre-defined +proposal or non-maximum suppression, by learning moment queries. However, their +input-agnostic moment queries inevitably overlook an intrinsic temporal +structure of a video, providing limited positional information. In this paper, +we formulate an event-aware dynamic moment query to enable the model to take +the input-specific content and positional information of the video into +account. To this end, we present two levels of reasoning: 1) Event reasoning +that captures distinctive event units constituting a given video using a slot +attention mechanism; and 2) moment reasoning that fuses the moment queries with +a given sentence through a gated fusion transformer layer and learns +interactions between the moment queries and video-sentence representations to +predict moment timestamps. Extensive experiments demonstrate the effectiveness +and efficiency of the event-aware dynamic moment queries, outperforming +state-of-the-art approaches on several video grounding benchmarks. + +
+
+ comment: ICCV 2023. Code is available at https://github.com/jinhyunj/EaTR +
+
+
+
+
+ + ☆ Semantic-aware Network for Aerial-to-Ground Image Synthesis ICIP 2021 + + +
+ Aerial-to-ground image synthesis is an emerging and challenging problem that
+aims to synthesize a ground image from an aerial image. Due to the highly
+different layout and object representation between aerial and ground images,
+existing approaches usually fail to transfer the components of the aerial scene
+into the ground scene. In this paper, we propose a novel framework to address
+these challenges by imposing enhanced structural alignment and semantic
+awareness. We introduce a novel semantic-attentive feature transformation
+module that allows complex geographic structures to be reconstructed by
+aligning the aerial features to the ground layout. Furthermore, we propose
+semantic-aware loss functions by leveraging a pre-trained segmentation network.
+The network is enforced to synthesize realistic objects across various classes
+by separately calculating losses for different classes and balancing them.
+Extensive experiments, including comparisons with previous methods and ablation
+studies, show the effectiveness of the proposed framework both qualitatively
+and quantitatively.
+
+
+
+ comment: ICIP 2021. Code is available at https://github.com/jinhyunj/SANet +
+
+
+
+
+ + ☆ Insurance pricing on price comparison websites via reinforcement + learning + + +
+ The emergence of price comparison websites (PCWs) has presented insurers with
+unique challenges in formulating effective pricing strategies. Operating on
+PCWs requires insurers to strike a delicate balance between competitive
+premiums and profitability, amidst obstacles such as low historical conversion
+rates, limited visibility of competitors' actions, and a dynamic market
+environment. In addition, the capital-intensive nature of the business means
+that pricing below the risk levels of customers can result in solvency issues
+for the insurer. To address these challenges, this paper introduces a
+reinforcement learning (RL) framework that learns the optimal pricing policy by
+integrating model-based and model-free methods. The model-based component is
+used to train agents in an offline setting, avoiding cold-start issues, while
+model-free algorithms are then employed in a contextual bandit (CB) manner to
+dynamically update the pricing policy to maximise the expected revenue. This
+facilitates quick adaptation to evolving market dynamics and enhances algorithm
+efficiency and decision interpretability. The paper also highlights the
+importance of evaluating pricing policies using an offline dataset in a
+consistent fashion and demonstrates the superiority of the proposed methodology
+over existing off-the-shelf RL/CB approaches. We validate our methodology using
+synthetic data, generated to reflect private commercially available data within
+real-world insurers, and compare against six other benchmark approaches. Our
+hybrid agent outperforms these benchmarks in terms of sample efficiency and
+cumulative reward, with the exception of an agent that has access to perfect
+market information, which would not be available in a real-world set-up.
+
+
+
+
+
+
+ + ☆ Predicting Listing Prices In Dynamic Short Term Rental Markets Using + Machine Learning Models + + +
+ Our research group set out to tackle the difficult task of predicting prices
+in a dynamic market, and short-term rentals such as Airbnb listings are a
+natural proving ground for doing so. Airbnb has revolutionized the travel
+industry by providing a platform for homeowners to rent out their properties to
+travelers. The pricing of Airbnb rentals is prone to high fluctuations, with
+prices changing frequently based on demand, seasonality, and other factors.
+Accurate prediction of Airbnb rental prices is crucial for hosts to optimize
+their revenue and for travelers to make informed booking decisions. In this
+project, we aim to predict the prices of Airbnb rentals using a machine
+learning modeling approach.
+ Our project expands on earlier research in the area of analyzing Airbnb
+rental prices by taking a methodical machine learning approach as well as
+incorporating sentiment analysis into our feature engineering. We intend to
+gain a deeper understanding of periodic changes in Airbnb rental prices. The
+primary objective of this study is to construct an accurate machine learning
+model for predicting Airbnb rental prices specifically in Austin, Texas. Our
+project's secondary objective is to identify the key factors that drive Airbnb
+rental prices and to investigate how these factors vary across different
+locations and property types.
+
+
+
+ comment: 40 pages, 10 tables, 12 figures +
+
+
+
+
+ + ☆ CBA: Improving Online Continual Learning via Continual Bias Adaptor ICCV 2023 + + +
+ Online continual learning (CL) aims to learn new knowledge and consolidate +previously learned knowledge from non-stationary data streams. Due to the +time-varying training setting, the model learned from a changing distribution +easily forgets the previously learned knowledge and biases toward the newly +received task. To address this problem, we propose a Continual Bias Adaptor +(CBA) module to augment the classifier network to adapt to catastrophic +distribution change during training, such that the classifier network is able +to learn a stable consolidation of previously learned tasks. In the testing +stage, CBA can be removed which introduces no additional computation cost and +memory overhead. We theoretically reveal the reason why the proposed method can +effectively alleviate catastrophic distribution shifts, and empirically +demonstrate its effectiveness through extensive experiments based on four +rehearsal-based baselines and three public continual learning benchmarks. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ CausalLM is not optimal for in-context learning + + +
+ Recent empirical evidence indicates that transformer-based in-context
+learning performs better when using a prefix language model (prefixLM), in
+which in-context samples can all attend to each other, compared to causal
+language models (causalLM), which use auto-regressive attention that prohibits
+in-context samples from attending to future samples. While this result is
+intuitive, it is not understood from a theoretical perspective. In this paper
+we take a theoretical approach and analyze the convergence behavior of prefixLM
+and causalLM under a certain parameter construction. Our analysis shows that
+both LM types converge to their stationary points at a linear rate, but that
+while prefixLM converges to the optimal solution of linear regression, the
+convergence dynamics of causalLM follow those of an online gradient descent
+algorithm, which is not guaranteed to be optimal even as the number of samples
+grows infinitely. We supplement our theoretical claims with empirical
+experiments over synthetic and real tasks and using various types of
+transformers. Our experiments verify that causalLM consistently underperforms
+prefixLM in all settings.
+
+
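+ The distinction boils down to the attention mask; a small illustrative sketch (True means
+"may attend"):
+
+import torch
+
+def attention_masks(prefix_len, total_len):
+    # causalLM: strictly lower-triangular, so earlier in-context examples can
+    # never attend to later ones. prefixLM: positions inside the prefix (the
+    # in-context examples) attend to each other bidirectionally, while
+    # positions after the prefix remain causal.
+    causal = torch.tril(torch.ones(total_len, total_len, dtype=torch.bool))
+    prefix = causal.clone()
+    prefix[:prefix_len, :prefix_len] = True
+    return causal, prefix
+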
+
+
+
+
+ + ☆ GIT-Mol: A Multi-modal Large Language Model for Molecular Science with + Graph, Image, and Text + + +
+ Large language models have made significant strides in natural language +processing, paving the way for innovative applications including molecular +representation and generation. However, most existing single-modality +approaches cannot capture the abundant and complex information in molecular +data. Here, we introduce GIT-Mol, a multi-modal large language model that +integrates the structure Graph, Image, and Text information, including the +Simplified Molecular Input Line Entry System (SMILES) and molecular captions. +To facilitate the integration of multi-modal molecular data, we propose +GIT-Former, a novel architecture capable of mapping all modalities into a +unified latent space. Our study develops an innovative any-to-language +molecular translation strategy and achieves a 10%-15% improvement in molecular +captioning, a 5%-10% accuracy increase in property prediction, and a 20% boost +in molecule generation validity compared to baseline or single-modality models. + +
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ Generative Interpretation + + +
+ We introduce generative interpretation, a new approach to estimating +contractual meaning using large language models. As AI triumphalism is the +order of the day, we proceed by way of grounded case studies, each illustrating +the capabilities of these novel tools in distinct ways. Taking well-known +contracts opinions, and sourcing the actual agreements that they adjudicated, +we show that AI models can help factfinders ascertain ordinary meaning in +context, quantify ambiguity, and fill gaps in parties' agreements. We also +illustrate how models can calculate the probative value of individual pieces of +extrinsic evidence. After offering best practices for the use of these models +given their limitations, we consider their implications for judicial practice +and contract theory. Using LLMs permits courts to estimate what the parties +intended cheaply and accurately, and as such generative interpretation +unsettles the current interpretative stalemate. Their use responds to +efficiency-minded textualists and justice-oriented contextualists, who argue +about whether parties will prefer cost and certainty or accuracy and fairness. +Parties--and courts--would prefer a middle path, in which adjudicators strive +to predict what the contract really meant, admitting just enough context to +approximate reality while avoiding unguided and biased assimilation of +evidence. As generative interpretation offers this possibility, we argue it can +become the new workhorse of contractual interpretation. + +
+
+
+
+
+ + ☆ Federated Classification in Hyperbolic Spaces via Secure Aggregation of + Convex Hulls + + +
+ Hierarchical and tree-like data sets arise in many applications, including +language processing, graph data mining, phylogeny and genomics. It is known +that tree-like data cannot be embedded into Euclidean spaces of finite +dimension with small distortion. This problem can be mitigated through the use +of hyperbolic spaces. When such data also has to be processed in a distributed +and privatized setting, it becomes necessary to work with new federated +learning methods tailored to hyperbolic spaces. As an initial step towards the +development of the field of federated learning in hyperbolic spaces, we propose +the first known approach to federated classification in hyperbolic spaces. Our +contributions are as follows. First, we develop distributed versions of convex +SVM classifiers for Poincar\'e discs. In this setting, the information conveyed +from clients to the global classifier are convex hulls of clusters present in +individual client data. Second, to avoid label switching issues, we introduce a +number-theoretic approach for label recovery based on the so-called integer +$B_h$ sequences. Third, we compute the complexity of the convex hulls in +hyperbolic spaces to assess the extent of data leakage; at the same time, in +order to limit the communication cost for the hulls, we propose a new +quantization method for the Poincar\'e disc coupled with Reed-Solomon-like +encoding. Fourth, at server level, we introduce a new approach for aggregating +convex hulls of the clients based on balanced graph partitioning. We test our +method on a collection of diverse data sets, including hierarchical single-cell +RNA-seq data from different patients distributed across different repositories +that have stringent privacy constraints. The classification accuracy of our +method is up to $\sim 11\%$ better than its Euclidean counterpart, +demonstrating the importance of privacy-preserving learning in hyperbolic +spaces. + +
+
+
+
+
+ + ☆ Bridging Offline-Online Evaluation with a Time-dependent and Popularity + Bias-free Offline Metric for Recommenders KDD + + +
+ The evaluation of recommendation systems is a complex task. The offline and +online evaluation metrics for recommender systems are ambiguous in their true +objectives. The majority of recently published papers benchmark their methods +using ill-posed offline evaluation methodology that often fails to predict true +online performance. Because of this, the impact that academic research has on +the industry is reduced. The aim of our research is to investigate and compare +the online performance of offline evaluation metrics. We show that penalizing +popular items and considering the time of transactions during the evaluation +significantly improves our ability to choose the best recommendation model for +a live recommender system. Our results, averaged over five large-size +real-world live data procured from recommenders, aim to help the academic +community to understand better offline evaluation and optimization criteria +that are more relevant for real applications of recommender systems. + +
+
+ comment: Accepted to evalRS 2023@KDD +
+
+
+
+
+ + ☆ Multi-Receiver Task-Oriented Communications via Multi-Task Deep Learning + + +
+ This paper studies task-oriented, otherwise known as goal-oriented, +communications, in a setting where a transmitter communicates with multiple +receivers, each with its own task to complete on a dataset, e.g., images, +available at the transmitter. A multi-task deep learning approach that involves +training a common encoder at the transmitter and individual decoders at the +receivers is presented for joint optimization of completing multiple tasks and +communicating with multiple receivers. By providing efficient resource +allocation at the edge of 6G networks, the proposed approach allows the +communications system to adapt to varying channel conditions and achieves +task-specific objectives while minimizing transmission overhead. Joint training +of the encoder and decoders using multi-task learning captures shared +information across tasks and optimizes the communication process accordingly. +By leveraging the broadcast nature of wireless communications, multi-receiver +task-oriented communications (MTOC) reduces the number of transmissions +required to complete tasks at different receivers. Performance evaluation +conducted on the MNIST, Fashion MNIST, and CIFAR-10 datasets (with image +classification considered for different tasks) demonstrates the effectiveness +of MTOC in terms of classification accuracy and resource utilization compared +to single-task-oriented communication systems. + +
+
+
+
+
+ + ☆ Quantifying Outlierness of Funds from their Categories using Supervised + Similarity + + +
+ Mutual fund categorization has become a standard tool for the investment +management industry and is extensively used by allocators for portfolio +construction and manager selection, as well as by fund managers for peer +analysis and competitive positioning. As a result, a (unintended) +miscategorization or lack of precision can significantly impact allocation +decisions and investment fund managers. Here, we aim to quantify the effect of +miscategorization of funds utilizing a machine learning based approach. We +formulate the problem of miscategorization of funds as a distance-based outlier +detection problem, where the outliers are the data-points that are far from the +rest of the data-points in the given feature space. We implement and employ a +Random Forest (RF) based method of distance metric learning, and compute the +so-called class-wise outlier measures for each data-point to identify outliers +in the data. We test our implementation on various publicly available data +sets, and then apply it to mutual fund data. We show that there is a strong +relationship between the outlier measures of the funds and their future returns +and discuss the implications of our findings. + +
+
+ comment: 8 pages, 5 tables, 8 figures +
+
+
+
+
+ + ☆ AutoSeqRec: Autoencoder for Efficient Sequential Recommendation CIKM 2023 + + +
+ Sequential recommendation demonstrates the capability to recommend items by +modeling the sequential behavior of users. Traditional methods typically treat +users as sequences of items, overlooking the collaborative relationships among +them. Graph-based methods incorporate collaborative information by utilizing +the user-item interaction graph. However, these methods sometimes face +challenges in terms of time complexity and computational efficiency. To address +these limitations, this paper presents AutoSeqRec, an incremental +recommendation model specifically designed for sequential recommendation tasks. +AutoSeqRec is based on autoencoders and consists of an encoder and three +decoders within the autoencoder architecture. These components consider both +the user-item interaction matrix and the rows and columns of the item +transition matrix. The reconstruction of the user-item interaction matrix +captures user long-term preferences through collaborative filtering. In +addition, the rows and columns of the item transition matrix represent the item +out-degree and in-degree hopping behavior, which allows for modeling the user's +short-term interests. When making incremental recommendations, only the input +matrices need to be updated, without the need to update parameters, which makes +AutoSeqRec very efficient. Comprehensive evaluations demonstrate that +AutoSeqRec outperforms existing methods in terms of accuracy, while showcasing +its robustness and efficiency. + +
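+ A loose sketch of the described encoder/three-decoder layout (assumed layer sizes and a plain
+linear autoencoder; not the authors' code):
+
+import torch
+import torch.nn as nn
+
+class AutoSeqRecSketch(nn.Module):
+    # One shared encoder and three decoders: the first reconstructs the
+    # user-item interaction matrix (long-term preferences), the other two
+    # reconstruct the rows and columns of the item transition matrix
+    # (short-term out-degree / in-degree hopping behaviour).
+    def __init__(self, n_items, hidden=128):
+        super().__init__()
+        self.encoder = nn.Linear(n_items, hidden)
+        self.dec_interactions = nn.Linear(hidden, n_items)
+        self.dec_transition_rows = nn.Linear(hidden, n_items)
+        self.dec_transition_cols = nn.Linear(hidden, n_items)
+
+    def forward(self, x):
+        h = torch.relu(self.encoder(x))
+        return (self.dec_interactions(h),
+                self.dec_transition_rows(h),
+                self.dec_transition_cols(h))
+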
+
+ comment: 10 pages, accepted by CIKM 2023 +
+
+
+
+
+ + ☆ SpeechX: Neural Codec Language Model as a Versatile Speech Transformer + + +
+ Recent advancements in generative speech models based on audio-text prompts +have enabled remarkable innovations like high-quality zero-shot text-to-speech. +However, existing models still face limitations in handling diverse audio-text +speech generation tasks involving transforming input speech and processing +audio captured in adverse acoustic conditions. This paper introduces SpeechX, a +versatile speech generation model capable of zero-shot TTS and various speech +transformation tasks, dealing with both clean and noisy signals. SpeechX +combines neural codec language modeling with multi-task learning using +task-dependent prompting, enabling unified and extensible modeling and +providing a consistent way for leveraging textual input in speech enhancement +and transformation tasks. Experimental results show SpeechX's efficacy in +various tasks, including zero-shot TTS, noise suppression, target speaker +extraction, speech removal, and speech editing with or without background +noise, achieving comparable or superior performance to specialized models +across tasks. See https://aka.ms/speechx for demo samples. + +
+
+ comment: See https://aka.ms/speechx for demo samples +
+
+
+
+
+ + ☆ ST-MLP: A Cascaded Spatio-Temporal Linear Framework with + Channel-Independence Strategy for Traffic Forecasting + + +
+ The criticality of prompt and precise traffic forecasting in optimizing +traffic flow management in Intelligent Transportation Systems (ITS) has drawn +substantial scholarly focus. Spatio-Temporal Graph Neural Networks (STGNNs) +have been lauded for their adaptability to road graph structures. Yet, current +research on STGNNs architectures often prioritizes complex designs, leading to +elevated computational burdens with only minor enhancements in accuracy. To +address this issue, we propose ST-MLP, a concise spatio-temporal model solely +based on cascaded Multi-Layer Perceptron (MLP) modules and linear layers. +Specifically, we incorporate temporal information, spatial information and +predefined graph structure with a successful implementation of the +channel-independence strategy - an effective technique in time series +forecasting. Empirical results demonstrate that ST-MLP outperforms +state-of-the-art STGNNs and other models in terms of accuracy and computational +efficiency. Our finding encourages further exploration of more concise and +effective neural network architectures in the field of traffic forecasting. + +
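+ A compact sketch of the channel-independence strategy (hypothetical shapes; the actual model
+cascades several such blocks together with spatial and graph information):
+
+import torch
+import torch.nn as nn
+
+class ChannelIndependentMLP(nn.Module):
+    # One shared MLP maps each sensor channel's history to its forecast
+    # independently: time steps are mixed, channels are not.
+    def __init__(self, history, horizon, hidden=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(history, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, horizon),
+        )
+
+    def forward(self, x):
+        # x: (batch, channels, history) -> (batch, channels, horizon)
+        return self.mlp(x)
+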
+
+
+
+
+ + ☆ Adaptive Tracking of a Single-Rigid-Body Character in Various + Environments + + +
+ Since the introduction of DeepMimic [Peng et al. 2018], subsequent research +has focused on expanding the repertoire of simulated motions across various +scenarios. In this study, we propose an alternative approach for this goal, a +deep reinforcement learning method based on the simulation of a +single-rigid-body character. Using the centroidal dynamics model (CDM) to +express the full-body character as a single rigid body (SRB) and training a +policy to track a reference motion, we can obtain a policy that is capable of +adapting to various unobserved environmental changes and controller transitions +without requiring any additional learning. Due to the reduced dimension of +state and action space, the learning process is sample-efficient. The final +full-body motion is kinematically generated in a physically plausible way, +based on the state of the simulated SRB character. The SRB simulation is +formulated as a quadratic programming (QP) problem, and the policy outputs an +action that allows the SRB character to follow the reference motion. We +demonstrate that our policy, efficiently trained within 30 minutes on an +ultraportable laptop, has the ability to cope with environments that have not +been experienced during learning, such as running on uneven terrain or pushing +a box, and transitions between learned policies, without any additional +learning. + +
+
+
+
+
+ + ☆ O-1: Self-training with Oracle and 1-best Hypothesis + + +
+ We introduce O-1, a new self-training objective to reduce training bias and
+unify training and evaluation metrics for speech recognition. O-1 is a faster
+variant of Expected Minimum Bayes Risk (EMBR) that boosts the oracle hypothesis
+and can accommodate both supervised and unsupervised data. We demonstrate the
+effectiveness of our approach in terms of recognition on publicly available
+SpeechStew datasets and a large-scale, in-house data set. On SpeechStew, the
+O-1 objective closes the gap between the actual and oracle performance by 80\%
+relative compared to EMBR, which bridges the gap by 43\% relative. O-1 achieves
+13\% to 25\% relative improvement over EMBR on the various datasets that
+SpeechStew comprises, and a 12\% relative gap reduction with respect to the
+oracle WER over EMBR training on the in-house dataset. Overall, O-1 results in
+a 9\% relative improvement in WER over EMBR, thereby speaking to the
+scalability of the proposed objective for large-scale datasets.
+
+
+
+
+
+
+ + ☆ OCDaf: Ordered Causal Discovery with Autoregressive Flows + + +
+ We propose OCDaf, a novel order-based method for learning causal graphs from +observational data. We establish the identifiability of causal graphs within +multivariate heteroscedastic noise models, a generalization of additive noise +models that allow for non-constant noise variances. Drawing upon the structural +similarities between these models and affine autoregressive normalizing flows, +we introduce a continuous search algorithm to find causal structures. Our +experiments demonstrate state-of-the-art performance across the Sachs and +SynTReN benchmarks in Structural Hamming Distance (SHD) and Structural +Intervention Distance (SID). Furthermore, we validate our identifiability +theory across various parametric and nonparametric synthetic datasets and +showcase superior performance compared to existing baselines. + +
+
+
+
+
+ + ☆ Symphony: Optimized Model Serving using Centralized Orchestration + + +
+ The orchestration of deep neural network (DNN) model inference on GPU +clusters presents two significant challenges: achieving high accelerator +efficiency given the batching properties of model inference while meeting +latency service level objectives (SLOs), and adapting to workload changes both +in terms of short-term fluctuations and long-term resource allocation. To +address these challenges, we propose Symphony, a centralized scheduling system +that can scale to millions of requests per second and coordinate tens of +thousands of GPUs. Our system utilizes a non-work-conserving scheduling +algorithm capable of achieving high batch efficiency while also enabling robust +autoscaling. Additionally, we developed an epoch-scale algorithm that allocates +models to sub-clusters based on the compute and memory needs of the models. +Through extensive experiments, we demonstrate that Symphony outperforms prior +systems by up to 4.7x higher goodput. + +
+
+
+
+
+ + ☆ Omega-Regular Reward Machines ECAI-2023 + + +
+ Reinforcement learning (RL) is a powerful approach for training agents to
+perform tasks, but designing an appropriate reward mechanism is critical to its
+success. However, in many cases, the complexity of the learning objectives goes
+beyond the capabilities of the Markovian assumption, necessitating a more
+sophisticated reward mechanism. Reward machines and omega-regular languages are
+two formalisms used to express non-Markovian rewards for quantitative and
+qualitative objectives, respectively. This paper introduces omega-regular
+reward machines, which integrate reward machines with omega-regular languages
+to enable an expressive and effective reward mechanism for RL. We present a
+model-free RL algorithm to compute epsilon-optimal strategies against
+omega-regular reward machines and evaluate the effectiveness of the proposed
+algorithm through experiments.
+
+
+
+ comment: To appear in ECAI-2023 +
+
+
+
+
+ + ☆ There Is a Digital Art History + + +
+ In this paper, we revisit Johanna Drucker's question, "Is there a digital art +history?" -- posed exactly a decade ago -- in the light of the emergence of +large-scale, transformer-based vision models. While more traditional types of +neural networks have long been part of digital art history, and digital +humanities projects have recently begun to use transformer models, their +epistemic implications and methodological affordances have not yet been +systematically analyzed. We focus our analysis on two main aspects that, +together, seem to suggest a coming paradigm shift towards a "digital" art +history in Drucker's sense. On the one hand, the visual-cultural repertoire +newly encoded in large-scale vision models has an outsized effect on digital +art history. The inclusion of significant numbers of non-photographic images +allows for the extraction and automation of different forms of visual logics. +Large-scale vision models have "seen" large parts of the Western visual canon +mediated by Net visual culture, and they continuously solidify and concretize +this canon through their already widespread application in all aspects of +digital life. On the other hand, based on two technical case studies of +utilizing a contemporary large-scale visual model to investigate basic +questions from the fields of art history and urbanism, we suggest that such +systems require a new critical methodology that takes into account the +epistemic entanglement of a model and its applications. This new methodology +reads its corpora through a neural model's training data, and vice versa: the +visual ideologies of research datasets and training datasets become entangled. + +
+
+
+
+
+ + ☆ GRU-D-Weibull: A Novel Real-Time Individualized Endpoint Prediction + + +
+ Accurate prediction models for individual-level endpoints and
+time-to-endpoints are crucial in clinical practice. In this study, we propose a
+novel approach, GRU-D-Weibull, which combines gated recurrent units with decay
+(GRU-D) to model the Weibull distribution. Our method enables real-time
+individualized endpoint prediction and population-level risk management. Using
+a cohort of 6,879 patients with stage 4 chronic kidney disease (CKD4), we
+evaluated the performance of GRU-D-Weibull in endpoint prediction. The C-index
+of GRU-D-Weibull was ~0.7 at the index date and increased to ~0.77 after 4.3
+years of follow-up, similar to that of a random survival forest. Our approach
+achieved an absolute L1-loss of ~1.1 years (SD 0.95) at the CKD4 index date and
+a minimum of ~0.45 years (SD 0.3) at 4 years of follow-up, significantly
+outperforming competing methods. GRU-D-Weibull consistently constrained the
+predicted survival probability at the time of an event within a smaller and
+more fixed range compared to other models throughout the follow-up period. We
+observed significant correlations between the error in point estimates and
+missing proportions of input features at the index date (correlations from ~0.1
+to ~0.3), which diminished within 1 year as more data became available. By
+post-training recalibration, we successfully aligned the predicted and observed
+survival probabilities across multiple prediction horizons at different time
+points during follow-up. Our findings demonstrate the considerable potential of
+GRU-D-Weibull as the next-generation architecture for endpoint risk management,
+capable of generating various endpoint estimates for real-time monitoring using
+clinical data.
+
+
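+ A hedged sketch of how a Weibull output head on top of a recurrent encoder might look
+(illustrative; GRU-D itself and the training loss are omitted, and layer names are assumptions):
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class WeibullHead(nn.Module):
+    # Maps a hidden state (e.g. from a GRU-D encoder) to the shape k and
+    # scale lambda of a Weibull time-to-event distribution, from which the
+    # survival probability S(t) = exp(-(t / lambda)^k) at any horizon t can
+    # be evaluated in real time.
+    def __init__(self, hidden_size):
+        super().__init__()
+        self.proj = nn.Linear(hidden_size, 2)
+
+    def forward(self, h, horizon):
+        k, lam = F.softplus(self.proj(h)).unbind(dim=-1)
+        survival = torch.exp(-((horizon / lam) ** k))
+        return k, lam, survival
+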
+
+ comment: 30 pages, 7 figures, 4 supplementary figures +
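+
+ A minimal sketch of the core idea: a recurrent encoder whose output parameterizes a
+ Weibull time-to-event distribution, trained with a censoring-aware likelihood. This is an
+ illustrative reconstruction in PyTorch, not the authors' GRU-D-Weibull (which additionally
+ handles missingness with trainable decay); all names and sizes are placeholders.
+
+   import torch
+   import torch.nn as nn
+
+   class GRUWeibull(nn.Module):
+       """Toy GRU -> Weibull head (no GRU-D decay/imputation)."""
+       def __init__(self, n_features, hidden=64):
+           super().__init__()
+           self.gru = nn.GRU(n_features, hidden, batch_first=True)
+           self.head = nn.Linear(hidden, 2)            # -> (log shape k, log scale lam)
+
+       def forward(self, x):                           # x: (batch, time, n_features)
+           _, h = self.gru(x)
+           k, lam = torch.exp(self.head(h[-1])).unbind(-1)
+           return k, lam
+
+   def weibull_nll(k, lam, t, event):
+       """Right-censored Weibull negative log-likelihood.
+       t: time to event or censoring; event: 1 if the event was observed, 0 if censored."""
+       log_hazard = torch.log(k) - torch.log(lam) + (k - 1) * (torch.log(t) - torch.log(lam))
+       log_survival = -(t / lam) ** k
+       return -(event * log_hazard + log_survival).mean()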
+
+
+
+
+ + ☆ Open-set Face Recognition using Ensembles trained on Clustered Data + + +
+ Open-set face recognition describes a scenario where unknown subjects, unseen
+during the training stage, appear at test time. Not only does it require methods
+that accurately identify individuals of interest, but it also demands approaches
+that effectively deal with unfamiliar faces. This work details a scalable
+open-set face identification approach to galleries composed of hundreds and
+thousands of subjects. It is composed of clustering and an ensemble of binary
+learning algorithms that estimates whether query face samples belong to the face
+gallery and then retrieves their correct identity. The approach selects the
+most suitable gallery subjects and uses the ensemble to improve prediction
+performance. We carry out experiments on the well-known LFW and YTF benchmarks.
+Results show that competitive performance can be achieved even when targeting
+scalability.
+
+
+ comment: [Original paper title: Unconstrained Face Identification using + Ensembles trained on Clustered Data] [2020 IEEE International Joint + Conference on Biometrics (IJCB)] + [https://ieeexplore.ieee.org/document/9304882] +
+
+
+
+
+ + ☆ Physics-Informed Deep Learning to Reduce the Bias in Joint Prediction of + Nitrogen Oxides + + +
+ Atmospheric nitrogen oxides (NOx) primarily from fuel combustion have +recognized acute and chronic health and environmental effects. Machine learning +(ML) methods have significantly enhanced our capacity to predict NOx +concentrations at ground-level with high spatiotemporal resolution but may +suffer from high estimation bias since they lack physical and chemical +knowledge about air pollution dynamics. Chemical transport models (CTMs) +leverage this knowledge; however, accurate predictions of ground-level +concentrations typically necessitate extensive post-calibration. Here, we +present a physics-informed deep learning framework that encodes +advection-diffusion mechanisms and fluid dynamics constraints to jointly +predict NO2 and NOx and reduce ML model bias by 21-42%. Our approach captures +fine-scale transport of NO2 and NOx, generates robust spatial extrapolation, +and provides explicit uncertainty estimation. The framework fuses +knowledge-driven physicochemical principles of CTMs with the predictive power +of ML for air quality exposure, health, and policy applications. Our approach +offers significant improvements over purely data-driven ML methods and has +unprecedented bias reduction in joint NO2 and NOx prediction. + +
+
+
+
+
+ + ☆ Interaction-Aware Personalized Vehicle Trajectory Prediction Using + Temporal Graph Neural Networks + + +
+ Accurate prediction of vehicle trajectories is vital for advanced driver +assistance systems and autonomous vehicles. Existing methods mainly rely on +generic trajectory predictions derived from large datasets, overlooking the +personalized driving patterns of individual drivers. To address this gap, we +propose an approach for interaction-aware personalized vehicle trajectory +prediction that incorporates temporal graph neural networks. Our method +utilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to +model the spatio-temporal interactions between target vehicles and their +surrounding traffic. To personalize the predictions, we establish a pipeline +that leverages transfer learning: the model is initially pre-trained on a +large-scale trajectory dataset and then fine-tuned for each driver using their +specific driving data. We employ human-in-the-loop simulation to collect +personalized naturalistic driving trajectories and corresponding surrounding +vehicle trajectories. Experimental results demonstrate the superior performance +of our personalized GCN-LSTM model, particularly for longer prediction +horizons, compared to its generic counterpart. Moreover, the personalized model +outperforms individual models created without pre-training, emphasizing the +significance of pre-training on a large dataset to avoid overfitting. By +incorporating personalization, our approach enhances trajectory prediction +accuracy. + +
+
+
+
+
+ + ☆ A Hybrid Deep Spatio-Temporal Attention-Based Model for Parkinson's + Disease Diagnosis Using Resting State EEG Signals + + +
+ Parkinson's disease (PD), a severe and progressive neurological illness,
+affects millions of individuals worldwide. For effective treatment and
+management of PD, an accurate and early diagnosis is crucial. This study
+presents a deep learning-based model for the diagnosis of PD using resting
+state electroencephalogram (EEG) signals. The objective of the study is to
+develop an automated model that can extract complex hidden nonlinear features
+from EEG and demonstrate its generalizability on unseen data. The model is
+designed as a hybrid architecture consisting of a convolutional neural network
+(CNN), a bidirectional gated recurrent unit (Bi-GRU), and an attention mechanism.
+The proposed method is evaluated on three public datasets (UC San Diego dataset,
+PRED-CT, and University of Iowa (UI) dataset), with one dataset used for
+training and the other two for evaluation. The results show that the proposed
+model can accurately diagnose PD with high performance on both the training and
+hold-out datasets. The model also performs well even when some part of the
+input information is missing. The results of this work have significant
+implications for patient treatment and for ongoing investigations into the
+early detection of Parkinson's disease. The suggested model holds promise as a
+non-invasive and reliable technique for early PD detection utilizing resting
+state EEG.
+
+
+
+
+
+ + ☆ Addressing Distribution Shift in RTB Markets via Exponential Tilting + + +
+ Distribution shift in machine learning models can be a primary cause of
+performance degradation. This paper delves into the characteristics of these
+shifts, primarily motivated by Real-Time Bidding (RTB) market models. We
+emphasize the challenges posed by class imbalance and sample selection bias,
+both potent instigators of distribution shifts. This paper introduces the
+Exponential Tilt Reweighting Alignment (ExTRA) algorithm, as proposed by Marty
+et al. (2023), to address distribution shifts in data. The ExTRA method is
+designed to determine the importance weights on the source data, aiming to
+minimize the KL divergence between the weighted source and target datasets. A
+notable advantage of this method is its ability to operate using labeled source
+data and unlabeled target data. Through simulated real-world data, we
+investigate the nature of distribution shift and evaluate the applicability of
+the proposed model.
+
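+
+ As a rough illustration of exponential-tilt reweighting (a generic density-ratio sketch,
+ not the exact ExTRA estimator of Marty et al.): a probabilistic source-vs-target
+ discriminator yields weights of the exponential-tilt form w(x) = exp(theta . x + b), which
+ can then be used as sample weights when training on the labeled source data.
+
+   import numpy as np
+   from sklearn.linear_model import LogisticRegression
+
+   def tilt_weights(X_source, X_target):
+       """Estimate w(x) ~ p_target(x) / p_source(x) with a logistic discriminator."""
+       X = np.vstack([X_source, X_target])
+       d = np.r_[np.zeros(len(X_source)), np.ones(len(X_target))]   # 0 = source, 1 = target
+       clf = LogisticRegression(max_iter=1000).fit(X, d)
+       logit = X_source @ clf.coef_.ravel() + clf.intercept_[0]
+       w = np.exp(logit) * len(X_source) / len(X_target)            # correct for class sizes
+       return w / w.mean()                                          # normalize to mean 1
+
+   # weighted training on the labeled source data:
+   # model.fit(X_source, y_source, sample_weight=tilt_weights(X_source, X_target))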
+
+
+
+
+ + ☆ U-Turn Diffusion + + +
+ We present a comprehensive examination of score-based diffusion models of AI
+for generating synthetic images. These models hinge upon a dynamic auxiliary
+time mechanism driven by stochastic differential equations, wherein the score
+function is acquired from input images. Our investigation unveils a criterion
+for evaluating the efficiency of score-based diffusion models: the power of the
+generative process depends on the ability to de-construct fast correlations
+during the reverse/de-noising phase. To improve the quality of the produced
+synthetic images, we introduce an approach coined "U-Turn Diffusion". The
+U-Turn Diffusion technique starts with the standard forward diffusion process,
+albeit with a condensed duration compared to conventional settings.
+Subsequently, we execute the standard reverse dynamics, initialized with the
+concluding configuration from the forward process. This U-Turn Diffusion
+procedure, combining forward, U-turn, and reverse processes, creates a
+synthetic image approximating an independent and identically distributed
+(i.i.d.) sample from the probability distribution implicitly described via
+input samples. To analyze relevant time scales, we employ various analytical
+tools, including auto-correlation analysis, weighted norm of the score-function
+analysis, and the Kolmogorov-Smirnov Gaussianity test. These tools guide us in
+establishing that the Kernel Intersection Distance, a metric comparing the
+quality of synthetic samples with real data samples, is minimized at the
+optimal U-turn time.
+
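+
+ The procedure itself is simple to state; a schematic sketch assuming a pretrained
+ DDPM-style reverse step `denoise_step(x_t, t)` and a noise schedule `alphas_cumprod`
+ (both hypothetical placeholders, not the authors' implementation):
+
+   import torch
+
+   @torch.no_grad()
+   def u_turn_sample(x0, denoise_step, alphas_cumprod, t_turn):
+       """Forward-diffuse a real image x0 up to step t_turn, then run the usual
+       reverse chain back to t = 0 (the 'U-turn')."""
+       a_bar = alphas_cumprod[t_turn]
+       noise = torch.randn_like(x0)
+       x = a_bar.sqrt() * x0 + (1.0 - a_bar).sqrt() * noise   # closed-form forward jump
+       for t in range(t_turn, 0, -1):                         # standard reverse dynamics
+           x = denoise_step(x, t)
+       return x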
+
+
+
+
+ + ☆ Locally Adaptive and Differentiable Regression + + +
+ Over-parameterized models like deep nets and random forests have become very +popular in machine learning. However, the natural goals of continuity and +differentiability, common in regression models, are now often ignored in modern +overparametrized, locally-adaptive models. We propose a general framework to +construct a global continuous and differentiable model based on a weighted +average of locally learned models in corresponding local regions. This model is +competitive in dealing with data with different densities or scales of function +values in different local regions. We demonstrate that when we mix kernel ridge +and polynomial regression terms in the local models, and stitch them together +continuously, we achieve faster statistical convergence in theory and improved +performance in various practical settings. + +
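+
+ A minimal sketch of the blending idea (illustrative only; the paper's local models mix
+ kernel ridge and polynomial terms, whereas here any fitted regressors are assumed): local
+ predictions are combined with smooth Gaussian weights so that the global model stays
+ continuous and differentiable in the input.
+
+   import numpy as np
+
+   def blended_predict(x, centers, local_models, bandwidth=1.0):
+       """Global prediction as a smooth weighted average of local models.
+       `local_models[j].predict` is any regressor fitted on the j-th local region."""
+       d2 = ((x[None, :] - centers) ** 2).sum(axis=1)   # squared distance to each region center
+       w = np.exp(-d2 / (2 * bandwidth ** 2))            # smooth, strictly positive weights
+       w /= w.sum()
+       preds = np.array([m.predict(x[None, :])[0] for m in local_models])
+       return float(w @ preds)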
+
+
+
+
+ + ☆ Text Injection for Capitalization and Turn-Taking Prediction in Speech + Models + + +
+ Text injection for automatic speech recognition (ASR), wherein unpaired +text-only data is used to supplement paired audio-text data, has shown +promising improvements for word error rate. This study examines the use of text +injection for auxiliary tasks, which are the non-ASR tasks often performed by +an E2E model. In this work, we use joint end-to-end and internal language model +training (JEIT) as our text injection algorithm to train an ASR model which +performs two auxiliary tasks. The first is capitalization, which is a +de-normalization task. The second is turn-taking prediction, which attempts to +identify whether a user has completed their conversation turn in a digital +assistant interaction. We show results demonstrating that our text injection +method boosts capitalization performance for long-tail data, and improves +turn-taking detection recall. + +
+
+
+
+
+ + ☆ DISBELIEVE: Distance Between Client Models is Very Essential for + Effective Local Model Poisoning Attacks MICCAI 2023 + + +
+ Federated learning is a promising direction to tackle the privacy issues +related to sharing patients' sensitive data. Often, federated systems in the +medical image analysis domain assume that the participating local clients are +\textit{honest}. Several studies report mechanisms through which a set of +malicious clients can be introduced that can poison the federated setup, +hampering the performance of the global model. To overcome this, robust +aggregation methods have been proposed that defend against those attacks. We +observe that most of the state-of-the-art robust aggregation methods are +heavily dependent on the distance between the parameters or gradients of +malicious clients and benign clients, which makes them prone to local model +poisoning attacks when the parameters or gradients of malicious and benign +clients are close. Leveraging this, we introduce DISBELIEVE, a local model +poisoning attack that creates malicious parameters or gradients such that their +distance to benign clients' parameters or gradients is low respectively but at +the same time their adverse effect on the global model's performance is high. +Experiments on three publicly available medical image datasets demonstrate the +efficacy of the proposed DISBELIEVE attack as it significantly lowers the +performance of the state-of-the-art \textit{robust aggregation} methods for +medical image analysis. Furthermore, compared to state-of-the-art local model +poisoning attacks, DISBELIEVE attack is also effective on natural images where +we observe a severe drop in classification performance of the global model for +multi-class classification on benchmark dataset CIFAR-10. + +
+
+ comment: Accepted by MICCAI 2023 - DeCaF +
+
+
+
+
+ + ♻ ☆ G-MATT: Single-step Retrosynthesis Prediction using Molecular Grammar + Tree Transformer + + +
+ Various template-based and template-free approaches have been proposed for +single-step retrosynthesis prediction in recent years. While these approaches +demonstrate strong performance from a data-driven metrics standpoint, many +model architectures do not incorporate underlying chemistry principles. Here, +we propose a novel chemistry-aware retrosynthesis prediction framework that +combines powerful data-driven models with prior domain knowledge. We present a +tree-to-sequence transformer architecture that utilizes hierarchical SMILES +grammar-based trees, incorporating crucial chemistry information that is often +overlooked by SMILES text-based representations, such as local structures and +functional groups. The proposed framework, grammar-based molecular attention +tree transformer (G-MATT), achieves significant performance improvements +compared to baseline retrosynthesis models. G-MATT achieves a promising top-1 +accuracy of 51% (top-10 accuracy of 79.1%), invalid rate of 1.5%, and bioactive +similarity rate of 74.8% on the USPTO- 50K dataset. Additional analyses of +G-MATT attention maps demonstrate the ability to retain chemistry knowledge +without relying on excessively complex model architectures. + +
+
+
+
+
+ + ♻ ☆ On the Sublinear Regret of GP-UCB + + +
+ In the kernelized bandit problem, a learner aims to sequentially compute the +optimum of a function lying in a reproducing kernel Hilbert space given only +noisy evaluations at sequentially chosen points. In particular, the learner +aims to minimize regret, which is a measure of the suboptimality of the choices +made. Arguably the most popular algorithm is the Gaussian Process Upper +Confidence Bound (GP-UCB) algorithm, which involves acting based on a simple +linear estimator of the unknown function. Despite its popularity, existing +analyses of GP-UCB give a suboptimal regret rate, which fails to be sublinear +for many commonly used kernels such as the Mat\'ern kernel. This has led to a +longstanding open question: are existing regret analyses for GP-UCB tight, or +can bounds be improved by using more sophisticated analytical techniques? In +this work, we resolve this open question and show that GP-UCB enjoys nearly +optimal regret. In particular, our results yield sublinear regret rates for the +Mat\'ern kernel, improving over the state-of-the-art analyses and partially +resolving a COLT open problem posed by Vakili et al. Our improvements rely on a +key technical contribution -- regularizing kernel ridge estimators in +proportion to the smoothness of the underlying kernel $k$. Applying this key +idea together with a largely overlooked concentration result in separable +Hilbert spaces (for which we provide an independent, simplified derivation), we +are able to provide a tighter analysis of the GP-UCB algorithm. + +
+
+ comment: 20 pages, 0 figures +
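+
+ For readers unfamiliar with the algorithm being analyzed, a textbook GP-UCB loop over a
+ finite candidate set looks roughly as follows (an illustrative scikit-learn sketch, unrelated
+ to the paper's smoothness-dependent regularization; `f` is assumed to return a noisy scalar
+ observation and `candidates` an (n, d) array):
+
+   import numpy as np
+   from sklearn.gaussian_process import GaussianProcessRegressor
+   from sklearn.gaussian_process.kernels import Matern
+
+   def gp_ucb(f, candidates, T=50, beta=2.0):
+       """At each round pick argmax of mean + sqrt(beta) * std, observe, refit."""
+       gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), alpha=1e-2, normalize_y=True)
+       X, y = [candidates[0]], [f(candidates[0])]        # arbitrary first query
+       for _ in range(T - 1):
+           gp.fit(np.array(X), np.array(y))
+           mu, sd = gp.predict(candidates, return_std=True)
+           x = candidates[np.argmax(mu + np.sqrt(beta) * sd)]
+           X.append(x)
+           y.append(f(x))
+       return X, y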
+
+
+
+
+ + ♻ ☆ Evaluating the Impact of Social Determinants on Health Prediction in the + Intensive Care Unit + + +
+ Social determinants of health (SDOH) -- the conditions in which people live, +grow, and age -- play a crucial role in a person's health and well-being. There +is a large, compelling body of evidence in population health studies showing +that a wide range of SDOH is strongly correlated with health outcomes. Yet, a +majority of the risk prediction models based on electronic health records (EHR) +do not incorporate a comprehensive set of SDOH features as they are often noisy +or simply unavailable. Our work links a publicly available EHR database, +MIMIC-IV, to well-documented SDOH features. We investigate the impact of such +features on common EHR prediction tasks across different patient populations. +We find that community-level SDOH features do not improve model performance for +a general patient population, but can improve data-limited model fairness for +specific subpopulations. We also demonstrate that SDOH features are vital for +conducting thorough audits of algorithmic biases beyond protective attributes. +We hope the new integrated EHR-SDOH database will enable studies on the +relationship between community health and individual outcomes and provide new +benchmarks to study algorithmic biases beyond race, gender, and age. + +
+
+
+
+
+ + ♻ ☆ Source-free Domain Adaptive Human Pose Estimation ICCV 2023 + + +
+ Human Pose Estimation (HPE) is widely used in various fields, including +motion analysis, healthcare, and virtual reality. However, the great expenses +of labeled real-world datasets present a significant challenge for HPE. To +overcome this, one approach is to train HPE models on synthetic datasets and +then perform domain adaptation (DA) on real-world data. Unfortunately, existing +DA methods for HPE neglect data privacy and security by using both source and +target data in the adaptation process. To this end, we propose a new task, +named source-free domain adaptive HPE, which aims to address the challenges of +cross-domain learning of HPE without access to source data during the +adaptation process. We further propose a novel framework that consists of three +models: source model, intermediate model, and target model, which explores the +task from both source-protect and target-relevant perspectives. The +source-protect module preserves source information more effectively while +resisting noise, and the target-relevant module reduces the sparsity of spatial +representations by building a novel spatial probability space, and +pose-specific contrastive learning and information maximization are proposed on +the basis of this space. Comprehensive experiments on several domain adaptive +HPE benchmarks show that the proposed method outperforms existing approaches by +a considerable margin. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ The Future of Fundamental Science Led by Generative Closed-Loop + Artificial Intelligence + + +
+ Recent advances in machine learning and AI, including Generative AI and LLMs, +are disrupting technological innovation, product development, and society as a +whole. AI's contribution to technology can come from multiple approaches that +require access to large training data sets and clear performance evaluation +criteria, ranging from pattern recognition and classification to generative +models. Yet, AI has contributed less to fundamental science in part because +large data sets of high-quality data for scientific practice and model +discovery are more difficult to access. Generative AI, in general, and Large +Language Models in particular, may represent an opportunity to augment and +accelerate the scientific discovery of fundamental deep science with +quantitative models. Here we explore and investigate aspects of an AI-driven, +automated, closed-loop approach to scientific discovery, including self-driven +hypothesis generation and open-ended autonomous exploration of the hypothesis +space. Integrating AI-driven automation into the practice of science would +mitigate current problems, including the replication of findings, systematic +production of data, and ultimately democratisation of the scientific process. +Realising these possibilities requires a vision for augmented AI coupled with a +diversity of AI approaches able to deal with fundamental aspects of causality +analysis and model discovery while enabling unbiased search across the space of +putative explanations. These advances hold the promise to unleash AI's +potential for searching and discovering the fundamental structure of our world +beyond what human scientists have been able to achieve. Such a vision would +push the boundaries of new fundamental science rather than automatize current +workflows and instead open doors for technological innovation to tackle some of +the greatest challenges facing humanity today. + +
+
+ comment: 35 pages, first draft of the final report from the Alan Turing + Institute on AI for Scientific Discovery +
+
+
+
+
+ + ♻ ☆ Deconfounded Causal Collaborative Filtering + + +
+ Recommender systems may be confounded by various types of confounding factors +(also called confounders) that may lead to inaccurate recommendations and +sacrificed recommendation performance. Current approaches to solving the +problem usually design each specific model for each specific confounder. +However, real-world systems may include a huge number of confounders and thus +designing each specific model for each specific confounder could be +unrealistic. More importantly, except for those ``explicit confounders'' that +experts can manually identify and process such as item's position in the +ranking list, there are also many ``latent confounders'' that are beyond the +imagination of experts. For example, users' rating on a song may depend on +their current mood or the current weather, and users' preference on ice creams +may depend on the air temperature. Such latent confounders may be unobservable +in the recorded training data. To solve the problem, we propose Deconfounded +Causal Collaborative Filtering (DCCF). We first frame user behaviors with +unobserved confounders into a causal graph, and then we design a front-door +adjustment model carefully fused with machine learning to deconfound the +influence of unobserved confounders. Experiments on real-world datasets show +that our method is able to deconfound unobserved confounders to achieve better +recommendation performance. + +
+
+ comment: Accepted by the ACM Transactions on Recommender Systems (TORS) +
+
+
+
+
+ + ♻ ☆ Causal Collaborative Filtering SIGIR + + +
+ Many of the traditional recommendation algorithms are designed based on the +fundamental idea of mining or learning correlative patterns from data to +estimate the user-item correlative preference. However, pure correlative +learning may lead to Simpson's paradox in predictions, and thus results in +sacrificed recommendation performance. Simpson's paradox is a well-known +statistical phenomenon, which causes confusions in statistical conclusions and +ignoring the paradox may result in inaccurate decisions. Fortunately, causal +and counterfactual modeling can help us to think outside of the observational +data for user modeling and personalization so as to tackle such issues. In this +paper, we propose Causal Collaborative Filtering (CCF) -- a general framework +for modeling causality in collaborative filtering and recommendation. We +provide a unified causal view of CF and mathematically show that many of the +traditional CF algorithms are actually special cases of CCF under simplified +causal graphs. We then propose a conditional intervention approach for +$do$-operations so that we can estimate the user-item causal preference based +on the observational data. Finally, we further propose a general counterfactual +constrained learning framework for estimating the user-item preferences. +Experiments are conducted on two types of real-world datasets -- traditional +and randomized trial data -- and results show that our framework can improve +the recommendation performance and reduce the Simpson's paradox problem of many +CF algorithms. + +
+
+ comment: Accepted by the 2023 ACM SIGIR International Conference on Theory of + Information Retrieval +
+
+
+
+
+ + ♻ ☆ Non-Asymptotic Pointwise and Worst-Case Bounds for Classical Spectrum + Estimators + + +
+ Spectrum estimation is a fundamental methodology in the analysis of
+time-series data, with applications including medicine, speech analysis, and
+control design. The asymptotic theory of spectrum estimation is
+well-understood, but the theory is limited when the number of samples is fixed
+and finite. This paper gives non-asymptotic error bounds for a broad class of
+spectral estimators, both pointwise (at specific frequencies) and in the worst
+case over all frequencies. The general method is used to derive error bounds
+for the classical Blackman-Tukey, Bartlett, and Welch estimators. In
+particular, these are the first non-asymptotic error bounds for the Bartlett and
+Welch estimators.
+
+
+ comment: 15 pages, 3 figures, under review in IEEE Transactions on Signal + Processing +
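+
+ The estimators covered by these bounds are the standard segment-averaging ones; a quick
+ illustrative sketch of how Welch and Bartlett estimates are computed in practice with SciPy
+ (signal and window parameters are arbitrary):
+
+   import numpy as np
+   from scipy import signal
+
+   rng = np.random.default_rng(0)
+   fs = 1000.0
+   t = np.arange(0, 10, 1 / fs)
+   x = np.sin(2 * np.pi * 50 * t) + 0.5 * rng.standard_normal(t.size)  # 50 Hz tone + noise
+
+   # Welch: averaged periodograms over overlapping windowed segments
+   f_w, Pxx_w = signal.welch(x, fs=fs, nperseg=1024, noverlap=512)
+
+   # Bartlett: the special case with non-overlapping rectangular segments
+   f_b, Pxx_b = signal.welch(x, fs=fs, window="boxcar", nperseg=1024, noverlap=0)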
+
+
+
+
+ + ♻ ☆ Fairness in Machine Learning meets with Equity in Healthcare AAAI + + +
+ With the growing utilization of machine learning in healthcare, there is +increasing potential to enhance healthcare outcomes. However, this also brings +the risk of perpetuating biases in data and model design that can harm certain +demographic groups based on factors such as age, gender, and race. This study +proposes an artificial intelligence framework, grounded in software engineering +principles, for identifying and mitigating biases in data and models while +ensuring fairness in healthcare settings. A case study is presented to +demonstrate how systematic biases in data can lead to amplified biases in model +predictions, and machine learning methods are suggested to prevent such biases. +Future research aims to test and validate the proposed ML framework in +real-world clinical settings to evaluate its impact on promoting health equity. + +
+
+ comment: Accepted in Association for the Advancement of Artificial + Intelligence (AAAI) 2023 , Responsible Medical AI, Design, and + Operationalization Symposium +
+
+
+
+
+ + ♻ ☆ Efficient Rate Optimal Regret for Adversarial Contextual MDPs Using + Online Function Approximation + + +
+ We present the OMG-CMDP! algorithm for regret minimization in adversarial +Contextual MDPs. The algorithm operates under the minimal assumptions of +realizable function class and access to online least squares and log loss +regression oracles. Our algorithm is efficient (assuming efficient online +regression oracles), simple and robust to approximation errors. It enjoys an +$\widetilde{O}(H^{2.5} \sqrt{ T|S||A| ( \mathcal{R}(\mathcal{O}) + H +\log(\delta^{-1}) )})$ regret guarantee, with $T$ being the number of episodes, +$S$ the state space, $A$ the action space, $H$ the horizon and +$\mathcal{R}(\mathcal{O}) = \mathcal{R}(\mathcal{O}_{\mathrm{sq}}^\mathcal{F}) ++ \mathcal{R}(\mathcal{O}_{\mathrm{log}}^\mathcal{P})$ is the sum of the +regression oracles' regret, used to approximate the context-dependent rewards +and dynamics, respectively. To the best of our knowledge, our algorithm is the +first efficient rate optimal regret minimization algorithm for adversarial +CMDPs that operates under the minimal standard assumption of online function +approximation. + +
+
+
+
+
+ + ♻ ☆ Hybrid quantum-classical machine learning for generative chemistry and + drug design + + +
+ Deep generative chemistry models emerge as powerful tools to expedite drug +discovery. However, the immense size and complexity of the structural space of +all possible drug-like molecules pose significant obstacles, which could be +overcome with hybrid architectures combining quantum computers with deep +classical networks. As the first step toward this goal, we built a compact +discrete variational autoencoder (DVAE) with a Restricted Boltzmann Machine +(RBM) of reduced size in its latent layer. The size of the proposed model was +small enough to fit on a state-of-the-art D-Wave quantum annealer and allowed +training on a subset of the ChEMBL dataset of biologically active compounds. +Finally, we generated 2331 novel chemical structures with medicinal chemistry +and synthetic accessibility properties in the ranges typical for molecules from +ChEMBL. The presented results demonstrate the feasibility of using already +existing or soon-to-be-available quantum computing devices as testbeds for +future drug discovery applications. + +
+
+ comment: 8 pages. 3 figures, 1 table +
+
+
+
+
+ + ♻ ☆ MADiff: Offline Multi-agent Learning with Diffusion Models + + +
+ Diffusion model (DM), as a powerful generative model, recently achieved huge +success in various scenarios including offline reinforcement learning, where +the policy learns to conduct planning by generating trajectory in the online +evaluation. However, despite the effectiveness shown for single-agent learning, +it remains unclear how DMs can operate in multi-agent problems, where agents +can hardly complete teamwork without good coordination by independently +modeling each agent's trajectories. In this paper, we propose MADiff, a novel +generative multi-agent learning framework to tackle this problem. MADiff is +realized with an attention-based diffusion model to model the complex +coordination among behaviors of multiple diffusion agents. To the best of our +knowledge, MADiff is the first diffusion-based multi-agent offline RL +framework, which behaves as both a decentralized policy and a centralized +controller, which includes opponent modeling and can be used for multi-agent +trajectory prediction. MADiff takes advantage of the powerful generative +ability of diffusion while well-suited in modeling complex multi-agent +interactions. Our experiments show the superior performance of MADiff compared +to baseline algorithms in a range of multi-agent learning tasks. + +
+
+ comment: 17 pages, 7 figures, 4 tables. The first two authors contributed + equally to the work +
+
+
+
+
+ + ♻ ☆ A Counterfactual Safety Margin Perspective on the Scoring of Autonomous + Vehicles' Riskiness + + +
+ Autonomous Vehicles (AVs) have the potential to provide numerous societal +benefits, such as decreased road accidents and increased overall transportation +efficiency. However, quantifying the risk associated with AVs is challenging +due to the lack of historical data and the rapidly evolving technology. This +paper presents a data-driven framework for comparing the risk of different AVs' +behaviors in various operational design domains (ODDs), based on counterfactual +simulations of "misbehaving" road users. We introduce the concept of +counterfactual safety margin, which represents the minimum deviation from +normal behavior that could lead to a collision. This concept helps to find the +most critical scenarios but also to assess the frequency and severity of risk +of AVs. We show that the proposed methodology is applicable even when the AV's +behavioral policy is unknown -- through worst- and best-case analyses -- making +the method useful also to external third-party risk assessors. Our experimental +results demonstrate the correlation between the safety margin, the driving +policy quality, and the ODD shedding light on the relative risk associated with +different AV providers. This work contributes to AV safety assessment and aids +in addressing legislative and insurance concerns surrounding this emerging +technology. + +
+
+ comment: updated affiliations +
+
+
+
+
+ + ♻ ☆ Can We Transfer Noise Patterns? A Multi-environment Spectrum Analysis + Model Using Generated Cases + + +
+ Spectrum analysis systems in online water quality testing are designed to +detect types and concentrations of pollutants and enable regulatory agencies to +respond promptly to pollution incidents. However, spectral data-based testing +devices suffer from complex noise patterns when deployed in non-laboratory +environments. To make the analysis model applicable to more environments, we +propose a noise patterns transferring model, which takes the spectrum of +standard water samples in different environments as cases and learns the +differences in their noise patterns, thus enabling noise patterns to transfer +to unknown samples. Unfortunately, the inevitable sample-level baseline noise +makes the model unable to obtain the paired data that only differ in +dataset-level environmental noise. To address the problem, we generate a +sample-to-sample case-base to exclude the interference of sample-level noise on +dataset-level noise learning, enhancing the system's learning performance. +Experiments on spectral data with different background noises demonstrate the +good noise-transferring ability of the proposed method against baseline systems +ranging from wavelet denoising, deep neural networks, and generative models. +From this research, we posit that our method can enhance the performance of DL +models by generating high-quality cases. The source code is made publicly +available online at https://github.com/Magnomic/CNST. + +
+
+
+
+
+ + ♻ ☆ Using Automated Algorithm Configuration for Parameter Control + + +
+ Dynamic Algorithm Configuration (DAC) tackles the question of how to +automatically learn policies to control parameters of algorithms in a +data-driven fashion. This question has received considerable attention from the +evolutionary community in recent years. Having a good benchmark collection to +gain structural understanding on the effectiveness and limitations of different +solution methods for DAC is therefore strongly desirable. Following recent work +on proposing DAC benchmarks with well-understood theoretical properties and +ground truth information, in this work, we suggest as a new DAC benchmark the +controlling of the key parameter $\lambda$ in the +$(1+(\lambda,\lambda))$~Genetic Algorithm for solving OneMax problems. We +conduct a study on how to solve the DAC problem via the use of (static) +automated algorithm configuration on the benchmark, and propose techniques to +significantly improve the performance of the approach. Our approach is able to +consistently outperform the default parameter control policy of the benchmark +derived from previous theoretical work on sufficiently large problem sizes. We +also present new findings on the landscape of the parameter-control search +policies and propose methods to compute stronger baselines for the benchmark +via numerical approximations of the true optimal policies. + +
+
+ comment: To appear in the Proc. of the ACM/SIGEVO Conference on Foundations of + Genetic Algorithms (FOGA XVII) +
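+
+ For context, a plain static-parameter version of the benchmark algorithm on OneMax is
+ sketched below (illustrative only; the DAC benchmark is precisely about replacing the fixed
+ `lam` with a learned online control policy):
+
+   import numpy as np
+
+   def onemax(x):
+       return int(x.sum())
+
+   def one_plus_ll_ga(n=200, lam=8, max_evals=100_000, rng=np.random.default_rng(0)):
+       """Static-parameter (1+(lambda,lambda)) GA on OneMax."""
+       x = rng.integers(0, 2, n)
+       fx, evals = onemax(x), 1
+       p, c = lam / n, 1.0 / lam
+       while fx < n and evals < max_evals:
+           # mutation phase: flip the same number ell of random bits in each of lam mutants
+           ell = rng.binomial(n, p)
+           mutants = []
+           for _ in range(lam):
+               m = x.copy()
+               idx = rng.choice(n, size=ell, replace=False)
+               m[idx] ^= 1
+               mutants.append(m)
+           xp = max(mutants, key=onemax)
+           # crossover phase: take each bit from the best mutant with probability c
+           offspring = [np.where(rng.random(n) < c, xp, x) for _ in range(lam)]
+           y = max(offspring, key=onemax)
+           evals += 2 * lam
+           if onemax(y) >= fx:
+               x, fx = y, onemax(y)
+       return evals, fx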
+
+
+
+
+ + ♻ ☆ Unlearnable Examples Give a False Sense of Security: Piercing through + Unexploitable Data with Learnable Examples + + +
+ Safeguarding data from unauthorized exploitation is vital for privacy and
+security, especially given the recent surge of research on security breaches such
+as adversarial/membership attacks. To this end, \textit{unlearnable examples}
+(UEs) have been recently proposed as a compelling protection, by adding
+imperceptible perturbation to data so that models trained on them cannot
+classify them accurately on the original clean distribution. Unfortunately, we
+find UEs provide a false sense of security, because they cannot stop unauthorized
+users from utilizing other unprotected data to remove the protection, by
+turning unlearnable data into learnable again. Motivated by this observation,
+we formally define a new threat by introducing \textit{learnable unauthorized
+examples} (LEs), which are UEs with their protection removed. The core of this
+approach is a novel purification process that projects UEs onto the manifold of
+LEs. This is realized by a new joint-conditional diffusion model which denoises
+UEs conditioned on the pixel and perceptual similarity between UEs and LEs.
+Extensive experiments demonstrate that LE delivers state-of-the-art countering
+performance against both supervised UEs and unsupervised UEs in various
+scenarios, making it the first generalizable countermeasure to UEs across
+supervised learning and unsupervised learning. Our code is available at
+\url{https://github.com/jiangw-0/LE_JCDP}.
+
+
+
+
+
+ + ♻ ☆ Multiscale Attention via Wavelet Neural Operators for Vision + Transformers + + +
+ Transformers have achieved widespread success in computer vision. At their +heart, there is a Self-Attention (SA) mechanism, an inductive bias that +associates each token in the input with every other token through a weighted +basis. The standard SA mechanism has quadratic complexity with the sequence +length, which impedes its utility to long sequences appearing in high +resolution vision. Recently, inspired by operator learning for PDEs, Adaptive +Fourier Neural Operators (AFNO) were introduced for high resolution attention +based on global convolution that is efficiently implemented via FFT. However, +the AFNO global filtering cannot well represent small and moderate scale +structures that commonly appear in natural images. To leverage the +coarse-to-fine scale structures we introduce a Multiscale Wavelet Attention +(MWA) by leveraging wavelet neural operators which incurs linear complexity in +the sequence size. We replace the attention in ViT with MWA and our experiments +with CIFAR and Tiny-ImageNet classification demonstrate significant improvement +over alternative Fourier-based attentions such as AFNO and Global Filter +Network (GFN). + +
+
+
+
+
+ + ♻ ☆ Leveraged Matrix Completion with Noise + + +
+ Completing low-rank matrices from subsampled measurements has received much
+attention in the past decade. Existing works indicate that
+$\mathcal{O}(nr\log^2(n))$ observations are required to theoretically secure the
+completion of an $n \times n$ noisy matrix of rank $r$ with high probability,
+under some quite restrictive assumptions: (1) the underlying matrix must be
+incoherent; (2) observations follow the uniform distribution. The
+restrictiveness is partially due to ignoring the roles of the leverage score
+and the oracle information of each element. In this paper, we employ the
+leverage scores to characterize the importance of each element and
+significantly relax the assumptions to: (1) no other structural assumptions are
+imposed on the underlying low-rank matrix; (2) elements being observed are
+appropriately dependent on their importance via the leverage score. Under these
+assumptions, instead of uniform sampling, we devise a non-uniform/biased
+sampling procedure that can reveal the ``importance'' of each observed element.
+Our proofs are supported by a novel approach that phrases sufficient optimality
+conditions based on the Golfing Scheme, which would be of independent interest
+to wider areas. Theoretical findings show that we can provably recover an
+unknown $n\times n$ matrix of rank $r$ from just about $\mathcal{O}(nr\log^2
+(n))$ entries, even when the observed entries are corrupted with a small amount
+of noisy information. The empirical results align precisely with our theories.
+
+
+ comment: This manuscript has been accepted for publication as a regular paper + in the IEEE Transactions on Cybernetics +
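+
+ A small illustrative sketch of the leverage-score idea (not the paper's estimator or proof
+ technique): compute row/column leverage scores from a truncated SVD and sample entries with
+ probability driven by their combined leverage.
+
+   import numpy as np
+
+   def leverage_sampling_mask(M, r, m, rng=np.random.default_rng(0)):
+       """Sample m entries of an (approximately) rank-r matrix M, biased by leverage scores."""
+       U, _, Vt = np.linalg.svd(M, full_matrices=False)
+       mu = (U[:, :r] ** 2).sum(axis=1)        # row leverage scores (sum to r)
+       nu = (Vt[:r, :] ** 2).sum(axis=0)       # column leverage scores (sum to r)
+       p = mu[:, None] + nu[None, :]           # importance of entry (i, j)
+       p = p / p.sum()
+       idx = rng.choice(M.size, size=m, replace=False, p=p.ravel())
+       mask = np.zeros(M.size, dtype=bool)
+       mask[idx] = True
+       return mask.reshape(M.shape)            # observed-entry pattern for completion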
+
+
+
+
+ + ♻ ☆ Miipher: A Robust Speech Restoration Model Integrating Self-Supervised + Speech and Text Representations SP + + +
+ Speech restoration (SR) is a task of converting degraded speech signals into
+high-quality ones. In this study, we propose a robust SR model called Miipher,
+and apply Miipher to a new SR application: increasing the amount of
+high-quality training data for speech generation by converting speech samples
+collected from the Web to studio-quality. To make our SR model robust against
+various degradation, we use (i) a speech representation extracted from w2v-BERT
+for the input feature, and (ii) a text representation extracted from
+transcripts via PnG-BERT as a linguistic conditioning feature. Experiments show
+that Miipher (i) is robust against various forms of audio degradation and (ii)
+enables us to train a high-quality text-to-speech (TTS) model from restored
+speech samples collected from the Web. Audio samples are available at our demo
+page: google.github.io/df-conformer/miipher/
+
+
+ comment: Accepted to WASPAA 2023 +
+
+
+
+
+ + ♻ ☆ Expediting Neural Network Verification via Network Reduction + + +
+ A wide range of verification methods have been proposed to verify the safety
+properties of deep neural networks, ensuring that the networks function
+correctly in critical applications. However, many well-known verification tools
+still struggle with complicated network architectures and large network sizes.
+In this work, we propose a network reduction technique as a pre-processing
+method prior to verification. The proposed method reduces neural networks by
+eliminating stable ReLU neurons and transforming them into a sequential neural
+network consisting of ReLU and Affine layers, which can be handled by most
+verification tools. We instantiate the reduction technique on state-of-the-art
+complete and incomplete verification tools, including
+alpha-beta-crown, VeriNet and PRIMA. Our experiments on a large set of
+benchmarks indicate that the proposed technique can significantly reduce neural
+networks and speed up existing verification tools. Furthermore, the experimental
+results also show that network reduction can improve the availability of
+existing verification tools on many networks by reducing them into sequential
+neural networks.
+
+
+
+
+
+ + ♻ ☆ Graph Neural Networks Provably Benefit from Structural Information: A + Feature Learning Perspective + + +
+ Graph neural networks (GNNs) have pioneered advancements in graph +representation learning, exhibiting superior feature learning and performance +over multilayer perceptrons (MLPs) when handling graph inputs. However, +understanding the feature learning aspect of GNNs is still in its initial +stage. This study aims to bridge this gap by investigating the role of graph +convolution within the context of feature learning theory in neural networks +using gradient descent training. We provide a distinct characterization of +signal learning and noise memorization in two-layer graph convolutional +networks (GCNs), contrasting them with two-layer convolutional neural networks +(CNNs). Our findings reveal that graph convolution significantly augments the +benign overfitting regime over the counterpart CNNs, where signal learning +surpasses noise memorization, by approximately factor $\sqrt{D}^{q-2}$, with +$D$ denoting a node's expected degree and $q$ being the power of the ReLU +activation function where $q > 2$. These findings highlight a substantial +discrepancy between GNNs and MLPs in terms of feature learning and +generalization capacity after gradient descent training, a conclusion further +substantiated by our empirical simulations. + +
+
+ comment: 33 pages, 7 figures. We have provided a clearer roadmap +
+
+
+
+
+ + ♻ ☆ Adaptive Filters in Graph Convolutional Neural Networks + + +
+ Over the last few years, we have witnessed an increasing amount of data
+generated from non-Euclidean domains, usually represented as
+graphs with complex relationships, and Graph Neural Networks (GNN) have gained
+considerable interest because of their potential in processing graph-structured
+data. In particular, there is a strong interest in exploring the possibilities
+of performing convolution on graphs using an extension of the GNN architecture,
+generally referred to as Graph Convolutional Neural Networks (ConvGNN).
+Convolution on graphs has been achieved mainly in two forms: spectral and
+spatial convolutions. Due to the higher flexibility in exploring and exploiting
+the graph structure of data, there has recently been increasing interest in
+investigating the possibilities that the spatial approach can offer. The idea
+of adapting the network behaviour to the inputs it processes in order to
+maximize overall performance has aroused much interest in the neural
+networks literature over the years. This paper presents a novel method to adapt
+the behaviour of a ConvGNN to the input by performing spatial
+convolution on graphs using input-specific filters, which are dynamically
+generated from node feature vectors. The experimental assessment confirms the
+capabilities of the proposed approach, which achieves satisfying results using
+a low number of filters.
+
+
+ comment: This paper has been published in its final version on \textit{Pattern + Recognition} journal with DOI https://doi.org/10.1016/j.patcog.2023.109867 in + Open Access mode. Please consider it as final and peer-reviewed version +
+
+
+
+
+ + ♻ ☆ On the Trustworthiness Landscape of State-of-the-art Generative Models: + A Comprehensive Survey + + +
+ Diffusion models and large language models have emerged as leading-edge
+generative models and have sparked a revolutionary impact on various aspects of
+human life. However, the practical implementation of these models has also
+exposed inherent risks, highlighting their dual nature and raising concerns
+regarding their trustworthiness. Despite the abundance of literature on this
+subject, a comprehensive survey specifically delving into the intersection of
+large-scale generative models and their trustworthiness remains largely absent.
+To bridge this gap, this paper investigates both the long-standing and emerging
+threats associated with these models across four fundamental dimensions:
+privacy, security, fairness, and responsibility. In this way, we construct an
+extensive map outlining the trustworthiness of these models, while also
+providing practical recommendations and identifying future directions. These
+efforts are crucial for promoting the trustworthy deployment of these models,
+ultimately benefiting society as a whole.
+
+
+ comment: draft version +
+
+
+
+
+ + ♻ ☆ Semi-supervised detection of structural damage using Variational + Autoencoder and a One-Class Support Vector Machine + + +
+ In recent years, Artificial Neural Networks (ANNs) have been introduced in
+Structural Health Monitoring (SHM) systems. A semi-supervised method with a
+data-driven approach allows the ANN to be trained on data acquired from an
+undamaged structural condition to detect structural damage. In standard
+approaches, after the training stage, a decision rule is manually defined to
+detect anomalous data. However, this process could be made automatic using
+machine learning methods, whose performance is maximised using hyperparameter
+optimization techniques. The paper proposes a semi-supervised method with a
+data-driven approach to detect structural anomalies. The methodology consists
+of: (i) a Variational Autoencoder (VAE) to approximate the undamaged data
+distribution and (ii) a One-Class Support Vector Machine (OC-SVM) to
+discriminate different health conditions using damage-sensitive features
+extracted from the VAE's signal reconstruction. The method is applied to a scale
+steel structure that was tested in nine damage scenarios by the IASC-ASCE
+Structural Health Monitoring Task Group.
+
+
+
+
+
+ + ♻ ☆ Logistic-Normal Likelihoods for Heteroscedastic Label Noise + + +
+ A natural way of estimating heteroscedastic label noise in regression is to +model the observed (potentially noisy) target as a sample from a normal +distribution, whose parameters can be learned by minimizing the negative +log-likelihood. This formulation has desirable loss attenuation properties, as +it reduces the contribution of high-error examples. Intuitively, this behavior +can improve robustness against label noise by reducing overfitting. We propose +an extension of this simple and probabilistic approach to classification that +has the same desirable loss attenuation properties. Furthermore, we discuss and +address some practical challenges of this extension. We evaluate the +effectiveness of the method by measuring its robustness against label noise in +classification. We perform enlightening experiments exploring the inner +workings of the method, including sensitivity to hyperparameters, ablation +studies, and other insightful analyses. + +
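+
+ The regression formulation described in the first sentence can be sketched as follows
+ (illustrative PyTorch; the paper's contribution is the extension of this loss-attenuating
+ idea to classification, which is not shown here):
+
+   import torch
+   import torch.nn as nn
+
+   class HeteroscedasticRegressor(nn.Module):
+       """Predicts a mean and a per-example log-variance for each input."""
+       def __init__(self, d_in, hidden=64):
+           super().__init__()
+           self.body = nn.Sequential(nn.Linear(d_in, hidden), nn.ReLU())
+           self.mean = nn.Linear(hidden, 1)
+           self.log_var = nn.Linear(hidden, 1)
+
+       def forward(self, x):
+           h = self.body(x)
+           return self.mean(h).squeeze(-1), self.log_var(h).squeeze(-1)
+
+   def gaussian_nll(mean, log_var, y):
+       # 0.5 * [ (y - mu)^2 / sigma^2 + log sigma^2 ]  (constants dropped);
+       # high-variance predictions down-weight (attenuate) high-error examples.
+       return 0.5 * (((y - mean) ** 2) * torch.exp(-log_var) + log_var).mean()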
+
+
+
+
+ + ♻ ☆ Continual Learning in Predictive Autoscaling + + +
+ Predictive Autoscaling is used to forecast the workloads of servers and
+prepare resources in advance to ensure service level objectives (SLOs) in
+dynamic cloud environments. However, in practice, its prediction task often
+suffers from performance degradation under abnormal traffic caused by external
+events (such as sales promotional activities and application
+re-configurations), for which a common solution is to re-train the model with
+data from a long historical period, but at the expense of high computational and
+storage costs. To better address this problem, we propose a replay-based
+continual learning method, i.e., the Density-based Memory Selection and Hint-based
+Network Learning Model (DMSHM), using only a small part of the historical log
+to achieve accurate predictions. First, we discover the phenomenon of sample
+overlap when applying replay-based continual learning in prediction tasks. In
+order to surmount this challenge and effectively integrate the new sample
+distribution, we propose a density-based sample selection strategy that
+utilizes kernel density estimation to calculate sample density as a reference
+to compute sample weights, and employs weighted sampling to construct a new memory
+set. Then we implement hint-based network learning based on hint representations
+to optimize the parameters. Finally, we conduct experiments on public and
+industrial datasets to demonstrate that our proposed method outperforms
+state-of-the-art continual learning methods in terms of memory capacity and
+prediction accuracy. Furthermore, we demonstrate the remarkable practicability of
+DMSHM in real industrial applications.
+
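+
+ A rough sketch of density-based memory selection with a kernel density estimate
+ (illustrative only; the abstract does not specify the exact weighting rule, so the direction
+ of the weights below -- favouring low-density samples for diversity -- is an assumption):
+
+   import numpy as np
+   from sklearn.neighbors import KernelDensity
+
+   def select_memory(features, memory_size, bandwidth=0.5, rng=np.random.default_rng(0)):
+       """Score historical samples with a KDE and draw a small weighted replay memory."""
+       kde = KernelDensity(bandwidth=bandwidth).fit(features)
+       log_dens = kde.score_samples(features)
+       w = np.exp(-log_dens)                   # rarer samples -> larger weight
+       w /= w.sum()
+       idx = rng.choice(len(features), size=memory_size, replace=False, p=w)
+       return idx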
+
+
+
+
+ + ♻ ☆ A jet tagging algorithm of graph network with HaarPooling message + passing + + +
+ Recently, graph neural network (GNN) methods have been applied to solving
+problems in high energy physics (HEP) and have shown their great potential for
+quark-gluon tagging with graph representations of jet events. In this paper, we
+introduce an approach of GNNs combined with a HaarPooling operation to analyze
+the events, called the HaarPooling Message Passing neural network (HMPNet).
+In HMPNet, HaarPooling not only extracts the features of the graph, but also
+embeds additional information obtained by k-means clustering of different
+particle features. We construct HaarPooling from five different features: absolute
+energy $\log E$, transverse momentum $\log p_T$, relative coordinates
+$(\Delta\eta,\Delta\phi)$, the mixed ones $(\log E, \log p_T)$ and $(\log E,
+\log p_T, \Delta\eta,\Delta\phi)$. The results show that an appropriate
+selection of information for HaarPooling enhances the accuracy of quark-gluon
+tagging: adding extra $\log p_T$ information to the HMPNet outperforms
+all the others, whereas adding relative coordinate information
+$(\Delta\eta,\Delta\phi)$ is not very effective. This implies that adding
+effective particle features via HaarPooling achieves much better results than a
+pure message passing neural network (MPNN) alone, which demonstrates a
+significant improvement in feature extraction via the pooling process. Finally,
+we compare the HMPNet study, ordered by $p_T$, with other studies and show that
+HMPNet is also a good choice of GNN algorithm for jet tagging.
+
+
+
+
+
+ + ♻ ☆ Label-Noise Learning with Intrinsically Long-Tailed Data ICCV 2023 + + +
+ Label noise is one of the key factors that lead to the poor generalization of +deep learning models. Existing label-noise learning methods usually assume that +the ground-truth classes of the training data are balanced. However, the +real-world data is often imbalanced, leading to the inconsistency between +observed and intrinsic class distribution with label noises. In this case, it +is hard to distinguish clean samples from noisy samples on the intrinsic tail +classes with the unknown intrinsic class distribution. In this paper, we +propose a learning framework for label-noise learning with intrinsically +long-tailed data. Specifically, we propose two-stage bi-dimensional sample +selection (TABASCO) to better separate clean samples from noisy samples, +especially for the tail classes. TABASCO consists of two new separation metrics +that complement each other to compensate for the limitation of using a single +metric in sample separation. Extensive experiments on benchmarks demonstrate +the effectiveness of our method. Our code is available at +https://github.com/Wakings/TABASCO. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ On the Training Instability of Shuffling SGD with Batch Normalization ICML 2023 + + +
+ We uncover how SGD interacts with batch normalization and can exhibit +undesirable training dynamics such as divergence. More precisely, we study how +Single Shuffle (SS) and Random Reshuffle (RR) -- two widely used variants of +SGD -- interact surprisingly differently in the presence of batch +normalization: RR leads to much more stable evolution of training loss than SS. +As a concrete example, for regression using a linear network with batch +normalization, we prove that SS and RR converge to distinct global optima that +are "distorted" away from gradient descent. Thereafter, for classification we +characterize conditions under which training divergence for SS and RR can, and +cannot occur. We present explicit constructions to show how SS leads to +distorted optima in regression and divergence for classification, whereas RR +avoids both distortion and divergence. We validate our results by confirming +them empirically in realistic settings, and conclude that the separation +between SS and RR used with batch normalization is relevant in practice. + +
+
+ comment: ICML 2023 camera-ready version, added references; 75 pages +
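+
+ The two shuffling schemes under comparison differ only in when the permutation is drawn;
+ a minimal sketch:
+
+   import numpy as np
+
+   def epoch_orders(n, epochs, mode, rng=np.random.default_rng(0)):
+       """'SS' (Single Shuffle): permute the data once and reuse that order every epoch.
+       'RR' (Random Reshuffle): draw a fresh permutation at the start of every epoch."""
+       fixed = rng.permutation(n)
+       for _ in range(epochs):
+           yield fixed if mode == "SS" else rng.permutation(n)
+
+   # training loop skeleton:
+   # for order in epoch_orders(len(dataset), epochs=10, mode="RR"):
+   #     for i in order:
+   #         loss = model_loss(dataset[i]); loss.backward(); optimizer.step(); optimizer.zero_grad()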
+
+
+
+
+ + ♻ ☆ Diffusion Model in Causal Inference with Unmeasured Confounders + + +
+ We study how to extend the use of the diffusion model to answer the causal
+question from the observational data under the existence of unmeasured
+confounders. In Pearl's framework of using a Directed Acyclic Graph (DAG) to
+capture the causal intervention, a Diffusion-based Causal Model (DCM) was
+proposed incorporating the diffusion model to answer the causal questions more
+accurately, assuming that all of the confounders are observed. However,
+unmeasured confounders exist in practice, which hinders DCM from being
+applicable. To alleviate this limitation of DCM, we propose an extended model
+called Backdoor Criterion based DCM (BDCM), whose idea is rooted in the
+Backdoor criterion to find the variables in the DAG to be included in the
+decoding process of the diffusion model, so that we can extend DCM to the case
+with unmeasured confounders. Synthetic data experiments demonstrate that our
+proposed model captures the counterfactual distribution more precisely than DCM
+in the presence of unmeasured confounders.
+
+
+ comment: 6 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ A stochastic optimization approach to train non-linear neural networks + with a higher-order variation regularization + + +
+ While highly expressive parametric models including deep neural networks have +an advantage to model complicated concepts, training such highly non-linear +models is known to yield a high risk of notorious overfitting. To address this +issue, this study considers a $(k,q)$th order variation regularization +($(k,q)$-VR), which is defined as the $q$th-powered integral of the absolute +$k$th order derivative of the parametric models to be trained; penalizing the +$(k,q)$-VR is expected to yield a smoother function, which is expected to avoid +overfitting. Particularly, $(k,q)$-VR encompasses the conventional +(general-order) total variation with $q=1$. While the $(k,q)$-VR terms applied +to general parametric models are computationally intractable due to the +integration, this study provides a stochastic optimization algorithm, that can +efficiently train general models with the $(k,q)$-VR without conducting +explicit numerical integration. The proposed approach can be applied to the +training of even deep neural networks whose structure is arbitrary, as it can +be implemented by only a simple stochastic gradient descent algorithm and +automatic differentiation. Our numerical experiments demonstrate that the +neural networks trained with the $(k,q)$-VR terms are more ``resilient'' than +those with the conventional parameter regularization. The proposed algorithm +also can be extended to the physics-informed training of neural networks +(PINNs). + +
+
+ comment: 13 pages, 24 figures +
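+
+ A minimal sketch of the stochastic estimate for the lowest-order case $k=1$ (illustrative
+ PyTorch; the integral over the input domain is replaced by a Monte-Carlo average over
+ uniformly drawn points, and higher $k$ would require repeated differentiation with
+ `create_graph=True`):
+
+   import torch
+
+   def kq_vr_penalty(model, x_min, x_max, q=2, n_samples=256):
+       """Monte-Carlo estimate of a (1, q)-th order variation penalty:
+       the q-th power of the input-gradient norm averaged over random points."""
+       d = x_min.numel()
+       u = torch.rand(n_samples, d)
+       x = x_min + u * (x_max - x_min)          # uniform samples from the input box
+       x.requires_grad_(True)
+       y = model(x).sum()
+       (grad,) = torch.autograd.grad(y, x, create_graph=True)
+       return (grad.norm(dim=1) ** q).mean()
+
+   # total loss: task_loss + lam * kq_vr_penalty(model, x_min, x_max)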
+
+
+
+
+ + ♻ ☆ TARGET: Federated Class-Continual Learning via Exemplar-Free + Distillation ICCV 2023 + + +
+ This paper focuses on an under-explored yet important problem: Federated +Class-Continual Learning (FCCL), where new classes are dynamically added in +federated learning. Existing FCCL works suffer from various limitations, such +as requiring additional datasets or storing the private data from previous +tasks. In response, we first demonstrate that non-IID data exacerbates +catastrophic forgetting issue in FL. Then we propose a novel method called +TARGET (federat\textbf{T}ed cl\textbf{A}ss-continual lea\textbf{R}nin\textbf{G} +via \textbf{E}xemplar-free dis\textbf{T}illation), which alleviates +catastrophic forgetting in FCCL while preserving client data privacy. Our +proposed method leverages the previously trained global model to transfer +knowledge of old tasks to the current task at the model level. Moreover, a +generator is trained to produce synthetic data to simulate the global +distribution of data on each client at the data level. Compared to previous +FCCL methods, TARGET does not require any additional datasets or storing real +data from previous tasks, which makes it ideal for data-sensitive scenarios. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Cost-effective On-device Continual Learning over Memory Hierarchy with + Miro + + +
+ Continual learning (CL) trains NN models incrementally from a continuous +stream of tasks. To remember previously learned knowledge, prior studies store +old samples over a memory hierarchy and replay them when new tasks arrive. Edge +devices that adopt CL to preserve data privacy are typically energy-sensitive +and thus require high model accuracy while not compromising energy efficiency, +i.e., cost-effectiveness. Our work is the first to explore the design space of +hierarchical memory replay-based CL to gain insights into achieving +cost-effectiveness on edge devices. We present Miro, a novel system runtime +that carefully integrates our insights into the CL framework by enabling it to +dynamically configure the CL system based on resource states for the best +cost-effectiveness. To reach this goal, Miro also performs online profiling on +parameters with clear accuracy-energy trade-offs and adapts to optimal values +with low overhead. Extensive evaluations show that Miro significantly +outperforms baseline systems we build for comparison, consistently achieving +higher cost-effectiveness. + +
+
+ comment: This paper is to be published in the 29th Annual International + Conference on Mobile Computing and Networking (ACM MobiCom 23) +
+
+
+
+
+ + ♻ ☆ A General Implicit Framework for Fast NeRF Composition and Rendering + + +
+ A variety of Neural Radiance Fields (NeRF) methods have recently achieved remarkable success in achieving high rendering speed. However, current acceleration methods are specialized and incompatible with various implicit methods, preventing real-time composition over various types of NeRF works. Because NeRF relies on sampling along rays, it is possible to provide general guidance for acceleration. To that end, we propose a general implicit pipeline for composing NeRF objects quickly. Our method enables the casting of dynamic shadows within or between objects using analytical light sources while allowing multiple NeRF objects to be seamlessly placed and rendered together with any arbitrary rigid transformations. Mainly, our work introduces a new surface representation known as Neural Depth Fields (NeDF) that quickly determines the spatial relationship between objects by allowing direct intersection computation between rays and implicit surfaces. It leverages an intersection neural network to query NeRF for acceleration instead of depending on an explicit spatial structure. Our proposed method is the first to enable both the progressive and interactive composition of NeRF objects. Additionally, it also serves as a previewing plugin for a range of existing NeRF works.
+
+ comment: 7 pages for main content +
+
+
+
+
+ + ♻ ☆ An Embarrassingly Simple Backdoor Attack on Self-supervised Learning ICCV '23 + + +
+ As a new paradigm in machine learning, self-supervised learning (SSL) is capable of learning high-quality representations of complex data without relying on labels. In addition to eliminating the need for labeled data, research has found that SSL improves the adversarial robustness over supervised learning since lacking labels makes it more challenging for adversaries to manipulate model predictions. However, the extent to which this robustness superiority generalizes to other types of attacks remains an open question. We explore this question in the context of backdoor attacks. Specifically, we design and evaluate CTRL, an embarrassingly simple yet highly effective self-supervised backdoor attack. By only polluting a tiny fraction of training data (<= 1%) with indistinguishable poisoning samples, CTRL causes any trigger-embedded input to be misclassified to the adversary's designated class with a high probability (>= 99%) at inference time. Our findings suggest that SSL and supervised learning are comparably vulnerable to backdoor attacks. More importantly, through the lens of CTRL, we study the inherent vulnerability of SSL to backdoor attacks. With both empirical and analytical evidence, we reveal that the representation invariance property of SSL, which benefits adversarial robustness, may also be the very reason making SSL highly susceptible to backdoor attacks. Our findings also imply that the existing defenses against supervised backdoor attacks are not easily retrofitted to the unique vulnerability of SSL.
+
+ comment: The 2023 International Conference on Computer Vision (ICCV '23) +
+
+
+
+
+ + ♻ ☆ Fisher-Rao distance and pullback SPD cone distances between multivariate + normal distributions + + +
+ Data sets of multivariate normal distributions abound in many scientific areas like diffusion tensor imaging, structure tensor computer vision, radar signal processing, and machine learning, just to name a few. In order to process those normal data sets for downstream tasks like filtering, classification or clustering, one needs to define proper notions of dissimilarities between normals and paths joining them. The Fisher-Rao distance, defined as the Riemannian geodesic distance induced by the Fisher information metric, is such a principled metric distance, which, however, is not known in closed form except for a few particular cases. In this work, we first report a fast and robust method to approximate arbitrarily finely the Fisher-Rao distance between multivariate normal distributions. Second, we introduce a class of distances based on diffeomorphic embeddings of the normal manifold into a submanifold of the higher-dimensional symmetric positive-definite cone corresponding to the manifold of centered normal distributions. We show that the projective Hilbert distance on the cone yields a metric on the embedded normal submanifold, and we pull back that cone distance with its associated straight-line Hilbert cone geodesics to obtain a distance and smooth paths between normal distributions. Compared to the Fisher-Rao distance approximation, the pullback Hilbert cone distance is computationally light since it only requires computing the extreme minimal and maximal eigenvalues of matrices. Finally, we show how to use those distances in clustering tasks.
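The Hilbert projective distance on the SPD cone mentioned above has a simple closed form in terms of extreme generalized eigenvalues; a minimal sketch (the diffeomorphic embedding of normal distributions into the cone is not reproduced, and variable names are illustrative):

```python
import numpy as np
from scipy.linalg import eigh

def hilbert_cone_distance(P, Q):
    """Hilbert projective distance between two SPD matrices: the log-ratio
    of the extreme eigenvalues of P^{-1} Q, obtained here as generalized
    eigenvalues of the pair (Q, P)."""
    w = eigh(Q, P, eigvals_only=True)
    return float(np.log(w.max() / w.min()))

# Example with two random SPD matrices.
rng = np.random.default_rng(0)
A = rng.normal(size=(3, 3)); P = A @ A.T + 3.0 * np.eye(3)
B = rng.normal(size=(3, 3)); Q = B @ B.T + 3.0 * np.eye(3)
print(hilbert_cone_distance(P, Q))
```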
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ Agent-Controller Representations: Principled Offline RL with Rich + Exogenous Information ICML 2023 + + +
+ Learning to control an agent from data collected offline in a rich pixel-based visual observation space is vital for real-world applications of reinforcement learning (RL). A major challenge in this setting is the presence of input information that is hard to model and irrelevant to controlling the agent. This problem has been approached by the theoretical RL community through the lens of exogenous information, i.e., any control-irrelevant information contained in observations. For example, a robot navigating in busy streets needs to ignore irrelevant information, such as other people walking in the background, textures of objects, or birds in the sky. In this paper, we focus on the setting with visually detailed exogenous information, and introduce new offline RL benchmarks offering the ability to study this problem. We find that contemporary representation learning techniques can fail on datasets where the noise is a complex and time-dependent process, which is prevalent in practical applications. To address this, we propose to use multi-step inverse models, which have seen a great deal of interest in the RL theory community, to learn Agent-Controller Representations for Offline-RL (ACRO). Despite being simple and requiring no reward, we show theoretically and empirically that the representation created by this objective greatly outperforms baselines.
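A minimal sketch of the multi-step inverse idea referenced above (predicting an action from encodings of an observation and an observation several steps later); the layer sizes and names are assumptions, not the ACRO implementation:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiStepInverseModel(nn.Module):
    """Encoder trained by predicting the first action from phi(o_t) and
    phi(o_{t+k}); exogenous noise that does not help predict the action
    tends to be discarded. Reward is never used."""
    def __init__(self, obs_dim, n_actions, latent_dim=64):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(obs_dim, 128), nn.ReLU(),
                                     nn.Linear(128, latent_dim))
        self.action_head = nn.Sequential(nn.Linear(2 * latent_dim, 128), nn.ReLU(),
                                         nn.Linear(128, n_actions))

    def loss(self, obs_t, obs_tk, action_t):
        z_t, z_tk = self.encoder(obs_t), self.encoder(obs_tk)
        logits = self.action_head(torch.cat([z_t, z_tk], dim=-1))
        return F.cross_entropy(logits, action_t)   # discrete-action case
```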
+
+ comment: ICML 2023 +
+
+
+
+
+ + ♻ ☆ Adaptive Experimentation at Scale: A Computational Framework for + Flexible Batches + + +
+ Standard bandit algorithms that assume continual reallocation of measurement +effort are challenging to implement due to delayed feedback and +infrastructural/organizational difficulties. Motivated by practical instances +involving a handful of reallocation epochs in which outcomes are measured in +batches, we develop a computation-driven adaptive experimentation framework +that can flexibly handle batching. Our main observation is that normal +approximations, which are universal in statistical inference, can also guide +the design of adaptive algorithms. By deriving a Gaussian sequential +experiment, we formulate a dynamic program that can leverage prior information +on average rewards. Instead of the typical theory-driven paradigm, we leverage +computational tools and empirical benchmarking for algorithm development. In +particular, our empirical analysis highlights a simple yet effective algorithm, +Residual Horizon Optimization, which iteratively solves a planning problem +using stochastic gradient descent. Our approach significantly improves +statistical power over standard methods, even when compared to Bayesian bandit +algorithms (e.g., Thompson sampling) that require full distributional knowledge +of individual rewards. Overall, we expand the scope of adaptive experimentation +to settings that are difficult for standard methods, involving limited +adaptivity, low signal-to-noise ratio, and unknown reward distributions. + +
+
+
+
+
+ + ♻ ☆ End-to-end AI framework for interpretable prediction of molecular and + crystal properties + + +
+ We introduce an end-to-end computational framework that allows for +hyperparameter optimization using the DeepHyper library, accelerated model +training, and interpretable AI inference. The framework is based on +state-of-the-art AI models including CGCNN, PhysNet, SchNet, MPNN, +MPNN-transformer, and TorchMD-NET. We employ these AI models along with the +benchmark QM9, hMOF, and MD17 datasets to showcase how the models can predict +user-specified material properties within modern computing environments. We +demonstrate transferable applications in the modeling of small molecules, +inorganic crystals and nanoporous metal organic frameworks with a unified, +standalone framework. We have deployed and tested this framework in the +ThetaGPU supercomputer at the Argonne Leadership Computing Facility, and in the +Delta supercomputer at the National Center for Supercomputing Applications to +provide researchers with modern tools to conduct accelerated AI-driven +discovery in leadership-class computing environments. We release these digital +assets as open source scientific software in GitLab, and ready-to-use Jupyter +notebooks in Google Colab. + +
+
+ comment: 20 pages, 10 images, 6 tables; v2: accepted to Machine Learning: + Science and Technology +
+
+
+
+
+ + ♻ ☆ Robust expected improvement for Bayesian optimization + + +
+ Bayesian Optimization (BO) links Gaussian Process (GP) surrogates with +sequential design toward optimizing expensive-to-evaluate black-box functions. +Example design heuristics, or so-called acquisition functions, like expected +improvement (EI), balance exploration and exploitation to furnish global +solutions under stringent evaluation budgets. However, they fall short when +solving for robust optima, meaning a preference for solutions in a wider domain +of attraction. Robust solutions are useful when inputs are imprecisely +specified, or where a series of solutions is desired. A common mathematical +programming technique in such settings involves an adversarial objective, +biasing a local solver away from ``sharp'' troughs. Here we propose a surrogate +modeling and active learning technique called robust expected improvement (REI) +that ports adversarial methodology into the BO/GP framework. After describing +the methods, we illustrate and draw comparisons to several competitors on +benchmark synthetic exercises and real problems of varying complexity. + +
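For reference, the standard expected-improvement acquisition that REI modifies can be written in a few lines from the GP posterior; this is only the EI baseline, not the robust variant proposed in the paper:

```python
import numpy as np
from scipy.stats import norm

def expected_improvement(mu, sigma, f_best):
    """Classic EI for minimization: mu and sigma are the GP posterior mean
    and standard deviation at candidate inputs, f_best the incumbent value."""
    sigma = np.maximum(sigma, 1e-12)          # guard against zero variance
    z = (f_best - mu) / sigma
    return (f_best - mu) * norm.cdf(z) + sigma * norm.pdf(z)
```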
+
+ comment: 27 pages, 17 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Faster variational quantum algorithms with quantum kernel-based + surrogate models + + +
+ We present a new optimization method for small-to-intermediate scale variational algorithms on noisy near-term quantum processors which uses a Gaussian process surrogate model equipped with a classically-evaluated quantum kernel. Variational algorithms are typically optimized using gradient-based approaches; however, these are difficult to implement on current noisy devices, as they require large numbers of objective function evaluations. Our scheme shifts this computational burden onto the classical optimizer component of these hybrid algorithms, greatly reducing the number of queries to the quantum processor. We focus on the variational quantum eigensolver (VQE) algorithm and demonstrate numerically that such surrogate models are particularly well suited to the algorithm's objective function. Next, we apply these models to both noiseless and noisy VQE simulations and show that they exhibit better performance than widely-used classical kernels in terms of final accuracy and convergence speed. Compared to the typically-used stochastic gradient-descent approach for VQAs, our quantum kernel-based approach is found to consistently achieve significantly higher accuracy while requiring less than an order of magnitude fewer quantum circuit evaluations. We analyse the performance of the quantum kernel-based models in terms of the kernels' induced feature spaces and explicitly construct their feature maps. Finally, we describe a scheme for approximating the best-performing quantum kernel using a classically-efficient tensor network representation of its input state and so provide a pathway for scaling these methods to larger systems.
+
+
+
+
+ + ♻ ☆ Principles and Guidelines for Evaluating Social Robot Navigation + Algorithms + + +
+ A major challenge to deploying robots widely is navigation in human-populated +environments, commonly referred to as social robot navigation. While the field +of social navigation has advanced tremendously in recent years, the fair +evaluation of algorithms that tackle social navigation remains hard because it +involves not just robotic agents moving in static environments but also dynamic +human agents and their perceptions of the appropriateness of robot behavior. In +contrast, clear, repeatable, and accessible benchmarks have accelerated +progress in fields like computer vision, natural language processing and +traditional robot navigation by enabling researchers to fairly compare +algorithms, revealing limitations of existing solutions and illuminating +promising new directions. We believe the same approach can benefit social +navigation. In this paper, we pave the road towards common, widely accessible, +and repeatable benchmarking criteria to evaluate social robot navigation. Our +contributions include (a) a definition of a socially navigating robot as one +that respects the principles of safety, comfort, legibility, politeness, social +competency, agent understanding, proactivity, and responsiveness to context, +(b) guidelines for the use of metrics, development of scenarios, benchmarks, +datasets, and simulators to evaluate social navigation, and (c) a design of a +social navigation metrics framework to make it easier to compare results from +different simulators, robots and datasets. + +
+
+ comment: 42 pages, 11 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ MotionDeltaCNN: Sparse CNN Inference of Frame Differences in Moving + Camera Videos + + +
+ Convolutional neural network inference on video input is computationally +expensive and requires high memory bandwidth. Recently, DeltaCNN managed to +reduce the cost by only processing pixels with significant updates over the +previous frame. However, DeltaCNN relies on static camera input. Moving cameras +add new challenges in how to fuse newly unveiled image regions with already +processed regions efficiently to minimize the update rate - without increasing +memory overhead and without knowing the camera extrinsics of future frames. In +this work, we propose MotionDeltaCNN, a sparse CNN inference framework that +supports moving cameras. We introduce spherical buffers and padded convolutions +to enable seamless fusion of newly unveiled regions and previously processed +regions -- without increasing memory footprint. Our evaluation shows that we +outperform DeltaCNN by up to 90% for moving camera videos. + +
+
+
+
+
+ + ♻ ☆ Private Distribution Learning with Public Data: The View from Sample + Compression + + +
+ We study the problem of private distribution learning with access to public +data. In this setup, which we refer to as public-private learning, the learner +is given public and private samples drawn from an unknown distribution $p$ +belonging to a class $\mathcal Q$, with the goal of outputting an estimate of +$p$ while adhering to privacy constraints (here, pure differential privacy) +only with respect to the private samples. + We show that the public-private learnability of a class $\mathcal Q$ is +connected to the existence of a sample compression scheme for $\mathcal Q$, as +well as to an intermediate notion we refer to as list learning. Leveraging this +connection: (1) approximately recovers previous results on Gaussians over +$\mathbb R^d$; and (2) leads to new ones, including sample complexity upper +bounds for arbitrary $k$-mixtures of Gaussians over $\mathbb R^d$, results for +agnostic and distribution-shift resistant learners, as well as closure +properties for public-private learnability under taking mixtures and products +of distributions. Finally, via the connection to list learning, we show that +for Gaussians in $\mathbb R^d$, at least $d$ public samples are necessary for +private learnability, which is close to the known upper bound of $d+1$ public +samples. + +
+
+ comment: 31 pages +
+
+
+
+
+ + ♻ ☆ Nesterov Meets Optimism: Rate-Optimal Separable Minimax Optimization ICML + 2023 + + +
+ We propose a new first-order optimization algorithm -- +AcceleratedGradient-OptimisticGradient (AG-OG) Descent Ascent -- for separable +convex-concave minimax optimization. The main idea of our algorithm is to +carefully leverage the structure of the minimax problem, performing Nesterov +acceleration on the individual component and optimistic gradient on the +coupling component. Equipped with proper restarting, we show that AG-OG +achieves the optimal convergence rate (up to a constant) for a variety of +settings, including bilinearly coupled strongly convex-strongly concave minimax +optimization (bi-SC-SC), bilinearly coupled convex-strongly concave minimax +optimization (bi-C-SC), and bilinear games. We also extend our algorithm to the +stochastic setting and achieve the optimal convergence rate in both bi-SC-SC +and bi-C-SC settings. AG-OG is the first single-call algorithm with optimal +convergence rates in both deterministic and stochastic settings for bilinearly +coupled minimax optimization problems. + +
+
+ comment: 44 pages. This version matches the camera-ready that appeared at ICML + 2023 under the same title +
+
+
+
+
+ + ♻ ☆ Continual Domain Adaptation on Aerial Images under Gradually Degrading + Weather + + +
+ Domain adaptation (DA) strives to mitigate the domain gap between the source domain where a model is trained, and the target domain where the model is deployed. When a deep learning model is deployed on an aerial platform, it may face gradually degrading weather conditions during operation, leading to widening domain gaps between the training data and the encountered evaluation data. We synthesize two such gradually worsening weather conditions on real images from two existing aerial imagery datasets, generating a total of four benchmark datasets. Under the continual, or test-time adaptation setting, we evaluate three DA models on our datasets: a baseline standard DA model and two continual DA models. In such a setting, the models can access only one small portion, or one batch of the target data at a time, and adaptation takes place continually and over only one epoch of the data. The combination of the constraints of continual adaptation and gradually deteriorating weather conditions provides a practical DA scenario for aerial deployment. Among the evaluated models, we consider both convolutional and transformer architectures for comparison. We discover stability issues during adaptation for existing buffer-fed continual DA methods, and offer gradient normalization as a simple solution to curb training instability.
+
+
+
+
+ + ♻ ☆ A deep complementary energy method for solid mechanics using minimum + complementary energy principle + + +
+ In recent years, the rapid advancement of deep learning has significantly impacted various fields, particularly in solving partial differential equations (PDEs) in solid mechanics, benefiting greatly from the remarkable approximation capabilities of neural networks. In solving PDEs, Physics-Informed Neural Networks (PINNs) and the Deep Energy Method (DEM) have garnered substantial attention. The principles of minimum potential energy and minimum complementary energy are two important variational principles in solid mechanics. However, DEM is based on the principle of minimum potential energy and lacks the important form of minimum complementary energy. To bridge this gap, we propose the deep complementary energy method (DCEM) based on the principle of minimum complementary energy. The output function of DCEM is the stress function. We extend DCEM to DCEM-Plus (DCEM-P), adding terms that satisfy partial differential equations. Furthermore, we propose a deep complementary energy operator method (DCEM-O) by combining operator learning with physical equations. We train DCEM-O using existing high-fidelity numerical results and the complementary energy together. We present numerical results using the Prandtl and Airy stress functions and compare DCEM with existing PINNs and DEM when modeling representative mechanical problems. The results demonstrate that DCEM outperforms DEM in terms of stress accuracy and efficiency and has an advantage in dealing with complex displacement boundary conditions. DCEM-P and DCEM-O further enhance the accuracy and efficiency of DCEM. In summary, our proposed DCEM marks the first time that complementary energy is extended to the energy-based physics-informed neural network and provides an essential supplementary energy form to the DEM in solid mechanics, offering promising research prospects in computational mechanics.
+
+ comment: 58 pages, 30 figures +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Jurassic World Remake: Bringing Ancient Fossils Back to Life via + Zero-Shot Long Image-to-Image Translation + + +
+ With a strong understanding of the target domain from natural language, we +produce promising results in translating across large domain gaps and bringing +skeletons back to life. In this work, we use text-guided latent diffusion +models for zero-shot image-to-image translation (I2I) across large domain gaps +(longI2I), where large amounts of new visual features and new geometry need to +be generated to enter the target domain. Being able to perform translations +across large domain gaps has a wide variety of real-world applications in +criminology, astrology, environmental conservation, and paleontology. In this +work, we introduce a new task Skull2Animal for translating between skulls and +living animals. On this task, we find that unguided Generative Adversarial +Networks (GANs) are not capable of translating across large domain gaps. +Instead of these traditional I2I methods, we explore the use of guided +diffusion and image editing models and provide a new benchmark model, +Revive-2I, capable of performing zero-shot I2I via text-prompting latent +diffusion models. We find that guidance is necessary for longI2I because, to +bridge the large domain gap, prior knowledge about the target domain is needed. +In addition, we find that prompting provides the best and most scalable +information about the target domain as classifier-guided diffusion models +require retraining for specific use cases and lack stronger constraints on the +target domain because of the wide variety of images they are trained on. + +
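As a rough sense of what zero-shot, text-prompted image-to-image translation looks like in practice, the sketch below uses an off-the-shelf latent diffusion pipeline from the Hugging Face diffusers library; the model id, prompt, and strength are illustrative assumptions, and this is not the Revive-2I implementation:

```python
import torch
from PIL import Image
from diffusers import StableDiffusionImg2ImgPipeline

# Load a generic text-guided img2img pipeline (assumed model id).
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

skull = Image.open("skull.png").convert("RGB").resize((512, 512))
result = pipe(prompt="a photo of a living wolf, wildlife photography",
              image=skull, strength=0.75, guidance_scale=7.5).images[0]
result.save("animal.png")
```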
+
+ comment: 9 pages, 10 figures, ACM Multimedia 2023 +
+
+
+
+
+ + ☆ CTP: Towards Vision-Language Continual Pretraining via Compatible + Momentum Contrast and Topology Preservation ICCV 2023 + + +
+ Vision-Language Pretraining (VLP) has shown impressive results on diverse +downstream tasks by offline training on large-scale datasets. Regarding the +growing nature of real-world data, such an offline training paradigm on +ever-expanding data is unsustainable, because models lack the continual +learning ability to accumulate knowledge constantly. However, most continual +learning studies are limited to uni-modal classification and existing +multi-modal datasets cannot simulate continual non-stationary data stream +scenarios. To support the study of Vision-Language Continual Pretraining +(VLCP), we first contribute a comprehensive and unified benchmark dataset P9D +which contains over one million product image-text pairs from 9 industries. The +data from each industry as an independent task supports continual learning and +conforms to the real-world long-tail nature to simulate pretraining on web +data. We comprehensively study the characteristics and challenges of VLCP, and +propose a new algorithm: Compatible momentum contrast with Topology +Preservation, dubbed CTP. The compatible momentum model absorbs the knowledge +of the current and previous-task models to flexibly update the modal feature. +Moreover, Topology Preservation transfers the knowledge of embedding across +tasks while preserving the flexibility of feature adjustment. The experimental +results demonstrate our method not only achieves superior performance compared +with other baselines but also does not bring an expensive training burden. +Dataset and codes are available at https://github.com/KevinLight831/CTP. + +
+
+ comment: Accepted by ICCV 2023. Code: https://github.com/KevinLight831/CTP +
+
+
+
+
+ + ☆ Temporal Sentence Grounding in Streaming Videos ACM MM 2023 + + +
+ This paper aims to tackle a novel task - Temporal Sentence Grounding in +Streaming Videos (TSGSV). The goal of TSGSV is to evaluate the relevance +between a video stream and a given sentence query. Unlike regular videos, +streaming videos are acquired continuously from a particular source, and are +always desired to be processed on-the-fly in many applications such as +surveillance and live-stream analysis. Thus, TSGSV is challenging since it +requires the model to infer without future frames and process long historical +frames effectively, which is untouched in the early methods. To specifically +address the above challenges, we propose two novel methods: (1) a TwinNet +structure that enables the model to learn about upcoming events; and (2) a +language-guided feature compressor that eliminates redundant visual frames and +reinforces the frames that are relevant to the query. We conduct extensive +experiments using ActivityNet Captions, TACoS, and MAD datasets. The results +demonstrate the superiority of our proposed methods. A systematic ablation +study also confirms their effectiveness. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ VoxSnap: X-Large Speaker Verification Dataset on Camera ICASSP2023 + + +
+ In this paper, we contribute a novel and extensive dataset for speaker verification, which contains a noisy set of 38k identities/1.45M utterances (VoxSnap) and a relatively clean set of 18k identities/1.02M utterances (VoxSnap-Clean) for training. First, we collect a list of 60K+ users along with their avatars and download their short videos from YouTube. Then, an automatic pipeline is devised to extract the target user's speech segments and videos, which is efficient and scalable. To the best of our knowledge, the VoxSnap dataset is the largest speaker recognition dataset. Second, we conduct a series of experiments based on VoxSnap-Clean together with VoxCeleb2. Our findings highlight a notable improvement in performance, ranging from 15% to 30%, across different backbone architectures, upon integrating our dataset for training. The dataset will be released soon.
+
+ comment: submit to ICASSP2023 +
+
+
+
+
+ + ☆ Orthogonal Temporal Interpolation for Zero-Shot Video Recognition + + +
+ Zero-shot video recognition (ZSVR) is a task that aims to recognize video +categories that have not been seen during the model training process. Recently, +vision-language models (VLMs) pre-trained on large-scale image-text pairs have +demonstrated impressive transferability for ZSVR. To make VLMs applicable to +the video domain, existing methods often use an additional temporal learning +module after the image-level encoder to learn the temporal relationships among +video frames. Unfortunately, for video from unseen categories, we observe an +abnormal phenomenon where the model that uses spatial-temporal feature performs +much worse than the model that removes temporal learning module and uses only +spatial feature. We conjecture that improper temporal modeling on video +disrupts the spatial feature of the video. To verify our hypothesis, we propose +Feature Factorization to retain the orthogonal temporal feature of the video +and use interpolation to construct refined spatial-temporal feature. The model +using appropriately refined spatial-temporal feature performs better than the +one using only spatial feature, which verifies the effectiveness of the +orthogonal temporal feature for the ZSVR task. Therefore, an Orthogonal +Temporal Interpolation module is designed to learn a better refined +spatial-temporal video feature during training. Additionally, a Matching Loss +is introduced to improve the quality of the orthogonal temporal feature. We +propose a model called OTI for ZSVR by employing orthogonal temporal +interpolation and the matching loss based on VLMs. The ZSVR accuracies on +popular video datasets (i.e., Kinetics-600, UCF101 and HMDB51) show that OTI +outperforms the previous state-of-the-art method by a clear margin. + +
+
+
+
+
+ + ♻ ☆ SLIC: Large Receptive Field Learning with Self-Conditioned Adaptability + for Learned Image Compression + + +
+ Recently, transformers are trending as replacements for CNNs in vision tasks, including compression. This trend compels us to question the inherent limitations of CNNs compared to transformers and to explore whether CNNs can be enhanced to achieve the same or even better performance than transformers. We aim to design a pure CNN-based model for compression, as most devices are well optimized for CNNs. In our analysis, we find that the key strengths of transformers lie in their dynamic weights and large receptive fields. To enable CNNs with such properties, we propose a novel transform module with large receptive field learning and self-conditioned adaptability for learned image compression, named SLIC. Specifically, we enlarge the receptive field of depth-wise convolution with suitable complexity and generate the weights according to given conditions. In addition, we also investigate the self-conditioned factor for channels. To prove the effectiveness of our proposed transform module, we equip it with the existing entropy models ChARM, SCCTX, and SWAtten, obtaining the models SLIC-ChARM, SLIC-SCCTX, and SLIC-SWAtten. Extensive experiments demonstrate that our SLIC-ChARM, SLIC-SCCTX, and SLIC-SWAtten achieve significant improvements over corresponding baselines and reach SOTA performance with suitable complexity on 5 test datasets (Kodak, Tecnick, CLIC 20, CLIC 21, JPEGAI). Code will be available at https://github.com/JiangWeibeta/SLIC.
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ Towards Unified Text-based Person Retrieval: A Large-scale + Multi-Attribute and Language Search Benchmark + + +
+ In this paper, we introduce a large Multi-Attribute and Language Search dataset for text-based person retrieval, called MALS, and explore the feasibility of performing pre-training on both attribute recognition and image-text matching tasks within a single framework. In particular, MALS contains 1,510,330 image-text pairs, which is about 37.5 times larger than the prevailing CUHK-PEDES, and all images are annotated with 27 attributes. Considering the privacy concerns and annotation costs, we leverage off-the-shelf diffusion models to generate the dataset. To verify the feasibility of learning from the generated data, we develop a new joint Attribute Prompt Learning and Text Matching Learning (APTM) framework, considering the shared knowledge between attribute and text. As the name implies, APTM contains an attribute prompt learning stream and a text matching learning stream. (1) The attribute prompt learning leverages the attribute prompts for image-attribute alignment, which enhances the text matching learning. (2) The text matching learning facilitates the representation learning on fine-grained details, and in turn, boosts the attribute prompt learning. Extensive experiments validate the effectiveness of pre-training on MALS, achieving state-of-the-art retrieval performance via APTM on three challenging real-world benchmarks. In particular, APTM achieves a consistent improvement of +6.96%, +7.68%, and +16.95% Recall@1 accuracy on the CUHK-PEDES, ICFG-PEDES, and RSTPReid datasets, respectively.
+
+
+
+
+ + ♻ ☆ Audio is all in one: speech-driven gesture synthetics using WavLM + pre-trained model + + +
+ The generation of co-speech gestures for digital humans is an emerging area in the field of virtual human creation. Prior research has made progress by using acoustic and semantic information as input and adopting classification methods to identify the person's identity and emotion for driving co-speech gesture generation. However, this endeavour still faces significant challenges. These challenges go beyond the intricate interplay between co-speech gestures, speech acoustics, and semantics; they also encompass the complexities associated with personality, emotion, and other obscure but important factors. This paper introduces "diffmotion-v2," a speech-conditional diffusion-based and non-autoregressive transformer-based generative model with the WavLM pre-trained model. It can produce individual and stylized full-body co-speech gestures using only raw speech audio, eliminating the need for complex multimodal processing and manual annotation. First, considering that speech audio not only contains acoustic and semantic features but also conveys personality traits, emotions, and more subtle information related to accompanying gestures, we pioneer the adaptation of WavLM, a large-scale pre-trained model, to extract low-level and high-level audio information. Second, we introduce an adaptive layer norm architecture in the transformer-based layer to learn the relationship between speech information and accompanying gestures. Extensive subjective evaluation experiments are conducted on the Trinity, ZEGGS, and BEAT datasets to confirm WavLM's and the model's ability to synthesize natural co-speech gestures with various styles.
+
+ comment: 10 pages, 5 figures, 1 table +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 13 + +
+
+
+ + ☆ Diagnostic Reasoning Prompts Reveal the Potential for Large Language + Model Interpretability in Medicine + + +
+ One of the major barriers to using large language models (LLMs) in medicine +is the perception they use uninterpretable methods to make clinical decisions +that are inherently different from the cognitive processes of clinicians. In +this manuscript we develop novel diagnostic reasoning prompts to study whether +LLMs can perform clinical reasoning to accurately form a diagnosis. We find +that GPT4 can be prompted to mimic the common clinical reasoning processes of +clinicians without sacrificing diagnostic accuracy. This is significant because +an LLM that can use clinical reasoning to provide an interpretable rationale +offers physicians a means to evaluate whether LLMs can be trusted for patient +care. Novel prompting methods have the potential to expose the black box of +LLMs, bringing them one step closer to safe and effective use in medicine. + +
+
+
+
+
+ + ☆ An Ensemble Approach to Question Classification: Integrating Electra + Transformer, GloVe, and LSTM + + +
+ This paper introduces a novel ensemble approach for question classification using state-of-the-art models -- Electra, GloVe, and LSTM. The proposed model is trained and evaluated on the TREC dataset, a well-established benchmark for question classification tasks. The ensemble model combines the strengths of Electra, a transformer-based model for language understanding, GloVe, global vectors for word representation, and LSTM, a recurrent neural network variant, providing a robust and efficient solution for question classification. Extensive experiments were carried out to compare the performance of the proposed ensemble approach with other cutting-edge models, such as BERT, RoBERTa, and DistilBERT. Our results demonstrate that the ensemble model outperforms these models across all evaluation metrics, achieving an accuracy of 0.8 on the test set. These findings underscore the effectiveness of the ensemble approach in enhancing the performance of question classification tasks, and invite further exploration of ensemble methods in natural language processing.
+
+
+
+
+ + ☆ Faithful to Whom? Questioning Interpretability Measures in NLP + + +
+ A common approach to quantifying model interpretability is to calculate +faithfulness metrics based on iteratively masking input tokens and measuring +how much the predicted label changes as a result. However, we show that such +metrics are generally not suitable for comparing the interpretability of +different neural text classifiers as the response to masked inputs is highly +model-specific. We demonstrate that iterative masking can produce large +variation in faithfulness scores between comparable models, and show that +masked samples are frequently outside the distribution seen during training. We +further investigate the impact of adversarial attacks and adversarial training +on faithfulness scores, and demonstrate the relevance of faithfulness measures +for analyzing feature salience in text adversarial attacks. Our findings +provide new insights into the limitations of current faithfulness metrics and +key considerations to utilize them appropriately. + +
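To make the masking-based faithfulness setup concrete, here is a small sketch of one common score (the drop in the predicted-class probability after masking the tokens ranked most important), assuming a Hugging Face-style classifier whose output exposes `.logits`; metric details vary across papers:

```python
import torch

def probability_drop(model, input_ids, attention_mask, important_idx, mask_id):
    """Predicted-class probability drop after masking top-ranked tokens.
    Larger drops are usually read as more 'faithful' attributions."""
    model.eval()
    with torch.no_grad():
        probs = model(input_ids=input_ids,
                      attention_mask=attention_mask).logits.softmax(-1)
        pred = probs.argmax(-1)
        p_full = probs.gather(-1, pred.unsqueeze(-1)).squeeze(-1)

        masked = input_ids.clone()
        masked[:, important_idx] = mask_id      # replace important tokens
        probs_m = model(input_ids=masked,
                        attention_mask=attention_mask).logits.softmax(-1)
        p_masked = probs_m.gather(-1, pred.unsqueeze(-1)).squeeze(-1)
    return p_full - p_masked
```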
+
+
+
+
+ + ☆ Modeling the Dashboard Provenance + + +
+ Organizations of all kinds, whether public or private, profit-driven or non-profit, and across various industries and sectors, rely on dashboards for effective data visualization. However, the reliability and efficacy of these dashboards depend on the quality of the visuals and data they present. Studies show that less than a quarter of dashboards provide information about their sources, which is just one of the metadata expected when provenance is taken seriously. Provenance is a record that describes people, organizations, entities, and activities that had a role in the production, influence, or delivery of a piece of data or an object. This paper aims to provide a provenance representation model that enables standardization, modeling, generation, capture, and visualization, specifically designed for dashboards and their visual and data components. The proposed model will offer a comprehensive set of essential provenance metadata that enables users to evaluate the quality, consistency, and reliability of the information presented on dashboards. This will allow a clear and precise understanding of the context in which a specific dashboard was developed, ultimately leading to better decision-making.
+
+ comment: 8 pages, 4 figures, one table, to be published in VIS 2023 (Vis + + Prov) x Domain +
+
+
+
+
+ + ☆ Token-Scaled Logit Distillation for Ternary Weight Generative Language + Models + + +
+ Generative Language Models (GLMs) have shown impressive performance in tasks +such as text generation, understanding, and reasoning. However, the large model +size poses challenges for practical deployment. To solve this problem, +Quantization-Aware Training (QAT) has become increasingly popular. However, +current QAT methods for generative models have resulted in a noticeable loss of +accuracy. To counteract this issue, we propose a novel knowledge distillation +method specifically designed for GLMs. Our method, called token-scaled logit +distillation, prevents overfitting and provides superior learning from the +teacher model and ground truth. This research marks the first evaluation of +ternary weight quantization-aware training of large-scale GLMs with less than +1.0 degradation in perplexity and no loss of accuracy in a reasoning task. + +
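The abstract does not spell out the scaling rule, so the sketch below should be read as a loose illustration of per-token logit distillation with an assumed teacher-confidence weight, not the paper's token-scaled loss:

```python
import torch
import torch.nn.functional as F

def token_scaled_kd_loss(student_logits, teacher_logits, temperature=1.0):
    """Per-token KL between teacher and student next-token distributions,
    reweighted per token. The weight used here (teacher max probability)
    is an assumption for illustration only."""
    t_logp = F.log_softmax(teacher_logits / temperature, dim=-1)
    s_logp = F.log_softmax(student_logits / temperature, dim=-1)
    kl_per_token = (t_logp.exp() * (t_logp - s_logp)).sum(-1)   # [batch, seq]
    scale = t_logp.exp().max(dim=-1).values                     # assumed weight
    return (scale * kl_per_token).mean() * temperature ** 2
```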
+
+
+
+
+ + ☆ MACO: A Modality Adversarial and Contrastive Framework for + Modality-missing Multi-modal Knowledge Graph Completion NLPCC 2023 + + +
+ Recent years have seen significant advancements in multi-modal knowledge +graph completion (MMKGC). MMKGC enhances knowledge graph completion (KGC) by +integrating multi-modal entity information, thereby facilitating the discovery +of unobserved triples in the large-scale knowledge graphs (KGs). Nevertheless, +existing methods emphasize the design of elegant KGC models to facilitate +modality interaction, neglecting the real-life problem of missing modalities in +KGs. The missing modality information impedes modal interaction, consequently +undermining the model's performance. In this paper, we propose a modality +adversarial and contrastive framework (MACO) to solve the modality-missing +problem in MMKGC. MACO trains a generator and discriminator adversarially to +generate missing modality features that can be incorporated into the MMKGC +model. Meanwhile, we design a cross-modal contrastive loss to improve the +performance of the generator. Experiments on public benchmarks with further +explorations demonstrate that MACO could achieve state-of-the-art results and +serve as a versatile framework to bolster various MMKGC models. Our code and +benchmark data are available at https://github.com/zjukg/MACO. + +
+
+ comment: This is the ArXiv version of our paper accepted by NLPCC 2023. The + code will be released soon +
+
+
+
+
+ + ♻ ☆ PInKS: Preconditioned Commonsense Inference with Minimal Supervision AACL 2022 + + +
+ Reasoning with preconditions such as "glass can be used for drinking water +unless the glass is shattered" remains an open problem for language models. The +main challenge lies in the scarcity of preconditions data and the model's lack +of support for such reasoning. We present PInKS, Preconditioned Commonsense +Inference with WeaK Supervision, an improved model for reasoning with +preconditions through minimum supervision. We show, both empirically and +theoretically, that PInKS improves the results on benchmarks focused on +reasoning with the preconditions of commonsense knowledge (up to 40% Macro-F1 +scores). We further investigate PInKS through PAC-Bayesian informativeness +analysis, precision measures, and ablation study. + +
+
+ comment: AACL 2022 +
+
+
+
+
+ + ♻ ☆ PaCo: Preconditions Attributed to Commonsense Knowledge EMNLP 2022 + + +
+ Humans can seamlessly reason with circumstantial preconditions of commonsense +knowledge. We understand that a glass is used for drinking water, unless the +glass is broken or the water is toxic. Despite state-of-the-art (SOTA) language +models' (LMs) impressive performance on inferring commonsense knowledge, it is +unclear whether they understand the circumstantial preconditions. To address +this gap, we propose a novel challenge of reasoning with circumstantial +preconditions. We collect a dataset, called PaCo, consisting of 12.4 thousand +preconditions of commonsense statements expressed in natural language. Based on +this dataset, we create three canonical evaluation tasks and use them to +examine the capability of existing LMs to understand situational preconditions. +Our results reveal a 10-30% gap between machine and human performance on our +tasks, which shows that reasoning with preconditions is an open challenge. + +
+
+ comment: EMNLP 2022 (Findings) +
+
+
+
+
+ + ♻ ☆ Multi-View Zero-Shot Open Intent Induction from Dialogues: Multi Domain + Batch and Proxy Gradient Transfer SIGDIAL + + +
+ In Task Oriented Dialogue (TOD) systems, detecting and inducing new intents are two main challenges for applying the system in the real world. In this paper, we suggest the semantic multi-view model to resolve these two challenges: (1) SBERT for General Embedding (GE), (2) Multi Domain Batch (MDB) for dialogue domain knowledge, and (3) Proxy Gradient Transfer (PGT) for cluster-specialized semantics. MDB feeds diverse dialogue datasets to the model at once to tackle the multi-domain problem by learning the multiple domain knowledge. We introduce a novel method, PGT, which employs a Siamese network to fine-tune the model directly with a clustering method. Our model can learn how to cluster dialogue utterances by using PGT. Experimental results demonstrate that our multi-view model with MDB and PGT significantly improves the Open Intent Induction performance compared to baseline systems.
+
+ comment: 8 pages, 3 figures, SIGDIAL DSTC 2023 workshop +
+
+
+
+
+ + ♻ ☆ MMBench: Is Your Multi-modal Model an All-around Player? + + +
+ Large vision-language models have recently achieved remarkable progress, +exhibiting great perception and reasoning abilities concerning visual +information. However, how to effectively evaluate these large vision-language +models remains a major obstacle, hindering future model development. +Traditional benchmarks like VQAv2 or COCO Caption provide quantitative +performance measurements but suffer from a lack of fine-grained ability +assessment and non-robust evaluation metrics. Recent subjective benchmarks, +such as OwlEval, offer comprehensive evaluations of a model's abilities by +incorporating human labor, but they are not scalable and display significant +bias. In response to these challenges, we propose MMBench, a novel +multi-modality benchmark. MMBench methodically develops a comprehensive +evaluation pipeline, primarily comprised of two elements. The first element is +a meticulously curated dataset that surpasses existing similar benchmarks in +terms of the number and variety of evaluation questions and abilities. The +second element introduces a novel CircularEval strategy and incorporates the +use of ChatGPT. This implementation is designed to convert free-form +predictions into pre-defined choices, thereby facilitating a more robust +evaluation of the model's predictions. MMBench is a systematically-designed +objective benchmark for robustly evaluating the various abilities of +vision-language models. We hope MMBench will assist the research community in +better evaluating their models and encourage future advancements in this +domain. Project page: https://opencompass.org.cn/mmbench. + +
+
+
+
+
+ + ♻ ☆ Learning Human-Human Interactions in Images from Weak Textual + Supervision ICCV 2023 + + +
+ Interactions between humans are diverse and context-dependent, but previous +works have treated them as categorical, disregarding the heavy tail of possible +interactions. We propose a new paradigm of learning human-human interactions as +free text from a single still image, allowing for flexibility in modeling the +unlimited space of situations and relationships between people. To overcome the +absence of data labelled specifically for this task, we use knowledge +distillation applied to synthetic caption data produced by a large language +model without explicit supervision. We show that the pseudo-labels produced by +this procedure can be used to train a captioning model to effectively +understand human-human interactions in images, as measured by a variety of +metrics that measure textual and semantic faithfulness and factual groundedness +of our predictions. We further show that our approach outperforms SOTA image +captioning and situation recognition models on this task. We will release our +code and pseudo-labels along with Waldo and Wenda, a manually-curated test set +for still image human-human interaction understanding. + +
+
+ comment: To be presented at ICCV 2023. Project webpage: + https://learning-interactions.github.io +
+
+
+
+
+ + ♻ ☆ Improving Few-shot and Zero-shot Entity Linking with Coarse-to-Fine + Lexicon-based Retriever NLPCC2023 + + +
+ Few-shot and zero-shot entity linking focus on the tail and emerging +entities, which are more challenging but closer to real-world scenarios. The +mainstream method is the ''retrieve and rerank'' two-stage framework. In this +paper, we propose a coarse-to-fine lexicon-based retriever to retrieve entity +candidates in an effective manner, which operates in two layers. The first +layer retrieves coarse-grained candidates by leveraging entity names, while the +second layer narrows down the search to fine-grained candidates within the +coarse-grained ones. In addition, this second layer utilizes entity +descriptions to effectively disambiguate tail or new entities that share names +with existing popular entities. Experimental results indicate that our approach +can obtain superior performance without requiring extensive finetuning in the +retrieval stage. Notably, our approach ranks the 1st in NLPCC 2023 Shared Task +6 on Chinese Few-shot and Zero-shot Entity Linking. + +
+
+ comment: Accepted to NLPCC2023 +
+
+
+
+
+ + ♻ ☆ LabelPrompt: Effective Prompt-based Learning for Relation Classification + + +
+ Recently, prompt-based learning has gained popularity across many natural language processing (NLP) tasks by reformulating them into a cloze-style format to better align pre-trained language models (PLMs) with downstream tasks. However, applying this approach to relation classification poses unique challenges. Specifically, associating natural language words that fill the masked token with semantic relation labels (e.g., "org:founded_by") is difficult. To address this challenge, this paper presents a novel prompt-based learning method, namely LabelPrompt, for the relation classification task. Motivated by the intuition to "GIVE MODEL CHOICES!", we first define additional tokens to represent relation labels, regarding these tokens as a verbaliser with semantic initialisation, and explicitly construct them with a prompt template method. Then, to mitigate inconsistency between predicted relations and given entities, we implement an entity-aware module with contrastive learning. Last, we apply an attention query strategy within the self-attention layer to differentiate prompt tokens from sequence tokens. Together, these strategies enhance the adaptability of prompt-based learning, especially when only small labelled datasets are available. Comprehensive experiments on benchmark datasets demonstrate the superiority of our method, particularly in the few-shot scenario.
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 41 + +
+
+
+ + ☆ Improving Face Recognition from Caption Supervision with Multi-Granular + Contextual Feature Aggregation + + +
+ We introduce caption-guided face recognition (CGFR) as a new framework to improve the performance of commercial-off-the-shelf (COTS) face recognition (FR) systems. In contrast to combining soft biometrics (e.g., facial marks, gender, and age) with face images, in this work, we use facial descriptions provided by face examiners as a piece of auxiliary information. However, due to the heterogeneity of the modalities, improving the performance by directly fusing the textual and facial features is very challenging, as both lie in different embedding spaces. In this paper, we propose a contextual feature aggregation module (CFAM) that addresses this issue by effectively exploiting the fine-grained word-region interaction and global image-caption association. Specifically, CFAM adopts a self-attention and a cross-attention scheme for improving the intra-modality and inter-modality relationship between the image and textual features, respectively. Additionally, we design a textual feature refinement module (TFRM) that refines the textual features of the pre-trained BERT encoder by updating the contextual embeddings. This module enhances the discriminative power of textual features with a cross-modal projection loss and realigns the word and caption embeddings with visual features by incorporating a visual-semantic alignment loss. We implemented the proposed CGFR framework on two face recognition models (ArcFace and AdaFace) and evaluated its performance on the Multi-Modal CelebA-HQ dataset. Our framework significantly improves the performance of ArcFace in both the 1:1 verification and 1:N identification protocols.
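A toy sketch of the word-region cross-attention idea described for CFAM, using standard PyTorch attention; dimensions and module structure are assumptions rather than the paper's exact design:

```python
import torch.nn as nn

class CrossModalAttention(nn.Module):
    """Caption tokens first refine themselves (self-attention), then query
    face-region features (cross-attention) to produce fused features."""
    def __init__(self, dim=512, heads=8):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, text_feats, region_feats):
        text_feats, _ = self.self_attn(text_feats, text_feats, text_feats)
        fused, _ = self.cross_attn(text_feats, region_feats, region_feats)
        return fused
```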
+
+ comment: This article has been accepted for publication in the IEEE + International Joint Conference on Biometrics (IJCB), 2023 +
+
+
+
+
+ + ☆ Manifold DivideMix: A Semi-Supervised Contrastive Learning Framework for + Severe Label Noise + + +
+ Deep neural networks have proven to be highly effective when large amounts of +data with clean labels are available. However, their performance degrades when +training data contains noisy labels, leading to poor generalization on the test +set. Real-world datasets contain noisy label samples that either have similar +visual semantics to other classes (in-distribution) or have no semantic +relevance to any class (out-of-distribution) in the dataset. Most +state-of-the-art methods leverage ID labeled noisy samples as unlabeled data +for semi-supervised learning, but OOD labeled noisy samples cannot be used in +this way because they do not belong to any class within the dataset. Hence, in +this paper, we propose incorporating the information from all the training data +by leveraging the benefits of self-supervised training. Our method aims to +extract a meaningful and generalizable embedding space for each sample +regardless of its label. Then, we employ a simple yet effective K-nearest +neighbor method to remove portions of out-of-distribution samples. By +discarding these samples, we propose an iterative "Manifold DivideMix" +algorithm to find clean and noisy samples, and train our model in a +semi-supervised way. In addition, we propose "MixEMatch", a new algorithm for +the semi-supervised step that involves mixup augmentation at the input and +final hidden representations of the model. This will extract better +representations by interpolating both in the input and manifold spaces. +Extensive experiments on multiple synthetic-noise image benchmarks and +real-world web-crawled datasets demonstrate the effectiveness of our proposed +framework. Code is available at https://github.com/Fahim-F/ManifoldDivideMix. + +
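The k-nearest-neighbor filtering step described above can be sketched in a few lines over the self-supervised embeddings; the neighbor count and quantile threshold are assumptions:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def knn_ood_filter(embeddings, keep_ratio=0.9, k=20):
    """Drop samples whose embeddings are far from their k nearest
    neighbours, treating them as out-of-distribution. Returns indices of
    the retained samples."""
    nbrs = NearestNeighbors(n_neighbors=k + 1).fit(embeddings)
    dists, _ = nbrs.kneighbors(embeddings)
    score = dists[:, 1:].mean(axis=1)        # skip the zero self-distance
    cutoff = np.quantile(score, keep_ratio)
    return np.where(score <= cutoff)[0]
```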
+
+
+
+
+ + ☆ UGC Quality Assessment: Exploring the Impact of Saliency in Deep + Feature-Based Quality Assessment + + +
+ The volume of User Generated Content (UGC) has increased in recent years. The +challenge with this type of content is assessing its quality. So far, the +state-of-the-art metrics are not exhibiting a very high correlation with +perceptual quality. In this paper, we explore state-of-the-art metrics that +extract/combine natural scene statistics and deep neural network features. We +experiment with these by introducing saliency maps to improve perceptibility. +We train and test our models using public datasets, namely, YouTube-UGC and +KoNViD-1k. Preliminary results indicate that high correlations are achieved by +using only deep features while adding saliency is not always boosting the +performance. Our results and code will be made publicly available to serve as a +benchmark for the research community and can be found on our project page: +https://github.com/xinyiW915/SPIE-2023-Supplementary. + +
+
+
+
+
+ + ☆ Optimizing Brain Tumor Classification: A Comprehensive Study on Transfer + Learning and Imbalance Handling in Deep Learning Models + + +
+ Deep learning has emerged as a prominent field in recent literature, +showcasing the introduction of models that utilize transfer learning to achieve +remarkable accuracies in the classification of brain tumor MRI images. However, +the majority of these proposals primarily focus on balanced datasets, +neglecting the inherent data imbalance present in real-world scenarios. +Consequently, there is a pressing need for approaches that not only address the +data imbalance but also prioritize precise classification of brain cancer. In +this work, we present a novel deep learning-based approach, called Transfer +Learning-CNN, for brain tumor classification using MRI data. The proposed model +leverages the predictive capabilities of existing publicly available models by +utilizing their pre-trained weights and transferring those weights to the CNN. +By leveraging a publicly available Brain MRI dataset, the experiment evaluated +various transfer learning models for classifying different tumor types, +including meningioma, glioma, and pituitary tumors. We investigate the impact +of different loss functions, including focal loss, and oversampling methods, +such as SMOTE and ADASYN, in addressing the data imbalance issue. Notably, the +proposed strategy, which combines VGG-16 and CNN, achieved an impressive +accuracy rate of 96%, surpassing alternative approaches significantly. + +
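For concreteness, the two imbalance-handling ingredients mentioned (SMOTE oversampling and focal loss) look roughly like the sketch below; the toy data and hyperparameters are illustrative, not the paper's setup:

```python
import numpy as np
import torch
import torch.nn.functional as F
from imblearn.over_sampling import SMOTE

# Toy imbalanced feature matrix standing in for flattened image features.
rng = np.random.default_rng(0)
X = rng.normal(size=(300, 64))
y = np.array([0] * 250 + [1] * 30 + [2] * 20)
X_res, y_res = SMOTE(random_state=42).fit_resample(X, y)   # classes rebalanced

def focal_loss(logits, targets, gamma=2.0):
    """Multi-class focal loss: down-weights easy examples so rare classes
    contribute more to the gradient (a common formulation, not necessarily
    the paper's exact variant)."""
    ce = F.cross_entropy(logits, targets, reduction="none")
    pt = torch.exp(-ce)                 # probability of the true class
    return ((1.0 - pt) ** gamma * ce).mean()
```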
+
+ comment: Our code is available at + https://github.com/Razaimam45/AI701-Project-Transfer-Learning-approach-for-imbalance-classification-of-Brain-Tumor-MRI- +
+
+
+
+
+ + ☆ Modified Topological Image Preprocessing for Skin Lesion Classifications SC + + +
+ This paper proposes a modified Topological Data Analysis model for skin image
+preprocessing and enhancement. The skin lesion dataset HAM10000 is used with
+the intention of identifying the important objects in relevant regions of the
+images. To compare the original dataset with the preprocessed dataset, a Deep
+Convolutional Neural Network and a Vision Transformer model were trained on
+both versions of the data. After training, the experimental results demonstrate
+that the images preprocessed using the Modified Topological Data Analysis
+consistently perform better.
+
+
+ comment: Presented at CSCE 2022, The 2022 World Congress in Computer Science, + Computer Engineering & Applied Computing, July 25-28, 2022, Las Vegas, USA +
+
+
+
+
+ + ☆ PV-SSD: A Projection and Voxel-based Double Branch Single-Stage 3D + Object Detector + + +
+ LiDAR-based 3D object detection and classification is crucial for autonomous
+driving. However, real-time inference from extremely sparse 3D data poses a
+formidable challenge. To address this issue, a common approach is to project
+point clouds onto a bird's-eye or perspective view, effectively converting them
+into an image-like data format. However, this excessive compression of point
+cloud data often leads to the loss of information. This paper proposes a 3D
+object detector based on voxel and projection double-branch feature extraction
+(PV-SSD) to address the problem of information loss. We add a voxel feature
+input containing rich local semantic information, which is fully fused with the
+projected features in the feature extraction stage to reduce the local
+information loss caused by projection. The method achieves good performance
+compared to previous work. In addition, this paper makes the following
+contributions: 1) a voxel feature extraction method with variable receptive
+fields is proposed; 2) a weight-based feature point sampling method is used to
+select the feature points that are more conducive to the detection task; 3) the
+MSSFA module is proposed based on the SSFA module. Comparison experiments were
+designed to verify the effectiveness of our method.
+
+
+
+
+
+ + ☆ RMP-Loss: Regularizing Membrane Potential Distribution for Spiking + Neural Networks ICCV2023 + + +
+ Spiking Neural Networks (SNNs), as one of the biology-inspired models, have
+received much attention recently. They can significantly reduce energy
+consumption because they quantize the real-valued membrane potentials to 0/1
+spikes to transmit information, so the multiplications of activations and
+weights can be replaced by additions when implemented on hardware. However,
+this quantization mechanism inevitably introduces quantization error, causing
+catastrophic information loss. To address the quantization error problem, we
+propose a regularizing membrane potential loss (RMP-Loss) that adjusts the
+membrane potential distribution, which is directly related to the quantization
+error, to a range close to the spike values. Our method is extremely simple to
+implement and makes an SNN straightforward to train. Furthermore, it is shown
+to consistently outperform previous state-of-the-art methods across different
+network architectures and datasets.
+
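+ The exact form of RMP-Loss is defined in the paper; the snippet below is only
+an illustrative regularizer in the same spirit, pulling (normalized) pre-spike
+membrane potentials toward the 0/1 spike levels so that the quantization error
+shrinks. The weighting factor lambda_rmp is a hypothetical hyperparameter.
+
+```python
+import torch
+
+def membrane_regularizer(potentials: torch.Tensor) -> torch.Tensor:
+    """Penalize membrane potentials that sit far from the 0/1 spike levels.
+
+    A simplified stand-in for RMP-Loss: for each potential we take the squared
+    distance to the nearer of the two quantization targets.
+    """
+    dist_to_zero = potentials.pow(2)
+    dist_to_one = (potentials - 1.0).pow(2)
+    return torch.minimum(dist_to_zero, dist_to_one).mean()
+
+# total_loss = task_loss + lambda_rmp * membrane_regularizer(potentials)
+```
+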
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Shape-guided Conditional Latent Diffusion Models for Synthesising Brain + Vasculature + + +
+ The Circle of Willis (CoW) is the part of cerebral vasculature responsible +for delivering blood to the brain. Understanding the diverse anatomical +variations and configurations of the CoW is paramount to advance research on +cerebrovascular diseases and refine clinical interventions. However, +comprehensive investigation of less prevalent CoW variations remains +challenging because of the dominance of a few commonly occurring +configurations. We propose a novel generative approach utilising a conditional +latent diffusion model with shape and anatomical guidance to generate realistic +3D CoW segmentations, including different phenotypical variations. Our +conditional latent diffusion model incorporates shape guidance to better +preserve vessel continuity and demonstrates superior performance when compared +to alternative generative models, including conditional variants of 3D GAN and +3D VAE. We observed that our model generated CoW variants that are more +realistic and demonstrate higher visual fidelity than competing approaches with +an FID score 53\% better than the best-performing GAN-based model. + +
+
+
+
+
+ + ☆ Neural Networks at a Fraction with Pruned Quaternions + + +
+ Contemporary state-of-the-art neural networks have increasingly large numbers +of parameters, which prevents their deployment on devices with limited +computational power. Pruning is one technique to remove unnecessary weights and +reduce resource requirements for training and inference. In addition, for ML +tasks where the input data is multi-dimensional, using higher-dimensional data +embeddings such as complex numbers or quaternions has been shown to reduce the +parameter count while maintaining accuracy. In this work, we conduct pruning on +real and quaternion-valued implementations of different architectures on +classification tasks. We find that for some architectures, at very high +sparsity levels, quaternion models provide higher accuracies than their real +counterparts. For example, at the task of image classification on CIFAR-10 +using Conv-4, at $3\%$ of the number of parameters as the original model, the +pruned quaternion version outperforms the pruned real by more than $10\%$. +Experiments on various network architectures and datasets show that for +deployment in extremely resource-constrained environments, a sparse quaternion +network might be a better candidate than a real sparse model of similar +architecture. + +
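+ A minimal sketch of the pruning regime described above is given below: global
+magnitude pruning to a target sparsity, applied identically to a real-valued or
+quaternion-valued model. The 97% sparsity corresponds to keeping roughly 3% of
+the parameters; the thresholding strategy is an assumption, not the paper's
+exact procedure.
+
+```python
+import torch
+
+def global_magnitude_prune(model: torch.nn.Module, sparsity: float = 0.97) -> None:
+    """Zero out the smallest-magnitude weights across all weight matrices."""
+    all_weights = torch.cat([p.detach().abs().flatten()
+                             for p in model.parameters() if p.dim() > 1])
+    # torch.quantile is fine for small models; very large models may need sorting.
+    threshold = torch.quantile(all_weights, sparsity)
+    for p in model.parameters():
+        if p.dim() > 1:
+            mask = (p.detach().abs() > threshold).float()
+            p.data.mul_(mask)  # apply the pruning mask in place
+```
+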
+
+
+
+
+ + ☆ Shrinking Class Space for Enhanced Certainty in Semi-Supervised Learning ICCV 2023 + + +
+ Semi-supervised learning is attracting growing attention due to its success in
+leveraging unlabeled data. To mitigate potentially incorrect pseudo labels,
+recent frameworks mostly set a fixed confidence threshold to discard uncertain
+samples. This practice ensures high-quality pseudo labels, but incurs a
+relatively low utilization of the whole unlabeled set. In this work, our key
+insight is that these uncertain samples can be turned into certain ones, as
+long as the confusion classes for the top-1 class are detected and removed.
+Motivated by this, we propose a novel method dubbed ShrinkMatch to learn from
+uncertain samples. For each uncertain sample, it adaptively seeks a shrunk
+class space, which merely contains the original top-1 class as well as the
+remaining less likely classes. Since the confusion classes are removed in this
+space, the re-calculated top-1 confidence can satisfy the pre-defined
+threshold. We then impose a consistency regularization between a pair of
+strongly and weakly augmented samples in the shrunk space to strive for
+discriminative representations. Furthermore, considering the varied reliability
+among uncertain samples and the gradually improving model during training, we
+correspondingly design two reweighting principles for our uncertain loss. Our
+method exhibits impressive performance on widely adopted benchmarks. Code is
+available at https://github.com/LiheYoung/ShrinkMatch.
+
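+ The core confidence re-computation can be sketched as follows: drop the
+confusing runner-up classes, keep the top-1 class together with the least
+likely classes, and re-normalize. How many classes to drop is chosen adaptively
+in the paper; here it is a fixed illustrative parameter.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def shrunk_confidence(logits: torch.Tensor, num_tail: int = 5,
+                      threshold: float = 0.95):
+    """Re-evaluate top-1 confidence in a shrunk class space (illustrative).
+
+    Keeps the top-1 class plus the `num_tail` least likely classes, removes the
+    confusing classes in between, and checks the re-normalized confidence.
+    """
+    probs = F.softmax(logits, dim=-1)
+    order = probs.argsort(dim=-1, descending=True)  # classes sorted by likelihood
+    keep = torch.cat([order[:, :1], order[:, -num_tail:]], dim=-1)
+    shrunk_probs = F.softmax(torch.gather(logits, 1, keep), dim=-1)
+    conf = shrunk_probs[:, 0]  # re-normalized probability of the original top-1
+    return conf, conf >= threshold
+```
+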
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Unsupervised Image Denoising in Real-World Scenarios via + Self-Collaboration Parallel Generative Adversarial Branches ICCV 2023 + + +
+ Deep learning methods have shown remarkable performance in image denoising,
+particularly when trained on large-scale paired datasets. However, acquiring
+such paired datasets for real-world scenarios poses a significant challenge.
+Although unsupervised approaches based on generative adversarial networks offer
+a promising solution for denoising without paired datasets, it is difficult to
+surpass the performance limitations of conventional GAN-based unsupervised
+frameworks without significantly modifying existing structures or increasing
+the computational complexity of denoisers. To address this problem, we propose
+a self-collaboration (SC) strategy for multiple denoisers. This strategy can
+achieve significant performance improvement without increasing the inference
+complexity of the GAN-based denoising framework. Its basic idea is to
+iteratively replace the previous, less powerful denoiser in the filter-guided
+noise extraction module with the current, more powerful denoiser. This process
+generates better synthetic clean-noisy image pairs, leading to a more powerful
+denoiser for the next iteration. This baseline ensures the stability and
+effectiveness of the training network. The experimental results demonstrate the
+superiority of our method over state-of-the-art unsupervised methods.
+
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Dual Meta-Learning with Longitudinally Generalized Regularization for + One-Shot Brain Tissue Segmentation Across the Human Lifespan ICCV 2023 + + +
+ Brain tissue segmentation is essential for neuroscience and clinical studies.
+However, segmentation on longitudinal data is challenging due to dynamic brain
+changes across the lifespan. Previous research mainly focuses on
+self-supervision with regularizations and loses longitudinal generalization
+when fine-tuning on a specific age group. In this paper, we propose a dual
+meta-learning paradigm to learn longitudinally consistent representations that
+persist through fine-tuning. Specifically, we learn a plug-and-play feature
+extractor that produces longitudinally consistent anatomical representations
+via meta-feature learning, and a well-initialized task head for fine-tuning via
+meta-initialization learning. Besides, two class-aware regularizations are
+proposed to encourage longitudinal consistency. Experimental results on the
+iSeg2019 and ADNI datasets demonstrate the effectiveness of our method. Our
+code is available at https://github.com/ladderlab-xjtu/DuMeta.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ A Survey on Deep Neural Network Pruning-Taxonomy, Comparison, Analysis, + and Recommendations + + +
+ Modern deep neural networks, particularly recent large language models, come +with massive model sizes that require significant computational and storage +resources. To enable the deployment of modern models on resource-constrained +environments and accelerate inference time, researchers have increasingly +explored pruning techniques as a popular research direction in neural network +compression. However, there is a dearth of up-to-date comprehensive review +papers on pruning. To address this issue, in this survey, we provide a +comprehensive review of existing research works on deep neural network pruning +in a taxonomy of 1) universal/specific speedup, 2) when to prune, 3) how to +prune, and 4) fusion of pruning and other compression techniques. We then +provide a thorough comparative analysis of seven pairs of contrast settings for +pruning (e.g., unstructured/structured) and explore emerging topics, including +post-training pruning, different levels of supervision for pruning, and broader +applications (e.g., adversarial robustness) to shed light on the commonalities +and differences of existing methods and lay the foundation for further method +development. To facilitate future research, we build a curated collection of +datasets, networks, and evaluations on different applications. Finally, we +provide some valuable recommendations on selecting pruning methods and prospect +promising research directions. We build a repository at +https://github.com/hrcheng1066/awesome-pruning. + +
+
+
+
+
+ + ☆ Tissue Segmentation of Thick-Slice Fetal Brain MR Scans with Guidance + from High-Quality Isotropic Volumes + + +
+ Accurate tissue segmentation of thick-slice fetal brain magnetic resonance
+(MR) scans is crucial for both the reconstruction of isotropic brain MR volumes
+and the quantification of fetal brain development. However, this task is
+challenging due to the use of thick-slice scans in clinically-acquired fetal
+brain data. To address this issue, we propose to leverage high-quality
+isotropic fetal brain MR volumes (and also their corresponding annotations) as
+guidance for segmentation of thick-slice scans. Due to the existence of a
+significant domain gap between the high-quality isotropic volumes (i.e., source
+data) and thick-slice scans (i.e., target data), we employ a domain adaptation
+technique to achieve the associated knowledge transfer (from high-quality
+volumes to thick-slice scans). Specifically, we first register the available
+high-quality isotropic fetal brain MR volumes across different gestational
+weeks to construct longitudinally-complete source data. To capture
+domain-invariant information, we then perform Fourier decomposition to extract
+image content and style codes. Finally, we propose a novel Cycle-Consistent
+Domain Adaptation Network (C2DA-Net) to efficiently transfer the knowledge
+learned from high-quality isotropic volumes for accurate tissue segmentation of
+thick-slice scans. Our C2DA-Net can fully utilize a small set of annotated
+isotropic volumes to guide tissue segmentation on unannotated thick-slice
+scans. Extensive experiments on a large-scale dataset of 372 clinically
+acquired thick-slice MR scans demonstrate that our C2DA-Net achieves much
+better performance than cutting-edge methods, both quantitatively and
+qualitatively.
+
+
+ comment: 10 pages, 9 figures, 5 tables, Fetal MRI, Brain tissue segmentation, + Unsupervised domain adaptation, Cycle-consistency +
+
+
+
+
+ + ☆ Influence Function Based Second-Order Channel Pruning-Evaluating True + Loss Changes For Pruning Is Possible Without Retraining + + +
+ A challenge of channel pruning is designing efficient and effective criteria
+to select channels to prune. A widely used criterion is minimal performance
+degradation. Accurately evaluating the true performance degradation requires
+retraining the surviving weights to convergence, which is prohibitively slow.
+Hence, existing pruning methods use the previous weights (without retraining)
+to evaluate the performance degradation. However, we observe that the loss
+changes differ significantly with and without retraining. This motivates us to
+develop a technique to evaluate true loss changes without retraining, with
+which channels to prune can be selected more reliably and confidently. We first
+derive a closed-form estimator of the true loss change per pruning mask change,
+using influence functions without retraining. The influence function, which
+comes from robust statistics, reveals the impact of a training sample on the
+model's prediction and is repurposed by us to assess impacts on true loss
+changes. We then show how to assess the importance of all channels
+simultaneously and develop a novel global channel pruning algorithm
+accordingly. We conduct extensive experiments to verify the effectiveness of
+the proposed algorithm. To the best of our knowledge, we are the first to show
+that evaluating true loss changes for pruning without retraining is possible.
+This finding will open up opportunities for a series of new paradigms that
+differ from existing pruning methods. The code is available at
+https://github.com/hrcheng1066/IFSO.
+
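+ For intuition, the snippet below shows the much simpler first-order Taylor
+estimate of the loss change caused by zeroing a channel. It is not the paper's
+closed-form influence-function estimator, only a reference point for the idea
+of estimating loss changes without retraining.
+
+```python
+import torch
+
+def taylor_channel_importance(weight: torch.Tensor,
+                              weight_grad: torch.Tensor) -> torch.Tensor:
+    """First-order estimate of the loss change from zeroing each output channel.
+
+    `weight` and `weight_grad` are a Conv2d weight and its gradient with shape
+    (out_channels, in_channels, kH, kW); the score sums |w * dL/dw| per channel.
+    """
+    contribution = (weight * weight_grad).abs()
+    return contribution.sum(dim=(1, 2, 3))
+```
+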
+
+
+
+
+
+ + ☆ FastLLVE: Real-Time Low-Light Video Enhancement with Intensity-Aware + Lookup Table + + +
+ Low-Light Video Enhancement (LLVE) has received considerable attention in
+recent years. One of the critical requirements of LLVE is inter-frame
+brightness consistency, which is essential for maintaining the temporal
+coherence of the enhanced video. However, most existing single-image-based
+methods fail to address this issue, resulting in a flickering effect that
+degrades the overall quality after enhancement. Moreover, 3D Convolutional
+Neural Network (CNN)-based methods, which are designed for video and maintain
+inter-frame consistency, are computationally expensive, making them impractical
+for real-time applications. To address these issues, we propose an efficient
+pipeline named FastLLVE that leverages the Look-Up-Table (LUT) technique to
+maintain inter-frame brightness consistency effectively. Specifically, we
+design a learnable Intensity-Aware LUT (IA-LUT) module for adaptive
+enhancement, which addresses the low-dynamic problem in low-light scenarios.
+This enables FastLLVE to perform low-latency and low-complexity enhancement
+operations while maintaining high-quality results. Experimental results on
+benchmark datasets demonstrate that our method achieves state-of-the-art (SOTA)
+performance in terms of both image quality and inter-frame brightness
+consistency. More importantly, our FastLLVE can process 1080p videos at
+$\mathit{50+}$ Frames Per Second (FPS), which is $\mathit{2 \times}$ faster
+than SOTA CNN-based methods in inference time, making it a promising solution
+for real-time applications. The code is available at
+https://github.com/Wenhao-Li-777/FastLLVE.
+
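+ The LUT lookup at the heart of this pipeline can be sketched with a plain 3D
+color LUT applied through trilinear interpolation; the intensity-aware variant
+adds an extra intensity index. Tensor shapes and axis ordering are assumptions
+made for illustration only.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def apply_3d_lut(frames: torch.Tensor, lut: torch.Tensor) -> torch.Tensor:
+    """Apply a learned 3D color LUT to frames via trilinear interpolation.
+
+    frames: (B, 3, H, W) with values in [0, 1]; lut: (3, S, S, S) mapping an
+    RGB coordinate to an output RGB value.
+    """
+    b, _, h, w = frames.shape
+    # grid_sample expects normalized coordinates in [-1, 1]; with a 5D input,
+    # mode="bilinear" performs trilinear interpolation.
+    grid = (frames.permute(0, 2, 3, 1) * 2.0 - 1.0).view(b, 1, h, w, 3)
+    lut = lut.unsqueeze(0).expand(b, -1, -1, -1, -1)  # (B, 3, S, S, S)
+    out = F.grid_sample(lut, grid, mode="bilinear",
+                        padding_mode="border", align_corners=True)
+    return out.view(b, 3, h, w)
+```
+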
+
+ comment: 11pages, 9 Figures, and 6 Tables. Accepted by ACMMM 2023 +
+
+
+
+
+ + ☆ Target before Shooting: Accurate Anomaly Detection and Localization + under One Millisecond via Cascade Patch Retrieval + + +
+ In this work, by re-examining the "matching" nature of Anomaly Detection +(AD), we propose a new AD framework that simultaneously enjoys new records of +AD accuracy and dramatically high running speed. In this framework, the anomaly +detection problem is solved via a cascade patch retrieval procedure that +retrieves the nearest neighbors for each test image patch in a coarse-to-fine +fashion. Given a test sample, the top-K most similar training images are first +selected based on a robust histogram matching process. Secondly, the nearest +neighbor of each test patch is retrieved over the similar geometrical locations +on those "global nearest neighbors", by using a carefully trained local metric. +Finally, the anomaly score of each test image patch is calculated based on the +distance to its "local nearest neighbor" and the "non-background" probability. +The proposed method is termed "Cascade Patch Retrieval" (CPR) in this work. +Different from the conventional patch-matching-based AD algorithms, CPR selects +proper "targets" (reference images and locations) before "shooting" +(patch-matching). On the well-acknowledged MVTec AD, BTAD and MVTec-3D AD +datasets, the proposed algorithm consistently outperforms all the comparing +SOTA methods by remarkable margins, measured by various AD metrics. +Furthermore, CPR is extremely efficient. It runs at the speed of 113 FPS with +the standard setting while its simplified version only requires less than 1 ms +to process an image at the cost of a trivial accuracy drop. The code of CPR is +available at https://github.com/flyinghu123/CPR. + +
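+ The first, coarse stage of such a cascade can be pictured as a simple top-K
+image retrieval by histogram similarity; the robust histogram matching and the
+learned local metric used later in the cascade are more involved. The bin count
+and similarity measure below are illustrative choices.
+
+```python
+import numpy as np
+
+def topk_by_histogram(test_img: np.ndarray, train_imgs: list, k: int = 10,
+                      bins: int = 32) -> list:
+    """Pick the K training images most similar to the test image by
+    grayscale histogram intersection (stage 1 of a cascade retrieval)."""
+    def hist(img):
+        h, _ = np.histogram(img, bins=bins, range=(0, 255), density=True)
+        return h
+    query = hist(test_img)
+    scores = [np.minimum(query, hist(t)).sum() for t in train_imgs]
+    return list(np.argsort(scores)[::-1][:k])  # indices of the best matches
+```
+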
+
+ comment: 13 pages,8 figures +
+
+
+
+
+ + ☆ Self-supervised Noise2noise Method Utilizing Corrupted Images with a + Modular Network for LDCT Denoising + + +
+ Deep learning is a very promising technique for low-dose computed tomography
+(LDCT) image denoising. However, traditional deep learning methods require
+paired noisy and clean datasets, which are often difficult to obtain. This
+paper proposes a new method for performing LDCT image denoising with only LDCT
+data, which means that normal-dose CT (NDCT) is not needed. We adopt a
+combination of the self-supervised noise2noise model and the noisy-as-clean
+strategy. First, we add a second, similar type of noise to the LDCT images
+multiple times. Note that, following the noisy-as-clean strategy, we use LDCT
+images for corruption instead of NDCT images. Then, the noise2noise model is
+trained with only the secondary corrupted images. We select a modular U-Net
+structure with shared parameters from several candidates to perform the task,
+which increases the receptive field without increasing the parameter size. The
+experimental results obtained on the Mayo LDCT dataset show the effectiveness
+of the proposed method compared with state-of-the-art deep learning methods.
+The developed code is available at
+https://github.com/XYuan01/Self-supervised-Noise2Noise-for-LDCT.
+
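+ A single training step in this setup might look like the sketch below: the
+LDCT image is treated as the "clean" reference (noisy-as-clean), corrupted
+twice with independent noise, and the network learns to map one corruption to
+the other (noise2noise). The Gaussian noise model and its scale are assumptions
+made only for illustration.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def noise2noise_step(model, ldct_batch: torch.Tensor, optimizer,
+                     sigma: float = 0.02) -> float:
+    """One illustrative noise2noise training step using LDCT images only."""
+    noisy_a = ldct_batch + sigma * torch.randn_like(ldct_batch)  # corruption 1
+    noisy_b = ldct_batch + sigma * torch.randn_like(ldct_batch)  # corruption 2
+    loss = F.mse_loss(model(noisy_a), noisy_b)  # map one corruption to the other
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+```
+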
+
+
+
+
+ + ☆ TextDiff: Mask-Guided Residual Diffusion Models for Scene Text Image + Super-Resolution + + +
+ The goal of scene text image super-resolution is to reconstruct
+high-resolution text-line images from unrecognizable low-resolution inputs. The
+existing methods relying on the optimization of pixel-level loss tend to yield
+text edges that exhibit a notable degree of blurring, thereby exerting a
+substantial impact on both the readability and recognizability of the text. To
+address these issues, we propose TextDiff, the first diffusion-based framework
+tailored for scene text image super-resolution. It contains two modules: the
+Text Enhancement Module (TEM) and the Mask-Guided Residual Diffusion Module
+(MRD). The TEM generates an initial deblurred text image and a mask that
+encodes the spatial location of the text. The MRD is responsible for
+effectively sharpening the text edges by modeling the residuals between the
+ground-truth images and the initial deblurred images. Extensive experiments
+demonstrate that our TextDiff achieves state-of-the-art (SOTA) performance on
+public benchmark datasets and can improve the readability of scene text images.
+Moreover, our proposed MRD module is plug-and-play and effectively sharpens the
+text edges produced by SOTA methods. This enhancement not only improves the
+readability and recognizability of the results generated by SOTA methods but
+also does not require any additional joint training. Code is available at
+https://github.com/Lenubolim/TextDiff.
+
+
+
+
+
+ + ☆ Free-ATM: Exploring Unsupervised Learning on Diffusion-Generated Images + with Free Attention Masks + + +
+ Despite the rapid advancement of unsupervised learning in visual
+representation, it requires training on large-scale datasets that demand costly
+data collection and pose additional challenges due to concerns regarding data
+privacy. Recently, synthetic images generated by text-to-image diffusion models
+have shown great potential for benefiting image recognition. Although
+promising, there has been inadequate exploration dedicated to unsupervised
+learning on diffusion-generated images. To address this, we start by uncovering
+that diffusion models' cross-attention layers inherently provide
+annotation-free attention masks aligned with corresponding text inputs on
+generated images. We then investigate the problems of three prevalent
+unsupervised learning techniques (i.e., contrastive learning, masked modeling,
+and vision-language pretraining) and introduce customized solutions by fully
+exploiting the aforementioned free attention masks. Our approach is validated
+through extensive experiments that show consistent improvements in baseline
+models across various downstream tasks, including image classification,
+detection, segmentation, and image-text retrieval. By utilizing our method, it
+is possible to close the performance gap between unsupervised pretraining on
+synthetic data and real-world scenarios.
+
+
+
+
+
+ + ☆ AerialVLN: Vision-and-Language Navigation for UAVs ICCV 2023 + + +
+ Recently emerged Vision-and-Language Navigation (VLN) tasks have drawn
+significant attention in both the computer vision and natural language
+processing communities. Existing VLN tasks are built for agents that navigate
+on the ground, either indoors or outdoors. However, many tasks require
+intelligent agents to operate in the sky, such as UAV-based goods delivery,
+traffic/security patrol, and scenery tours, to name a few. Navigating in the
+sky is more complicated than on the ground because agents need to consider
+flying height and more complex spatial relationship reasoning. To fill this gap
+and facilitate research in this field, we propose a new task named AerialVLN,
+which is UAV-based and oriented towards outdoor environments. We develop a 3D
+simulator rendered with near-realistic pictures of 25 city-level scenarios. Our
+simulator supports continuous navigation, environment extension, and
+configuration. We also propose an extended baseline model based on the
+widely-used cross-modal-alignment (CMA) navigation methods. We find that there
+is still a significant gap between the baseline model and human performance,
+which suggests AerialVLN is a challenging new task. The dataset and code are
+available at https://github.com/AirVLN/AirVLN.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ CLE Diffusion: Controllable Light Enhancement Diffusion Model + + +
+ Low light enhancement has gained increasing importance with the rapid +development of visual creation and editing. However, most existing enhancement +algorithms are designed to homogeneously increase the brightness of images to a +pre-defined extent, limiting the user experience. To address this issue, we +propose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a +novel diffusion framework to provide users with rich controllability. Built +with a conditional diffusion model, we introduce an illumination embedding to +let users control their desired brightness level. Additionally, we incorporate +the Segment-Anything Model (SAM) to enable user-friendly region +controllability, where users can click on objects to specify the regions they +wish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves +competitive performance regarding quantitative metrics, qualitative results, +and versatile controllability. Project page: +\url{https://yuyangyin.github.io/CLEDiffusion/} + +
+
+
+
+
+ + ☆ IP-Adapter: Text Compatible Image Prompt Adapter for Text-to-Image + Diffusion Models + + +
+ Recent years have witnessed the strong power of large text-to-image diffusion
+models and their impressive generative capability to create high-fidelity
+images. However, it is very tricky to generate desired images using only a text
+prompt, as it often involves complex prompt engineering. An alternative to the
+text prompt is the image prompt, as the saying goes: "an image is worth a
+thousand words". Although existing methods of direct fine-tuning from
+pretrained models are effective, they require large computing resources and are
+not compatible with other base models, text prompts, and structural controls.
+In this paper, we present IP-Adapter, an effective and lightweight adapter to
+achieve image prompt capability for pretrained text-to-image diffusion models.
+The key design of our IP-Adapter is a decoupled cross-attention mechanism that
+separates cross-attention layers for text features and image features. Despite
+the simplicity of our method, an IP-Adapter with only 22M parameters can
+achieve performance comparable to or even better than a fully fine-tuned image
+prompt model. As we freeze the pretrained diffusion model, the proposed
+IP-Adapter can be generalized not only to other custom models fine-tuned from
+the same base model, but also to controllable generation using existing
+controllable tools. With the benefit of the decoupled cross-attention strategy,
+the image prompt can also work well with the text prompt to achieve multimodal
+image generation. The project page is available at
+\url{https://ip-adapter.github.io}.
+
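+ The decoupled cross-attention idea can be sketched as below: the query comes
+from the hidden states, while text and image prompts each get their own
+key/value projections, and the two attention outputs are summed. The
+single-head formulation, dimensions, and placement inside the network are
+simplifying assumptions.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class DecoupledCrossAttention(nn.Module):
+    """Sketch of decoupled cross-attention for text and image prompts."""
+    def __init__(self, dim: int, ctx_dim: int):
+        super().__init__()
+        self.to_q = nn.Linear(dim, dim)
+        self.to_k_text = nn.Linear(ctx_dim, dim)
+        self.to_v_text = nn.Linear(ctx_dim, dim)
+        self.to_k_img = nn.Linear(ctx_dim, dim)  # adapter weights for the image prompt
+        self.to_v_img = nn.Linear(ctx_dim, dim)
+
+    def forward(self, x, text_ctx, img_ctx):
+        q = self.to_q(x)
+        out_text = F.scaled_dot_product_attention(
+            q, self.to_k_text(text_ctx), self.to_v_text(text_ctx))
+        out_img = F.scaled_dot_product_attention(
+            q, self.to_k_img(img_ctx), self.to_v_img(img_ctx))
+        return out_text + out_img  # text and image attention are simply summed
+```
+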
+
+
+
+
+ + ☆ 3D Scene Graph Prediction on Point Clouds Using Knowledge Graphs + + +
+ 3D scene graph prediction is a task that aims to concurrently predict object
+classes and their relationships within a 3D environment. As these environments
+are primarily designed by and for humans, incorporating commonsense knowledge
+regarding objects and their relationships can significantly constrain and
+enhance the prediction of the scene graph. In this paper, we investigate the
+application of commonsense knowledge graphs for 3D scene graph prediction on
+point clouds of indoor scenes. Through experiments conducted on a real-world
+indoor dataset, we demonstrate that integrating commonsense knowledge via the
+message-passing method leads to a 15.0% improvement in scene graph prediction
+accuracy with external knowledge and a 7.96% improvement with internal
+knowledge when compared to state-of-the-art algorithms. We also tested the
+model in the real world, generating scene graphs at 10 frames per second, to
+demonstrate its usage in a more realistic robotics setting.
+
+
+ comment: accepted at CASE 2023 +
+
+
+
+
+ + ☆ StairNetV3: Depth-aware Stair Modeling using Deep Learning + + +
+ Vision-based stair perception can help autonomous mobile robots deal with the
+challenge of climbing stairs, especially in unfamiliar environments. To address
+the problem that current monocular vision methods have difficulty modeling
+stairs accurately without depth information, this paper proposes a depth-aware
+stair modeling method for monocular vision. Specifically, we take the
+extraction of stair geometric features and the prediction of depth images as
+joint tasks in a convolutional neural network (CNN). With the designed
+information propagation architecture, we can achieve effective supervision of
+stair geometric feature learning using depth information. In addition, to
+complete the stair modeling, we take the convex lines, concave lines, tread
+surfaces and riser surfaces as stair geometric features and apply Gaussian
+kernels to enable the network to predict contextual information within the
+stair lines. Combined with the depth information obtained by depth sensors, we
+propose a stair point cloud reconstruction method that can quickly obtain point
+clouds belonging to the stair step surfaces. Experiments on our dataset show
+that our method achieves a significant improvement over the previous best
+monocular vision method, with an intersection over union (IoU) increase of
+3.4%, and the lightweight version has a fast detection speed and can meet the
+requirements of most real-time applications. Our dataset is available at
+https://data.mendeley.com/datasets/6kffmjt7g2/1.
+
+
+
+
+
+ + ☆ LAW-Diffusion: Complex Scene Generation by Diffusion with Layouts + + +
+ Thanks to the rapid development of diffusion models, unprecedented progress +has been witnessed in image synthesis. Prior works mostly rely on pre-trained +linguistic models, but a text is often too abstract to properly specify all the +spatial properties of an image, e.g., the layout configuration of a scene, +leading to the sub-optimal results of complex scene generation. In this paper, +we achieve accurate complex scene generation by proposing a semantically +controllable Layout-AWare diffusion model, termed LAW-Diffusion. Distinct from +the previous Layout-to-Image generation (L2I) methods that only explore +category-aware relationships, LAW-Diffusion introduces a spatial dependency +parser to encode the location-aware semantic coherence across objects as a +layout embedding and produces a scene with perceptually harmonious object +styles and contextual relations. To be specific, we delicately instantiate each +object's regional semantics as an object region map and leverage a +location-aware cross-object attention module to capture the spatial +dependencies among those disentangled representations. We further propose an +adaptive guidance schedule for our layout guidance to mitigate the trade-off +between the regional semantic alignment and the texture fidelity of generated +objects. Moreover, LAW-Diffusion allows for instance reconfiguration while +maintaining the other regions in a synthesized image by introducing a +layout-aware latent grafting mechanism to recompose its local regional +semantics. To better verify the plausibility of generated scenes, we propose a +new evaluation metric for the L2I task, dubbed Scene Relation Score (SRS) to +measure how the images preserve the rational and harmonious relations among +contextual objects. Comprehensive experiments demonstrate that our +LAW-Diffusion yields the state-of-the-art generative performance, especially +with coherent object relations. + +
+
+
+
+
+ + ☆ Compositional Feature Augmentation for Unbiased Scene Graph Generation ICCV 2023 + + +
+ Scene Graph Generation (SGG) aims to detect all the visual relation triplets
+<subject, predicate, object> in a given image. With the emergence of various
+advanced techniques for better utilizing both the intrinsic and extrinsic
+information in each relation triplet, SGG has achieved great progress in recent
+years. However, due to the ubiquitous long-tailed predicate distributions,
+today's SGG models are still easily biased to the head predicates. Currently,
+the most prevalent debiasing solutions for SGG are re-balancing methods, e.g.,
+changing the distributions of original training samples. In this paper, we
+argue that all existing re-balancing strategies fail to increase the diversity
+of the relation triplet features of each predicate, which is critical for
+robust SGG. To this end, we propose a novel Compositional Feature Augmentation
+(CFA) strategy, which is the first unbiased SGG work to mitigate the bias issue
+from the perspective of increasing the diversity of triplet features.
+Specifically, we first decompose each relation triplet feature into two
+components: intrinsic feature and extrinsic feature, which correspond to the
+intrinsic characteristics and extrinsic contexts of a relation triplet,
+respectively. Then, we design two different feature augmentation modules to
+enrich the feature diversity of original relation triplets by replacing or
+mixing up either their intrinsic or extrinsic features with those from other
+samples. Due to its model-agnostic nature, CFA can be seamlessly incorporated
+into various SGG frameworks. Extensive ablations have shown that CFA achieves a
+new state-of-the-art performance on the trade-off between different metrics.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Condition-Adaptive Graph Convolution Learning for Skeleton-Based Gait + Recognition + + +
+ Graph convolutional networks have been widely applied in skeleton-based gait
+recognition. A key challenge in this task is to distinguish the individual
+walking styles of different subjects across various views. Existing
+state-of-the-art methods employ uniform convolutions to extract features from
+diverse sequences and ignore the effects of viewpoint changes. To overcome
+these limitations, we propose a condition-adaptive graph (CAG) convolution
+network that can dynamically adapt to the specific attributes of each skeleton
+sequence and the corresponding view angle. In contrast to using fixed weights
+for all joints and sequences, we introduce a joint-specific filter learning
+(JSFL) module in the CAG method, which produces sequence-adaptive filters at
+the joint level. The adaptive filters capture fine-grained patterns that are
+unique to each joint, enabling the extraction of diverse spatial-temporal
+information about body parts. Additionally, we design a view-adaptive topology
+learning (VATL) module that generates adaptive graph topologies. These graph
+topologies are used to correlate the joints adaptively according to the
+specific view conditions. Thus, CAG can simultaneously adjust to various
+walking styles and viewpoints. Experiments on the two most widely used datasets
+(i.e., CASIA-B and OU-MVLP) show that CAG surpasses all previous skeleton-based
+methods. Moreover, the recognition performance can be enhanced by simply
+combining CAG with appearance-based methods, demonstrating the ability of CAG
+to provide useful complementary information. The source code will be available
+at https://github.com/OliverHxh/CAG.
+
+
+ comment: Accepted by TIP journal +
+
+
+
+
+ + ♻ ☆ POSTER: A Pyramid Cross-Fusion Transformer Network for Facial Expression + Recognition ICCV + + +
+ Facial expression recognition (FER) is an important task in computer vision, +having practical applications in areas such as human-computer interaction, +education, healthcare, and online monitoring. In this challenging FER task, +there are three key issues especially prevalent: inter-class similarity, +intra-class discrepancy, and scale sensitivity. While existing works typically +address some of these issues, none have fully addressed all three challenges in +a unified framework. In this paper, we propose a two-stream Pyramid +crOss-fuSion TransformER network (POSTER), that aims to holistically solve all +three issues. Specifically, we design a transformer-based cross-fusion method +that enables effective collaboration of facial landmark features and image +features to maximize proper attention to salient facial regions. Furthermore, +POSTER employs a pyramid structure to promote scale invariance. Extensive +experimental results demonstrate that our POSTER achieves new state-of-the-art +results on RAF-DB (92.05%), FERPlus (91.62%), as well as AffectNet 7 class +(67.31%) and 8 class (63.34%). The code is available at +https://github.com/zczcwh/POSTER. + +
+
+ comment: ICCV Workshop (AMFG) 2023 +
+
+
+
+
+ + ♻ ☆ CheckerPose: Progressive Dense Keypoint Localization for Object Pose + Estimation with Graph Neural Network ICCV2023 + + +
+ Estimating the 6-DoF pose of a rigid object from a single RGB image is a +crucial yet challenging task. Recent studies have shown the great potential of +dense correspondence-based solutions, yet improvements are still needed to +reach practical deployment. In this paper, we propose a novel pose estimation +algorithm named CheckerPose, which improves on three main aspects. Firstly, +CheckerPose densely samples 3D keypoints from the surface of the 3D object and +finds their 2D correspondences progressively in the 2D image. Compared to +previous solutions that conduct dense sampling in the image space, our strategy +enables the correspondence searching in a 2D grid (i.e., pixel coordinate). +Secondly, for our 3D-to-2D correspondence, we design a compact binary code +representation for 2D image locations. This representation not only allows for +progressive correspondence refinement but also converts the correspondence +regression to a more efficient classification problem. Thirdly, we adopt a +graph neural network to explicitly model the interactions among the sampled 3D +keypoints, further boosting the reliability and accuracy of the +correspondences. Together, these novel components make CheckerPose a strong +pose estimation algorithm. When evaluated on the popular Linemod, Linemod-O, +and YCB-V object pose estimation benchmarks, CheckerPose clearly boosts the +accuracy of correspondence-based methods and achieves state-of-the-art +performances. Code is available at https://github.com/RuyiLian/CheckerPose. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ FemtoDet: An Object Detection Baseline for Energy Versus Performance + Tradeoffs ICCV 2023 + + +
+ Efficient detectors for edge devices are often optimized for parameter count
+or speed metrics, which correlate only weakly with the energy consumption of
+detectors.
+ However, some vision applications of convolutional neural networks, such as
+always-on surveillance cameras, operate under critical energy constraints.
+ This paper aims to serve as a baseline by designing detectors that reach
+tradeoffs between energy and performance from two perspectives:
+ 1) We extensively analyze various CNNs to identify low-energy architectures,
+including the choice of activation functions, convolution operators, and
+feature fusion structures on necks. These underappreciated details in past work
+seriously affect the energy consumption of detectors;
+ 2) To break through the dilemmatic energy-performance problem, we propose an
+energy-driven balanced detector built from the discovered low-energy
+components, named \textit{FemtoDet}.
+ In addition to the novel construction, we improve FemtoDet through convolution
+and training strategy optimizations.
+ Specifically, we develop a new instance boundary enhancement (IBE) module for
+convolution optimization to overcome the contradiction between the limited
+capacity of CNNs and detection tasks in diverse spatial representations, and
+propose a recursive warm-restart (RecWR) training strategy to escape the
+sub-optimization of light-weight detectors by considering the data shift
+produced by popular augmentations.
+ As a result, FemtoDet with only 68.77k parameters achieves a competitive
+score of 46.3 AP50 on PASCAL VOC and 1.11 W $\&$ 64.47 FPS on Qualcomm
+Snapdragon 865 CPU platforms.
+ Extensive experiments on the COCO and TJU-DHD datasets indicate that the
+proposed method achieves competitive results in diverse scenes.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Foiling Explanations in Deep Neural Networks + + +
+ Deep neural networks (DNNs) have greatly impacted numerous fields over the +past decade. Yet despite exhibiting superb performance over many problems, +their black-box nature still poses a significant challenge with respect to +explainability. Indeed, explainable artificial intelligence (XAI) is crucial in +several fields, wherein the answer alone -- sans a reasoning of how said answer +was derived -- is of little value. This paper uncovers a troubling property of +explanation methods for image-based DNNs: by making small visual changes to the +input image -- hardly influencing the network's output -- we demonstrate how +explanations may be arbitrarily manipulated through the use of evolution +strategies. Our novel algorithm, AttaXAI, a model-agnostic, adversarial attack +on XAI algorithms, only requires access to the output logits of a classifier +and to the explanation map; these weak assumptions render our approach highly +useful where real-world models and data are concerned. We compare our method's +performance on two benchmark datasets -- CIFAR100 and ImageNet -- using four +different pretrained deep-learning models: VGG16-CIFAR100, VGG16-ImageNet, +MobileNet-CIFAR100, and Inception-v3-ImageNet. We find that the XAI methods can +be manipulated without the use of gradients or other model internals. Our novel +algorithm is successfully able to manipulate an image in a manner imperceptible +to the human eye, such that the XAI method outputs a specific explanation map. +To our knowledge, this is the first such method in a black-box setting, and we +believe it has significant value where explainability is desired, required, or +legally mandatory. + +
+
+ comment: Snir Vitrack Tamam and Raz Lapid contributed equally +
+
+
+
+
+ + ♻ ☆ Voxurf: Voxel-based Efficient and Accurate Neural Surface Reconstruction ICLR 2023 + + +
+ Neural surface reconstruction aims to reconstruct accurate 3D surfaces based +on multi-view images. Previous methods based on neural volume rendering mostly +train a fully implicit model with MLPs, which typically require hours of +training for a single scene. Recent efforts explore the explicit volumetric +representation to accelerate the optimization via memorizing significant +information with learnable voxel grids. However, existing voxel-based methods +often struggle in reconstructing fine-grained geometry, even when combined with +an SDF-based volume rendering scheme. We reveal that this is because 1) the +voxel grids tend to break the color-geometry dependency that facilitates +fine-geometry learning, and 2) the under-constrained voxel grids lack spatial +coherence and are vulnerable to local minima. In this work, we present Voxurf, +a voxel-based surface reconstruction approach that is both efficient and +accurate. Voxurf addresses the aforementioned issues via several key designs, +including 1) a two-stage training procedure that attains a coherent coarse +shape and recovers fine details successively, 2) a dual color network that +maintains color-geometry dependency, and 3) a hierarchical geometry feature to +encourage information propagation across voxels. Extensive experiments show +that Voxurf achieves high efficiency and high quality at the same time. On the +DTU benchmark, Voxurf achieves higher reconstruction quality with a 20x +training speedup compared to previous fully implicit methods. Our code is +available at https://github.com/wutong16/Voxurf. + +
+
+ comment: ICLR 2023 Spotlight. Our code is available at + https://github.com/wutong16/Voxurf +
+
+
+
+
+ + ♻ ☆ NightHazeFormer: Single Nighttime Haze Removal Using Prior Query + Transformer + + +
+ Nighttime image dehazing is a challenging task due to the presence of multiple
+types of adverse degrading effects, including glow, haze, blur, noise, color
+distortion, and so on. However, most previous studies mainly focus on daytime
+image dehazing or on partial degradations presented in nighttime hazy scenes,
+which may lead to unsatisfactory restoration results. In this paper, we propose
+an end-to-end transformer-based framework for nighttime haze removal, called
+NightHazeFormer. Our proposed approach consists of two stages: supervised
+pre-training and semi-supervised fine-tuning. During the pre-training stage, we
+introduce two powerful priors into the transformer decoder to generate the
+non-learnable prior queries, which guide the model to extract specific
+degradations. For the fine-tuning, we combine the generated pseudo ground
+truths with input real-world nighttime hazy images as paired images and feed
+them into the synthetic domain to fine-tune the pre-trained model. This
+semi-supervised fine-tuning paradigm helps improve the generalization to the
+real domain. In addition, we also propose a large-scale synthetic dataset
+called UNREAL-NH to simulate real-world nighttime haze scenarios
+comprehensively. Extensive experiments on several synthetic and real-world
+datasets demonstrate the superiority of our NightHazeFormer over
+state-of-the-art nighttime haze removal methods, both visually and
+quantitatively.
+
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ AlignDet: Aligning Pre-training and Fine-tuning in Object Detection ICCV 2023 + + +
+ The paradigm of large-scale pre-training followed by downstream fine-tuning +has been widely employed in various object detection algorithms. In this paper, +we reveal discrepancies in data, model, and task between the pre-training and +fine-tuning procedure in existing practices, which implicitly limit the +detector's performance, generalization ability, and convergence speed. To this +end, we propose AlignDet, a unified pre-training framework that can be adapted +to various existing detectors to alleviate the discrepancies. AlignDet +decouples the pre-training process into two stages, i.e., image-domain and +box-domain pre-training. The image-domain pre-training optimizes the detection +backbone to capture holistic visual abstraction, and box-domain pre-training +learns instance-level semantics and task-aware concepts to initialize the parts +out of the backbone. By incorporating the self-supervised pre-trained +backbones, we can pre-train all modules for various detectors in an +unsupervised paradigm. As depicted in Figure 1, extensive experiments +demonstrate that AlignDet can achieve significant improvements across diverse +protocols, such as detection algorithm, model backbone, data setting, and +training schedule. For example, AlignDet improves FCOS by 5.3 mAP, RetinaNet by +2.1 mAP, Faster R-CNN by 3.3 mAP, and DETR by 2.3 mAP under fewer epochs. + +
+
+ comment: Camera Ready Version on ICCV 2023. Code and Models are publicly + available. Project Page: https://liming-ai.github.io/AlignDet +
+
+
+
+
+ + ♻ ☆ MMBench: Is Your Multi-modal Model an All-around Player? + + +
+ Large vision-language models have recently achieved remarkable progress, +exhibiting great perception and reasoning abilities concerning visual +information. However, how to effectively evaluate these large vision-language +models remains a major obstacle, hindering future model development. +Traditional benchmarks like VQAv2 or COCO Caption provide quantitative +performance measurements but suffer from a lack of fine-grained ability +assessment and non-robust evaluation metrics. Recent subjective benchmarks, +such as OwlEval, offer comprehensive evaluations of a model's abilities by +incorporating human labor, but they are not scalable and display significant +bias. In response to these challenges, we propose MMBench, a novel +multi-modality benchmark. MMBench methodically develops a comprehensive +evaluation pipeline, primarily comprised of two elements. The first element is +a meticulously curated dataset that surpasses existing similar benchmarks in +terms of the number and variety of evaluation questions and abilities. The +second element introduces a novel CircularEval strategy and incorporates the +use of ChatGPT. This implementation is designed to convert free-form +predictions into pre-defined choices, thereby facilitating a more robust +evaluation of the model's predictions. MMBench is a systematically-designed +objective benchmark for robustly evaluating the various abilities of +vision-language models. We hope MMBench will assist the research community in +better evaluating their models and encourage future advancements in this +domain. Project page: https://opencompass.org.cn/mmbench. + +
+
+
+
+
+ + ♻ ☆ Learning Human-Human Interactions in Images from Weak Textual + Supervision ICCV 2023 + + +
+ Interactions between humans are diverse and context-dependent, but previous +works have treated them as categorical, disregarding the heavy tail of possible +interactions. We propose a new paradigm of learning human-human interactions as +free text from a single still image, allowing for flexibility in modeling the +unlimited space of situations and relationships between people. To overcome the +absence of data labelled specifically for this task, we use knowledge +distillation applied to synthetic caption data produced by a large language +model without explicit supervision. We show that the pseudo-labels produced by +this procedure can be used to train a captioning model to effectively +understand human-human interactions in images, as measured by a variety of +metrics that measure textual and semantic faithfulness and factual groundedness +of our predictions. We further show that our approach outperforms SOTA image +captioning and situation recognition models on this task. We will release our +code and pseudo-labels along with Waldo and Wenda, a manually-curated test set +for still image human-human interaction understanding. + +
+
+ comment: To be presented at ICCV 2023. Project webpage: + https://learning-interactions.github.io +
+
+
+
+
+ + ♻ ☆ UDTIRI: An Open-Source Intelligent Road Inspection Benchmark Suite + + +
+ There is enormous potential to leverage powerful deep learning methods in the
+emerging field of urban digital twins. This is particularly true in the area of
+intelligent road inspection, where research and data are currently limited. To
+facilitate progress in this field, we have developed a well-labeled road
+pothole dataset named the Urban Digital Twins Intelligent Road Inspection
+(UDTIRI) dataset. We hope this dataset will enable the use of powerful deep
+learning methods in urban road inspection, providing algorithms with a more
+comprehensive understanding of the scene and maximizing their potential. Our
+dataset comprises 1000 images of potholes, captured in various scenarios with
+different lighting and humidity conditions. Our intention is to employ this
+dataset for object detection, semantic segmentation, and instance segmentation
+tasks. Our team has devoted significant effort to conducting a detailed
+statistical analysis and benchmarking a selection of representative algorithms
+from recent years. We also provide a multi-task platform for researchers to
+fully exploit the performance of various algorithms with the support of the
+UDTIRI dataset.
+
+
+ comment: Database webpage: https://www.udtiri.com/, Kaggle webpage: + https://www.kaggle.com/datasets/jiahangli617/udtiri +
+
+
+
+
+ + ♻ ☆ SparseNeRF: Distilling Depth Ranking for Few-shot Novel View Synthesis ICCV 2023 + + +
+ Neural Radiance Field (NeRF) significantly degrades when only a limited +number of views are available. To complement the lack of 3D information, +depth-based models, such as DSNeRF and MonoSDF, explicitly assume the +availability of accurate depth maps of multiple views. They linearly scale the +accurate depth maps as supervision to guide the predicted depth of few-shot +NeRFs. However, accurate depth maps are difficult and expensive to capture due +to wide-range depth distances in the wild. + In this work, we present a new Sparse-view NeRF (SparseNeRF) framework that +exploits depth priors from real-world inaccurate observations. The inaccurate +depth observations are either from pre-trained depth models or coarse depth +maps of consumer-level depth sensors. Since coarse depth maps are not strictly +scaled to the ground-truth depth maps, we propose a simple yet effective +constraint, a local depth ranking method, on NeRFs such that the expected depth +ranking of the NeRF is consistent with that of the coarse depth maps in local +patches. To preserve the spatial continuity of the estimated depth of NeRF, we +further propose a spatial continuity constraint to encourage the consistency of +the expected depth continuity of NeRF with coarse depth maps. Surprisingly, +with simple depth ranking constraints, SparseNeRF outperforms all +state-of-the-art few-shot NeRF methods (including depth-based models) on +standard LLFF and DTU datasets. Moreover, we collect a new dataset NVS-RGBD +that contains real-world depth maps from Azure Kinect, ZED 2, and iPhone 13 +Pro. Extensive experiments on NVS-RGBD dataset also validate the superiority +and generalizability of SparseNeRF. Code and dataset are available at +https://sparsenerf.github.io/. + +
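+ The local depth ranking constraint can be illustrated with a simple pairwise
+hinge loss: for sampled pixel pairs, the predicted NeRF depths should preserve
+the ordering given by the coarse depth map. Pair sampling, the margin, and
+restricting pairs to local patches are simplified here.
+
+```python
+import torch
+
+def depth_ranking_loss(pred_depth: torch.Tensor, coarse_depth: torch.Tensor,
+                       margin: float = 1e-4) -> torch.Tensor:
+    """Illustrative pairwise ranking loss between predicted and coarse depths."""
+    n = pred_depth.numel()
+    idx = torch.randperm(n, device=pred_depth.device)
+    a, b = idx[: n // 2], idx[n // 2 : 2 * (n // 2)]
+    sign = torch.sign(coarse_depth.flatten()[a] - coarse_depth.flatten()[b])
+    diff = pred_depth.flatten()[a] - pred_depth.flatten()[b]
+    # Penalize pairs whose predicted ordering disagrees with the coarse ordering.
+    return torch.clamp(margin - sign * diff, min=0.0).mean()
+```
+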
+
+ comment: Accepted by ICCV 2023, Project page: https://sparsenerf.github.io/ +
+
+
+
+
+ + ♻ ☆ Neural LiDAR Fields for Novel View Synthesis ICCV 2023 + + +
+ We present Neural Fields for LiDAR (NFL), a method to optimise a neural field +scene representation from LiDAR measurements, with the goal of synthesizing +realistic LiDAR scans from novel viewpoints. NFL combines the rendering power +of neural fields with a detailed, physically motivated model of the LiDAR +sensing process, thus enabling it to accurately reproduce key sensor behaviors +like beam divergence, secondary returns, and ray dropping. We evaluate NFL on +synthetic and real LiDAR scans and show that it outperforms explicit +reconstruct-then-simulate methods as well as other NeRF-style methods on LiDAR +novel view synthesis task. Moreover, we show that the improved realism of the +synthesized views narrows the domain gap to real scans and translates to better +registration and semantic segmentation performance. + +
+
+ comment: ICCV 2023 - camera ready. Project page: + https://research.nvidia.com/labs/toronto-ai/nfl/ +
+
+
+
+
+ + ♻ ☆ Defense-Prefix for Preventing Typographic Attacks on CLIP ICCV2023 + + +
+ Vision-language pre-training models (VLPs) have exhibited revolutionary +improvements in various vision-language tasks. In VLP, some adversarial attacks +fool a model into false or absurd classifications. Previous studies addressed +these attacks by fine-tuning the model or changing its architecture. However, +these methods risk losing the original model's performance and are difficult to +apply to downstream tasks. In particular, their applicability to other tasks +has not been considered. In this study, we addressed the reduction of the +impact of typographic attacks on CLIP without changing the model parameters. To +achieve this, we expand the idea of ``prefix learning'' and introduce our +simple yet effective method: Defense-Prefix (DP), which inserts the DP token +before a class name to make words ``robust'' against typographic attacks. Our +method can be easily applied to downstream tasks, such as object detection, +because the proposed method is independent of the model parameters. Our method +significantly improves the accuracy of classification tasks for typographic +attack datasets, while maintaining the zero-shot capabilities of the model. In +addition, we leverage our proposed method for object detection, demonstrating +its high applicability and effectiveness. The codes and datasets are available +at https://github.com/azuma164/Defense-Prefix. + +
+
+ comment: ICCV2023 Workshop +
+
+
+
+
+
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ InTune: Reinforcement Learning-based Data Pipeline Optimization for Deep + Recommendation Models RecSys 2023 + + +
+ Deep learning-based recommender models (DLRMs) have become an essential +component of many modern recommender systems. Several companies are now +building large compute clusters reserved only for DLRM training, driving new +interest in cost- and time-saving optimizations. The systems challenges faced +in this setting are unique; while typical deep learning training jobs are +dominated by model execution, the most important factor in DLRM training +performance is often online data ingestion. + In this paper, we explore the unique characteristics of this data ingestion +problem and provide insights into DLRM training pipeline bottlenecks and +challenges. We study real-world DLRM data processing pipelines taken from our +compute cluster at Netflix to observe the performance impacts of online +ingestion and to identify shortfalls in existing pipeline optimizers. We find +that current tooling either yields sub-optimal performance, frequent crashes, +or else requires impractical cluster re-organization to adopt. Our studies lead +us to design and build a new solution for data pipeline optimization, InTune. + InTune employs a reinforcement learning (RL) agent to learn how to distribute +the CPU resources of a trainer machine across a DLRM data pipeline to more +effectively parallelize data loading and improve throughput. Our experiments +show that InTune can build an optimized data pipeline configuration within only +a few minutes, and can easily be integrated into existing training workflows. +By exploiting the responsiveness and adaptability of RL, InTune achieves higher +online data ingestion rates than existing optimizers, thus reducing idle times +in model execution and increasing efficiency. We apply InTune to our real-world +cluster, and find that it increases data ingestion throughput by as much as +2.29X versus state-of-the-art data pipeline optimizers while also improving +both CPU & GPU utilization. + 
+
+ comment: Accepted at RecSys 2023. 11 pages, 2 pages of references. 8 figures + with 2 tables +
+
+
+
+
+ + ☆ Transforming Sentiment Analysis in the Financial Domain with ChatGPT + + +
+ Financial sentiment analysis plays a crucial role in decoding market trends +and guiding strategic trading decisions. Despite the deployment of advanced +deep learning techniques and language models to refine sentiment analysis in +finance, this study breaks new ground by investigating the potential of large +language models, particularly ChatGPT 3.5, in financial sentiment analysis, +with a strong emphasis on the foreign exchange market (forex). Employing a +zero-shot prompting approach, we examine multiple ChatGPT prompts on a +meticulously curated dataset of forex-related news headlines, measuring +performance using metrics such as precision, recall, f1-score, and Mean +Absolute Error (MAE) of the sentiment class. Additionally, we probe the +correlation between predicted sentiment and market returns as an additional +evaluation approach. ChatGPT, compared to FinBERT, a well-established sentiment +analysis model for financial texts, exhibited approximately 35\% enhanced +performance in sentiment classification and a 36\% higher correlation with +market returns. By underlining the significance of prompt engineering, +particularly in zero-shot contexts, this study spotlights ChatGPT's potential +to substantially boost sentiment analysis in financial applications. By sharing +the utilized dataset, our intention is to stimulate further research and +advancements in the field of financial services. + +
+
+ comment: 10 pages, 8 figures, Preprint submitted to Machine Learning with + Applications +
+
+
+
+
+ + ☆ CDR: Conservative Doubly Robust Learning for Debiased Recommendation + + +
+ In recommendation systems (RS), user behavior data is observational rather +than experimental, resulting in widespread bias in the data. Consequently, +tackling bias has emerged as a major challenge in the field of recommendation +systems. Recently, Doubly Robust Learning (DR) has gained significant attention +due to its remarkable performance and robust properties. However, our +experimental findings indicate that existing DR methods are severely impacted +by the presence of so-called Poisonous Imputation, where the imputation +significantly deviates from the truth and becomes counterproductive. + To address this issue, this work proposes a Conservative Doubly Robust strategy +(CDR), which filters imputations by scrutinizing their mean and variance. +Theoretical analyses show that CDR offers reduced variance and improved tail +bounds. In addition, our experimental investigations illustrate that CDR +significantly enhances performance and can indeed reduce the frequency of +poisonous imputation. + 
+
+
+
+
+ + ♻ ☆ Unified Matrix Factorization with Dynamic Multi-view Clustering + + +
+ Matrix factorization (MF) is a classical collaborative filtering algorithm +for recommender systems. It decomposes the user-item interaction matrix into a +product of low-dimensional user representation matrix and item representation +matrix. In typical recommendation scenarios, the user-item interaction paradigm +is usually a two-stage process and requires static clustering analysis of the +obtained user and item representations. The above process, however, is time and +computationally intensive, making it difficult to apply in real-time to +e-commerce or Internet of Things environments with billions of users and +trillions of items. To address this, we propose a unified matrix factorization +method based on dynamic multi-view clustering (MFDMC) that employs an +end-to-end training paradigm. Specifically, in each view, a user/item +representation is regarded as a weighted projection of all clusters. The +representation of each cluster is learnable, enabling the dynamic discarding of +bad clusters. Furthermore, we employ multi-view clustering to represent +multiple roles of users/items, effectively utilizing the representation space +and improving the interpretability of the user/item representations for +downstream tasks. Extensive experiments show that our proposed MFDMC achieves +state-of-the-art performance on real-world recommendation datasets. +Additionally, comprehensive visualization and ablation studies interpretably +confirm that our method provides meaningful representations for downstream +tasks of users/items. + +
+
+
+
+
+ + ♻ ☆ LabelPrompt: Effective Prompt-based Learning for Relation Classification + + +
+ Recently, prompt-based learning has gained popularity across many natural +language processing (NLP) tasks by reformulating them into a cloze-style format +to better align pre-trained language models (PLMs) with downstream tasks. +However, applying this approach to relation classification poses unique +challenges. Specifically, associating natural language words that fill the +masked token with semantic relation labels (\textit{e.g.} +\textit{``org:founded\_by''}) is difficult. To address this challenge, this +paper presents a novel prompt-based learning method, namely LabelPrompt, for +the relation classification task. Motivated by the intuition to ``GIVE MODEL +CHOICES!'', we first define additional tokens to represent relation labels, +regarding these tokens as the verbaliser with semantic initialisation, and +explicitly construct them with a prompt template method. Then, to mitigate +inconsistency between predicted relations and given entities, we implement an +entity-aware module with contrastive learning. Last, we conduct an attention +query strategy within the self-attention layer to differentiate prompt tokens +and sequence tokens. Together, these strategies enhance the adaptability of +prompt-based learning, especially when only small labelled datasets are +available. Comprehensive experiments on benchmark datasets demonstrate the +superiority of our method, particularly in the few-shot scenario. + 
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+
+
+
+ + Machine Learning 55 + +
+
+
+ + ☆ Effect of Choosing Loss Function when Using T-batching for + Representation Learning on Dynamic Networks + + +
+ Representation learning methods have revolutionized machine learning on +networks by converting discrete network structures into continuous domains. +However, dynamic networks that evolve over time pose new challenges. To address +this, dynamic representation learning methods have gained attention, offering +benefits like reduced learning time and improved accuracy by utilizing temporal +information. + T-batching is a valuable technique for training dynamic network models that +reduces training time while preserving vital conditions for accurate modeling. +However, we have identified a limitation in the training loss function used +with t-batching. Through mathematical analysis, we propose two alternative loss +functions that overcome these issues, resulting in enhanced training +performance. + We extensively evaluate the proposed loss functions on synthetic and +real-world dynamic networks. The results consistently demonstrate superior +performance compared to the original loss function. Notably, in a real-world +network characterized by diverse user interaction histories, the proposed loss +functions achieved more than 26.9% enhancement in Mean Reciprocal Rank (MRR) +and more than 11.8% improvement in Recall@10. These findings underscore the +efficacy of the proposed loss functions in dynamic network modeling. + +
+
+ comment: 29 pages, 10 figures, 4 tables, Submitted to Information Sciences +
+
+
+
+
+ + ☆ Optimizing Offensive Gameplan in the National Basketball Association + with Machine Learning + + +
+ Throughout the analytical revolution that has occurred in the NBA, the +development of specific metrics and formulas has given teams, coaches, and +players a new way to see the game. However, the question arises: how can we +verify any metrics? One method would simply be eyeball approximation (trying +out many different gameplans) and/or trial and error, an estimation-based and +costly approach. Another approach is to try to model already existing metrics +with a unique set of features using machine learning techniques. The key to +this approach is that, with the selected features, we can try to +gauge the effectiveness of these features combined, rather than using +individual analysis in simple metric evaluation. If we have an accurate model, +it can particularly help us determine the specifics of gameplan execution. In +this paper, the statistic ORTG (Offensive Rating, developed by Dean Oliver) was +found to have a correlation with different NBA playtypes using both a linear +regression model and a neural network regression model, although ultimately, a +neural network worked slightly better than linear regression. Using the +accuracy of the models as a justification, the next step was to optimize the +output of the model with test examples, which would demonstrate the combination +of features that best achieves a highly functioning offense. + 
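A minimal sketch of the modeling setup the abstract describes follows: an ORTG-like target is regressed on playtype features with both a linear model and a small neural network. The data below are synthetic placeholders, not the paper's dataset, and the feature count and model sizes are assumptions.

```python
# Illustrative only: synthetic playtype features and a synthetic ORTG-like target.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.random((500, 7))                                        # e.g., playtype shares
y = 100 + 15 * X[:, 0] - 10 * X[:, 3] + rng.normal(0, 1, 500)   # synthetic rating

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
lin = LinearRegression().fit(X_tr, y_tr)
mlp = MLPRegressor(hidden_layer_sizes=(32, 16), max_iter=2000,
                   random_state=0).fit(X_tr, y_tr)
print("linear R^2:", lin.score(X_te, y_te))
print("MLP R^2:   ", mlp.score(X_te, y_te))
```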
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ When Monte-Carlo Dropout Meets Multi-Exit: Optimizing Bayesian Neural + Networks on FPGA + + +
+ Bayesian Neural Networks (BayesNNs) have demonstrated their capability of +providing calibrated prediction for safety-critical applications such as +medical imaging and autonomous driving. However, the high algorithmic +complexity and the poor hardware performance of BayesNNs hinder their +deployment in real-life applications. To bridge this gap, this paper proposes a +novel multi-exit Monte-Carlo Dropout (MCD)-based BayesNN that achieves +well-calibrated predictions with low algorithmic complexity. To further reduce +the barrier to adopting BayesNNs, we propose a transformation framework that +can generate FPGA-based accelerators for multi-exit MCD-based BayesNNs. Several +novel optimization techniques are introduced to improve hardware performance. +Our experiments demonstrate that our auto-generated accelerator achieves higher +energy efficiency than CPU, GPU, and other state-of-the-art hardware +implementations. + +
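For readers unfamiliar with the underlying estimator, a generic Monte-Carlo Dropout prediction loop looks roughly like the sketch below. This is plain MCD only; the paper's multi-exit formulation and FPGA mapping are not shown, and the toy model shape is arbitrary.

```python
# Generic Monte-Carlo Dropout sketch: keep dropout active at inference and
# average several stochastic forward passes to estimate predictive uncertainty.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Dropout(p=0.2),
                      nn.Linear(64, 10))

def mc_dropout_predict(model: nn.Module, x: torch.Tensor, n_samples: int = 20):
    model.train()  # keep dropout layers stochastic at inference time
    with torch.no_grad():
        probs = torch.stack([torch.softmax(model(x), dim=-1)
                             for _ in range(n_samples)])
    return probs.mean(0), probs.var(0)   # predictive mean and variance

x = torch.randn(4, 16)
mean, var = mc_dropout_predict(model, x)
```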
+
+
+
+
+ + ☆ Generalizing Topological Graph Neural Networks with Paths + + +
+ While Graph Neural Networks (GNNs) have made significant strides in diverse +areas, they are hindered by a theoretical constraint known as the +1-Weisfeiler-Lehmann test. Even though the latest advancements in higher-order GNNs +can overcome this limitation, they typically center around certain graph +components like cliques or cycles. However, our investigation goes a different +route. We put emphasis on paths, which are inherent in every graph. We are able +to construct a more general topological perspective and form a bridge to +certain established theories about other topological domains. Interestingly, +without any assumptions on graph sub-structures, our approach surpasses earlier +techniques in this field, achieving state-of-the-art performance on several +benchmarks. + 
+
+
+
+
+ + ☆ An Ensemble Approach to Question Classification: Integrating Electra + Transformer, GloVe, and LSTM + + +
+ This paper introduces a novel ensemble approach for question classification +using state-of-the-art models: Electra, GloVe, and LSTM. The proposed model +is trained and evaluated on the TREC dataset, a well-established benchmark for +question classification tasks. The ensemble model combines the strengths of +Electra, a transformer-based model for language understanding, GloVe, global +vectors for word representation, and LSTM, a recurrent neural network variant, +providing a robust and efficient solution for question classification. +Extensive experiments were carried out to compare the performance of the +proposed ensemble approach with other cutting-edge models, such as BERT, +RoBERTa, and DistilBERT. Our results demonstrate that the ensemble model +outperforms these models across all evaluation metrics, achieving an accuracy +of 0.8 on the test set. These findings underscore the effectiveness of the +ensemble approach in enhancing the performance of question classification +tasks, and invite further exploration of ensemble methods in natural language +processing. + 
+
+
+
+
+ + ☆ Reinforcement Graph Clustering with Unknown Cluster Number + + +
+ Deep graph clustering, which aims to group nodes into disjoint clusters by +neural networks in an unsupervised manner, has attracted great attention in +recent years. Although the performance has been largely improved, the excellent +performance of the existing methods heavily relies on an accurately predefined +cluster number, which is not always available in the real-world scenario. To +enable the deep graph clustering algorithms to work without the guidance of the +predefined cluster number, we propose a new deep graph clustering method termed +Reinforcement Graph Clustering (RGC). In our proposed method, cluster number +determination and unsupervised representation learning are unified into a +uniform framework by the reinforcement learning mechanism. Concretely, the +discriminative node representations are first learned with the contrastive +pretext task. Then, to capture the clustering state accurately with both local +and global information in the graph, both node and cluster states are +considered. Subsequently, at each state, the qualities of different cluster +numbers are evaluated by the quality network, and the greedy action is executed +to determine the cluster number. In order to conduct feedback actions, the +clustering-oriented reward function is proposed to enhance the cohesion of the +same clusters and separate the different clusters. Extensive experiments +demonstrate the effectiveness and efficiency of our proposed method. The source +code of RGC is shared at https://github.com/yueliu1999/RGC and a collection +(papers, codes, and datasets) of deep graph clustering is shared at +https://github.com/yueliu1999/Awesome-Deep-Graph-Clustering on GitHub. + 
+
+
+
+
+ + ☆ Approximate and Weighted Data Reconstruction Attack in Federated + Learning + + +
+ Federated Learning (FL) is a distributed learning paradigm that enables +multiple clients to collaborate on building a machine learning model without +sharing their private data. Although FL is considered privacy-preserved by +design, recent data reconstruction attacks demonstrate that an attacker can +recover clients' training data based on the parameters shared in FL. However, +most existing methods fail to attack the most widely used horizontal Federated +Averaging (FedAvg) scenario, where clients share model parameters after +multiple local training steps. To tackle this issue, we propose an +interpolation-based approximation method, which makes attacking FedAvg +scenarios feasible by generating the intermediate model updates of the clients' +local training processes. Then, we design a layer-wise weighted loss function +to improve the data quality of reconstruction. We assign different weights to +model updates in different layers concerning the neural network structure, with +the weights tuned by Bayesian optimization. Finally, experimental results +validate the superiority of our proposed approximate and weighted attack (AWA) +method over the other state-of-the-art methods, as demonstrated by the +substantial improvement in different evaluation metrics for image data +reconstructions. + +
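One way to read the interpolation idea is sketched below: the unobserved intermediate models of a client's local training are approximated by interpolating between the model it received and the model it returned. The linear form and all names are assumptions made for illustration; the reconstruction attack built on top of these approximations is not shown.

```python
# Hedged sketch: approximate a client's unobserved intermediate models under
# FedAvg by interpolating between its start and end parameters.
import torch

def approximate_intermediate_models(w_start, w_end, num_local_steps):
    """Linearly interpolate parameter dicts to stand in for each local update."""
    models = []
    for t in range(1, num_local_steps + 1):
        alpha = t / num_local_steps
        models.append({k: (1 - alpha) * w_start[k] + alpha * w_end[k]
                       for k in w_start})
    return models

w0 = {"layer.weight": torch.zeros(4, 4)}   # model sent to the client
w1 = {"layer.weight": torch.ones(4, 4)}    # model returned after local training
steps = approximate_intermediate_models(w0, w1, num_local_steps=4)
print(len(steps), steps[0]["layer.weight"][0, 0].item())
```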
+
+
+
+
+ + ☆ SoK: Realistic Adversarial Attacks and Defenses for Intelligent Network + Intrusion Detection + + +
+ Machine Learning (ML) can be incredibly valuable to automate anomaly +detection and cyber-attack classification, improving the way that Network +Intrusion Detection (NID) is performed. However, despite the benefits of ML +models, they are highly susceptible to adversarial cyber-attack examples +specifically crafted to exploit them. A wide range of adversarial attacks have +been created and researchers have worked on various defense strategies to +safeguard ML models, but most were not intended for the specific constraints of +a communication network and its communication protocols, so they may lead to +unrealistic examples in the NID domain. This Systematization of Knowledge (SoK) +consolidates and summarizes the state-of-the-art adversarial learning +approaches that can generate realistic examples and could be used in real ML +development and deployment scenarios with real network traffic flows. This SoK +also describes the open challenges regarding the use of adversarial ML in the +NID domain, defines the fundamental properties that are required for an +adversarial example to be realistic, and provides guidelines for researchers to +ensure that their future experiments are adequate for a real communication +network. + +
+
+ comment: 31 pages, 3 tables, 6 figures, Computers and Security journal +
+
+
+
+
+ + ☆ SAILOR: Structural Augmentation Based Tail Node Representation Learning CIKM 2023 + + +
+ Graph Neural Networks (GNNs) have achieved state-of-the-art performance in +representation learning for graphs recently. However, the effectiveness of +GNNs, which capitalize on the key operation of message propagation, highly +depends on the quality of the topology structure. Most of the graphs in +real-world scenarios follow a long-tailed distribution on their node degrees, +that is, a vast majority of the nodes in the graph are tail nodes with only a +few connected edges. GNNs produce inferior node representations for tail nodes +since they lack structural information. In the pursuit of promoting the +expressiveness of GNNs for tail nodes, we explore how the deficiency of +structural information deteriorates the performance of tail nodes and propose a +general Structural Augmentation based taIL nOde Representation learning +framework, dubbed as SAILOR, which can jointly learn to augment the graph +structure and extract more informative representations for tail nodes. +Extensive experiments on public benchmark datasets demonstrate that SAILOR can +significantly improve the tail node representations and outperform the +state-of-the-art baselines. + +
+
+ comment: Accepted by CIKM 2023; Code is available at + https://github.com/Jie-Re/SAILO +
+
+
+
+
+ + ☆ Faithful to Whom? Questioning Interpretability Measures in NLP + + +
+ A common approach to quantifying model interpretability is to calculate +faithfulness metrics based on iteratively masking input tokens and measuring +how much the predicted label changes as a result. However, we show that such +metrics are generally not suitable for comparing the interpretability of +different neural text classifiers as the response to masked inputs is highly +model-specific. We demonstrate that iterative masking can produce large +variation in faithfulness scores between comparable models, and show that +masked samples are frequently outside the distribution seen during training. We +further investigate the impact of adversarial attacks and adversarial training +on faithfulness scores, and demonstrate the relevance of faithfulness measures +for analyzing feature salience in text adversarial attacks. Our findings +provide new insights into the limitations of current faithfulness metrics and +key considerations to utilize them appropriately. + +
+
+
+
+
+ + ☆ Neural Networks at a Fraction with Pruned Quaternions + + +
+ Contemporary state-of-the-art neural networks have increasingly large numbers +of parameters, which prevents their deployment on devices with limited +computational power. Pruning is one technique to remove unnecessary weights and +reduce resource requirements for training and inference. In addition, for ML +tasks where the input data is multi-dimensional, using higher-dimensional data +embeddings such as complex numbers or quaternions has been shown to reduce the +parameter count while maintaining accuracy. In this work, we conduct pruning on +real and quaternion-valued implementations of different architectures on +classification tasks. We find that for some architectures, at very high +sparsity levels, quaternion models provide higher accuracies than their real +counterparts. For example, at the task of image classification on CIFAR-10 +using Conv-4, at $3\%$ of the number of parameters as the original model, the +pruned quaternion version outperforms the pruned real by more than $10\%$. +Experiments on various network architectures and datasets show that for +deployment in extremely resource-constrained environments, a sparse quaternion +network might be a better candidate than a real sparse model of similar +architecture. + +
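The sparsity regime mentioned above (about 3% of the original weights) corresponds to roughly 97% unstructured sparsity; a generic global magnitude-pruning sketch for a real-valued model is shown below. The quaternion-valued counterpart, which the comparison requires, is omitted, and the toy architecture is not the paper's Conv-4.

```python
# Generic global magnitude pruning to ~97% sparsity (illustrative only).
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune

model = nn.Sequential(nn.Conv2d(3, 16, 3), nn.ReLU(), nn.Flatten(),
                      nn.Linear(16 * 30 * 30, 10))
params_to_prune = [(m, "weight") for m in model.modules()
                   if isinstance(m, (nn.Conv2d, nn.Linear))]
# keep only ~3% of the weights, i.e. 97% unstructured sparsity
prune.global_unstructured(params_to_prune,
                          pruning_method=prune.L1Unstructured, amount=0.97)
total = sum(m.weight.numel() for m, _ in params_to_prune)
remaining = sum(int(m.weight.count_nonzero()) for m, _ in params_to_prune)
print(f"remaining weights: {remaining}/{total}")
```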
+
+
+
+
+ + ☆ A Survey on Deep Neural Network Pruning-Taxonomy, Comparison, Analysis, + and Recommendations + + +
+ Modern deep neural networks, particularly recent large language models, come +with massive model sizes that require significant computational and storage +resources. To enable the deployment of modern models on resource-constrained +environments and accelerate inference time, researchers have increasingly +explored pruning techniques as a popular research direction in neural network +compression. However, there is a dearth of up-to-date comprehensive review +papers on pruning. To address this issue, in this survey, we provide a +comprehensive review of existing research works on deep neural network pruning +in a taxonomy of 1) universal/specific speedup, 2) when to prune, 3) how to +prune, and 4) fusion of pruning and other compression techniques. We then +provide a thorough comparative analysis of seven pairs of contrast settings for +pruning (e.g., unstructured/structured) and explore emerging topics, including +post-training pruning, different levels of supervision for pruning, and broader +applications (e.g., adversarial robustness) to shed light on the commonalities +and differences of existing methods and lay the foundation for further method +development. To facilitate future research, we build a curated collection of +datasets, networks, and evaluations on different applications. Finally, we +provide some valuable recommendations on selecting pruning methods and prospect +promising research directions. We build a repository at +https://github.com/hrcheng1066/awesome-pruning. + +
+
+
+
+
+ + ☆ Few-shot Class-incremental Learning: A Survey + + +
+ Few-shot Class-Incremental Learning (FSCIL) presents a unique challenge in +machine learning, as it necessitates the continuous learning of new classes +from sparse labeled training samples without forgetting previous knowledge. +While this field has seen recent progress, it remains an active area of +exploration. This paper aims to provide a comprehensive and systematic review +of FSCIL. In our in-depth examination, we delve into various facets of FSCIL, +encompassing the problem definition, the discussion of primary challenges of +unreliable empirical risk minimization and the stability-plasticity dilemma, +general schemes, and relevant problems of incremental learning and few-shot +learning. Besides, we offer an overview of benchmark datasets and evaluation +metrics. Furthermore, we introduce the classification methods in FSCIL from +data-based, structure-based, and optimization-based approaches and the object +detection methods in FSCIL from anchor-free and anchor-based approaches. Beyond +these, we illuminate several promising research directions within FSCIL that +merit further investigation. + +
+
+
+
+
+ + ☆ Discovering the Symptom Patterns of COVID-19 from Recovered and Deceased + Patients Using Apriori Association Rule Mining + + +
+ The COVID-19 pandemic has had a devastating impact globally, claiming millions of +lives and causing significant social and economic disruptions. In order to +optimize decision-making and allocate limited resources, it is essential to +identify COVID-19 symptoms and determine the severity of each case. Machine +learning algorithms offer a potent tool in the medical field, particularly in +mining clinical datasets for useful information and guiding scientific +decisions. Association rule mining is a machine learning technique for +extracting hidden patterns from data. This paper presents an application of +the Apriori association rule mining algorithm to discover symptom patterns +from COVID-19 patients. The study, using 2,875 patient records, identified +the most common symptoms as apnea (72%), cough (64%), fever (59%), weakness +(18%), myalgia (14.5%), and sore throat (12%). The proposed method provides +clinicians with valuable insight into the disease that can assist them in managing +and treating it effectively. + 
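For illustration, mining symptom co-occurrence patterns with the Apriori algorithm can be done along the lines below; the toy records, thresholds, and the use of the mlxtend library are assumptions, not the study's actual pipeline.

```python
# Hypothetical Apriori symptom-pattern mining with mlxtend (made-up records).
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

records = [
    ["cough", "fever", "apnea"],
    ["cough", "apnea"],
    ["fever", "weakness", "apnea"],
    ["cough", "fever", "sore throat"],
]
te = TransactionEncoder()
onehot = pd.DataFrame(te.fit_transform(records), columns=te.columns_)
frequent = apriori(onehot, min_support=0.5, use_colnames=True)
rules = association_rules(frequent, metric="confidence", min_threshold=0.6)
print(rules[["antecedents", "consequents", "support", "confidence"]])
```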
+
+
+
+
+ + ☆ Heterogeneous Multi-Agent Reinforcement Learning via Mirror Descent + Policy Optimization + + +
+ This paper presents an extension of the Mirror Descent method to overcome +challenges in cooperative Multi-Agent Reinforcement Learning (MARL) settings, +where agents have varying abilities and individual policies. The proposed +Heterogeneous-Agent Mirror Descent Policy Optimization (HAMDPO) algorithm +utilizes the multi-agent advantage decomposition lemma to enable efficient +policy updates for each agent while ensuring overall performance improvements. +By iteratively updating agent policies through an approximate solution of the +trust-region problem, HAMDPO guarantees stability and improves performance. +Moreover, the HAMDPO algorithm is capable of handling both continuous and +discrete action spaces for heterogeneous agents in various MARL problems. We +evaluate HAMDPO on Multi-Agent MuJoCo and StarCraftII tasks, demonstrating its +superiority over state-of-the-art algorithms such as HATRPO and HAPPO. These +results suggest that HAMDPO is a promising approach for solving cooperative +MARL problems and could potentially be extended to address other challenging +problems in the field of MARL. + +
+
+
+
+
+ + ☆ Weighted Sparse Partial Least Squares for Joint Sample and Feature + Selection + + +
+ Sparse Partial Least Squares (sPLS) is a common dimensionality reduction +technique for data fusion, which projects data samples from two views by +seeking linear combinations with a small number of variables with the maximum +variance. However, sPLS extracts the combinations between two data sets with +all data samples so that it cannot detect latent subsets of samples. To extend +the application of sPLS by identifying a specific subset of samples and removing +outliers, we propose an $\ell_\infty/\ell_0$-norm constrained weighted sparse +PLS ($\ell_\infty/\ell_0$-wsPLS) method for joint sample and feature selection, +where the $\ell_\infty/\ell_0$-norm constraints are used to select a subset of +samples. We prove that the $\ell_\infty/\ell_0$-norm constraints satisfy the +Kurdyka-Łojasiewicz property, so that a globally convergent algorithm is +developed to solve the problem. Moreover, multi-view data sharing the same set of samples are +available in various real problems. To this end, we extend the +$\ell_\infty/\ell_0$-wsPLS model and propose two multi-view wsPLS models for +multi-view data fusion. We develop an efficient iterative algorithm for each +multi-view wsPLS model and show its convergence property. Numerical +and biomedical data experiments demonstrate the efficiency of the proposed +methods. + 
+
+
+
+
+ + ☆ Probabilistic Imputation for Time-series Classification with Missing + Data + + +
+ Multivariate time series data for real-world applications typically contain a +significant amount of missing values. The dominant approach for classification +with such missing values is to impute them heuristically with specific values +(zero, mean, values of adjacent time-steps) or learnable parameters. However, +these simple strategies do not take the data generative process into account, +and more importantly, do not effectively capture the uncertainty in prediction +due to the multiple possibilities for the missing values. In this paper, we +propose a novel probabilistic framework for classification with multivariate +time series data with missing values. Our model consists of two parts; a deep +generative model for missing value imputation and a classifier. Extending the +existing deep generative models to better capture structures of time-series +data, our deep generative model part is trained to impute the missing values in +multiple plausible ways, effectively modeling the uncertainty of the +imputation. The classifier part takes the time series data along with the +imputed missing values and classifies signals, and is trained to capture the +predictive uncertainty due to the multiple possibilities of imputations. +Importantly, we show that na\"ively combining the generative model and the +classifier could result in trivial solutions where the generative model does +not produce meaningful imputations. To resolve this, we present a novel +regularization technique that can promote the model to produce useful +imputation values that help classification. Through extensive experiments on +real-world time series data with missing values, we demonstrate the +effectiveness of our method. + +
+
+
+
+
+ + ☆ Precipitation nowcasting with generative diffusion models + + +
+ In recent years, traditional numerical methods for accurate weather prediction +have been increasingly challenged by deep learning methods. Numerous historical +datasets used for short and medium-range weather forecasts are typically +organized into a regular spatial grid structure. This arrangement closely +resembles images: each weather variable can be visualized as a map or, when +considering the temporal axis, as a video. Several classes of generative +models, including Generative Adversarial Networks, Variational Autoencoders, +and the recent Denoising Diffusion Models, have largely proved their +applicability to the next-frame prediction problem, and it is thus natural to test +their performance on weather prediction benchmarks. Diffusion models are +particularly appealing in this context, due to the intrinsically probabilistic +nature of weather forecasting: what we are really interested in modeling is the +probability distribution of weather indicators, whose expected value is the +most likely prediction. + In our study, we focus on a specific subset of the ERA-5 dataset, which +includes hourly data pertaining to Central Europe from the years 2016 to 2021. +Within this context, we examine the efficacy of diffusion models in handling +the task of precipitation nowcasting. Our work is conducted in comparison to +the performance of well-established U-Net models, as documented in the existing +literature. Our proposed approach of Generative Ensemble Diffusion (GED) +utilizes a diffusion model to generate a set of possible weather scenarios, +which are then amalgamated into a probable prediction via the use of a +post-processing network. This approach substantially outperformed recent deep learning +models in terms of overall performance. + 
+
+ comment: 21 pages, 6 figures +
+
+
+
+
+ + ☆ Generalized Independent Noise Condition for Estimating Causal Structure + with Latent Variables + + +
+ We investigate the challenging task of learning causal structure in the +presence of latent variables, including locating latent variables and +determining their quantity, and identifying causal relationships among both +latent and observed variables. To address this, we propose a Generalized +Independent Noise (GIN) condition for linear non-Gaussian acyclic causal models +that incorporate latent variables, which establishes the independence between a +linear combination of certain measured variables and some other measured +variables. Specifically, for two observed random vectors $\bf{Y}$ and $\bf{Z}$, +GIN holds if and only if $\omega^{\intercal}\mathbf{Y}$ and $\mathbf{Z}$ are +independent, where $\omega$ is a non-zero parameter vector determined by the +cross-covariance between $\mathbf{Y}$ and $\mathbf{Z}$. We then give necessary +and sufficient graphical criteria of the GIN condition in linear non-Gaussian +acyclic causal models. Roughly speaking, GIN implies the existence of an +exogenous set $\mathcal{S}$ relative to the parent set of $\mathbf{Y}$ (w.r.t. +the causal ordering), such that $\mathcal{S}$ d-separates $\mathbf{Y}$ from +$\mathbf{Z}$. Interestingly, we find that the independent noise condition +(i.e., if there is no confounder, causes are independent of the residual +derived from regressing the effect on the causes) can be seen as a special case +of GIN. With such a connection between GIN and latent causal structures, we +further leverage the proposed GIN condition, together with a well-designed +search procedure, to efficiently estimate Linear, Non-Gaussian Latent +Hierarchical Models (LiNGLaHs), where latent confounders may also be causally +related and may even follow a hierarchical structure. We show that the +underlying causal structure of a LiNGLaH is identifiable in light of GIN +conditions under mild assumptions. Experimental results show the effectiveness +of the proposed approach. + +
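A toy numerical check of the GIN condition stated above can be set up as follows: with one latent confounder, ω is taken from the null space of the cross-covariance between Y and Z, and ω^⊤Y should then be (close to) independent of Z. This is only an illustration of the condition itself, not the paper's estimation or search procedure; the independence probe used here is a crude correlation check and the data-generating weights are arbitrary.

```python
# Toy GIN check on synthetic data (illustrative only).
import numpy as np

rng = np.random.default_rng(0)
n = 20000
L = rng.uniform(-1, 1, n)                                  # non-Gaussian latent confounder
Y = np.stack([2.0 * L + 0.3 * rng.uniform(-1, 1, n),
              1.0 * L + 0.3 * rng.uniform(-1, 1, n)])      # shape (2, n)
Z = (0.5 * L + 0.3 * rng.uniform(-1, 1, n)).reshape(1, n)  # shape (1, n)

# omega spans the null space of E[Z Y^T], i.e. omega^T E[Y Z^T] = 0
cross_cov = (Y - Y.mean(1, keepdims=True)) @ (Z - Z.mean(1, keepdims=True)).T / n
_, _, vt = np.linalg.svd(cross_cov.T)
omega = vt[-1]
residual = omega @ Y                                       # omega^T Y

# crude independence probe: correlations of the residual with Z and Z^2
print(np.corrcoef(residual, Z[0])[0, 1], np.corrcoef(residual, Z[0] ** 2)[0, 1])
```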
+
+
+
+
+ + ☆ Estimating and Incentivizing Imperfect-Knowledge Agents with Hidden + Rewards + + +
+ In practice, incentive providers (i.e., principals) often cannot observe the +reward realizations of incentivized agents, which is in contrast to many +principal-agent models that have been previously studied. This information +asymmetry challenges the principal to consistently estimate the agent's unknown +rewards by solely watching the agent's decisions, which becomes even more +challenging when the agent has to learn its own rewards. This complex setting +is observed in various real-life scenarios ranging from renewable energy +storage contracts to personalized healthcare incentives. Hence, it offers not +only interesting theoretical questions but also wide practical relevance. This +paper explores a repeated adverse selection game between a self-interested +learning agent and a learning principal. The agent tackles a multi-armed bandit +(MAB) problem to maximize their expected reward plus incentive. On top of the +agent's learning, the principal trains a parallel algorithm and faces a +trade-off between consistently estimating the agent's unknown rewards and +maximizing their own utility by offering adaptive incentives to lead the agent. +For a non-parametric model, we introduce an estimator whose only input is the +history of principal's incentives and agent's choices. We unite this estimator +with a proposed data-driven incentive policy within a MAB framework. Without +restricting the type of the agent's algorithm, we prove finite-sample +consistency of the estimator and a rigorous regret bound for the principal by +considering the sequential externality imposed by the agent. Lastly, our +theoretical results are reinforced by simulations justifying applicability of +our framework to green energy aggregator contracts. + +
+
+ comment: 72 pages, 6 figures. arXiv admin note: text overlap with + arXiv:2304.07407 +
+
+
+
+
+ + ☆ Learning on Graphs with Out-of-Distribution Nodes KDD'22 + + +
+ Graph Neural Networks (GNNs) are state-of-the-art models for performing +prediction tasks on graphs. While existing GNNs have shown great performance on +various tasks related to graphs, little attention has been paid to the scenario +where out-of-distribution (OOD) nodes exist in the graph during training and +inference. Borrowing the concept from CV and NLP, we define OOD nodes as nodes +with labels unseen from the training set. Since a lot of networks are +automatically constructed by programs, real-world graphs are often noisy and +may contain nodes from unknown distributions. In this work, we define the +problem of graph learning with out-of-distribution nodes. Specifically, we aim +to accomplish two tasks: 1) detect nodes which do not belong to the known +distribution and 2) classify the remaining nodes to be one of the known +classes. We demonstrate that the connection patterns in graphs are informative +for outlier detection, and propose Out-of-Distribution Graph Attention Network +(OODGAT), a novel GNN model which explicitly models the interaction between +different kinds of nodes and separate inliers from outliers during feature +propagation. Extensive experiments show that OODGAT outperforms existing +outlier detection methods by a large margin, while being better or comparable +in terms of in-distribution classification. + +
+
+ comment: Accepted by KDD'22 +
+
+
+
+
+ + ☆ The Hard-Constraint PINNs for Interface Optimal Control Problems + + +
+ We show that the physics-informed neural networks (PINNs), in combination +with some recently developed discontinuity capturing neural networks, can be +applied to solve optimal control problems subject to partial differential +equations (PDEs) with interfaces and some control constraints. The resulting +algorithm is mesh-free and scalable to different PDEs, and it ensures the +control constraints rigorously. Since the boundary and interface conditions, as +well as the PDEs, are all treated as soft constraints by lumping them into a +weighted loss function, it is necessary to learn them simultaneously and there +is no guarantee that the boundary and interface conditions can be satisfied +exactly. This immediately causes difficulties in tuning the weights in the +corresponding loss function and training the neural networks. To tackle these +difficulties and guarantee the numerical accuracy, we propose to impose the +boundary and interface conditions as hard constraints in PINNs by developing a +novel neural network architecture. The resulting hard-constraint PINNs approach +guarantees that both the boundary and interface conditions can be satisfied +exactly and they are decoupled from the learning of the PDEs. Its efficiency is +promisingly validated by some elliptic and parabolic interface optimal control +problems. + +
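The flavor of a hard constraint can be seen in the classical ansatz below, where a Dirichlet boundary condition is satisfied by construction rather than through a penalty term. This is a standard one-dimensional illustration under assumed boundary values, not the paper's interface-aware architecture for optimal control problems.

```python
# Sketch: impose u(0) = u(1) = 0 exactly via the ansatz u(x) = x * (1 - x) * N(x).
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(1, 32), nn.Tanh(), nn.Linear(32, 32), nn.Tanh(),
                    nn.Linear(32, 1))

def u(x: torch.Tensor) -> torch.Tensor:
    # boundary values are exact regardless of the network output
    return x * (1.0 - x) * net(x)

x = torch.linspace(0.0, 1.0, 5).reshape(-1, 1)
print(u(x).flatten())   # first and last entries are exactly zero
```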
+
+
+
+
+ + ☆ Generating observation guided ensembles for data assimilation with + denoising diffusion probabilistic model + + +
+ This paper presents an ensemble data assimilation method using pseudo +ensembles generated by a denoising diffusion probabilistic model. Since the model +is trained on noisy and sparse observation data, it can produce +divergent ensembles close to observations. Thanks to the variance in the generated +ensembles, our proposed method displays better performance than the +well-established ensemble data assimilation method when the simulation model is +imperfect. + 
+
+
+
+
+ + ☆ Understanding the robustness difference between stochastic gradient + descent and adaptive gradient methods + + +
+ Stochastic gradient descent (SGD) and adaptive gradient methods, such as Adam +and RMSProp, have been widely used in training deep neural networks. We +empirically show that while the difference between the standard generalization +performance of models trained using these methods is small, those trained using +SGD exhibit far greater robustness under input perturbations. Notably, our +investigation demonstrates the presence of irrelevant frequencies in natural +datasets, where alterations do not affect models' generalization performance. +However, models trained with adaptive methods show sensitivity to these +changes, suggesting that their use of irrelevant frequencies can lead to +solutions sensitive to perturbations. To better understand this difference, we +study the learning dynamics of gradient descent (GD) and sign gradient descent +(signGD) on a synthetic dataset that mirrors natural signals. With a +three-dimensional input space, the models optimized with GD and signGD have +standard risks close to zero but vary in their adversarial risks. Our result +shows that linear models' robustness to $\ell_2$-norm bounded changes is +inversely proportional to the model parameters' weight norm: a smaller weight +norm implies better robustness. In the context of deep learning, our +experiments show that SGD-trained neural networks show smaller Lipschitz +constants, explaining the better robustness to input perturbations than those +trained with adaptive gradient methods. + +
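The linear-model intuition can be reproduced on a toy problem along the lines below: full-batch gradient descent and its sign-based variant are run on a synthetic separable dataset with an irrelevant feature, and the resulting weight norms and normalized margins are compared. The paper's analysis predicts a larger weight norm, and hence less robustness to l2-bounded perturbations, for the sign-based update; the exponential loss, step sizes, and data here are illustrative assumptions.

```python
# Toy comparison of GD vs. sign-GD on a separable linear problem (illustrative only).
import numpy as np

rng = np.random.default_rng(0)
n, d = 200, 3
X = rng.normal(size=(n, d))
y = np.sign(X @ np.array([1.0, 0.5, 0.0]))   # third feature is irrelevant

def train(sign_update: bool, lr: float = 0.01, steps: int = 2000) -> np.ndarray:
    w = np.zeros(d)
    for _ in range(steps):
        # gradient of the exponential loss mean(exp(-y * X w))
        grad = -(y[:, None] * X * np.exp(-y * (X @ w))[:, None]).mean(0)
        w -= lr * (np.sign(grad) if sign_update else grad)
    return w

w_gd, w_sign = train(False), train(True)
for name, w in [("GD", w_gd), ("signGD", w_sign)]:
    margin = (y * (X @ w)).min() / np.linalg.norm(w)
    print(f"{name}: weight norm {np.linalg.norm(w):.2f}, normalized margin {margin:.3f}")
```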
+
+
+
+
+ + ☆ Camouflaged Image Synthesis Is All You Need to Boost Camouflaged + Detection + + +
+ Camouflaged objects that blend into natural scenes pose significant +challenges for deep-learning models to detect and synthesize. While camouflaged +object detection is a crucial task in computer vision with diverse real-world +applications, this research topic has been constrained by limited data +availability. We propose a framework for synthesizing camouflage data to +enhance the detection of camouflaged objects in natural scenes. Our approach +employs a generative model to produce realistic camouflage images, which can be +used to train existing object detection models. Specifically, we use a +camouflage environment generator supervised by a camouflage distribution +classifier to synthesize the camouflage images, which are then fed into our +generator to expand the dataset. Our framework outperforms the current +state-of-the-art method on three datasets (COD10k, CAMO, and CHAMELEON), +demonstrating its effectiveness in improving camouflaged object detection. This +approach can serve as a plug-and-play data generation and augmentation module +for existing camouflaged object detection tasks and provides a novel way to +introduce more diversity and distributions into current camouflage datasets. + +
+
+
+
+
+ + ☆ SimMatchV2: Semi-Supervised Learning with Graph Consistency + + +
+ Semi-supervised image classification is one of the most fundamental problems +in computer vision, and it significantly reduces the need for human labor. In +this paper, we introduce a new semi-supervised learning algorithm, SimMatchV2, +which formulates various consistency regularizations between labeled and +unlabeled data from the graph perspective. In SimMatchV2, we regard the +augmented view of a sample as a node, which consists of a label and its +corresponding representation. Different nodes are connected by edges, +whose weights are given by the similarity of the node representations. Inspired by +message passing and node classification in graph theory, we propose four +types of consistencies, namely 1) node-node consistency, 2) node-edge +consistency, 3) edge-edge consistency, and 4) edge-node consistency. We also +uncover that a simple feature normalization can reduce the gap in feature +norm between different augmented views, significantly improving the performance +of SimMatchV2. Our SimMatchV2 has been validated on multiple semi-supervised +learning benchmarks. Notably, with ResNet-50 as our backbone and 300 epochs of +training, SimMatchV2 achieves 71.9\% and 76.2\% Top-1 Accuracy with 1\% and +10\% labeled examples on ImageNet, which significantly outperforms previous +methods and achieves state-of-the-art performance. Code and pre-trained models +are available at +\href{https://github.com/mingkai-zheng/SimMatchV2}{https://github.com/mingkai-zheng/SimMatchV2}. + 
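The feature-normalization observation can be illustrated generically: L2-normalizing the embeddings removes the norm gap between differently augmented views before similarities (edge weights) are computed. The tensors below are random stand-ins, not SimMatchV2 features.

```python
# Generic illustration of removing the feature-norm gap between augmented views.
import torch
import torch.nn.functional as F

weak = torch.randn(8, 128) * 1.0      # embeddings of weakly augmented views
strong = torch.randn(8, 128) * 3.0    # strong augmentations often yield larger norms
print(weak.norm(dim=1).mean(), strong.norm(dim=1).mean())   # clear norm gap

weak_n, strong_n = F.normalize(weak, dim=1), F.normalize(strong, dim=1)
similarity = weak_n @ strong_n.t()    # edge weights now live on the unit sphere
print(weak_n.norm(dim=1).mean(), strong_n.norm(dim=1).mean())
```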
+
+
+
+
+ + ☆ MDB: Interactively Querying Datasets and Models + + +
+ As models are trained and deployed, developers need to be able to +systematically debug errors that emerge in the machine learning pipeline. We +present MDB, a debugging framework for interactively querying datasets and +models. MDB integrates functional programming with relational algebra to build +expressive queries over a database of datasets and model predictions. Queries +are reusable and easily modified, enabling debuggers to rapidly iterate and +refine queries to discover and characterize errors and model behaviors. We +evaluate MDB on object detection, bias discovery, image classification, and +data imputation tasks across self-driving videos, large language models, and +medical records. Our experiments show that MDB enables up to 10x faster and +40\% shorter queries than other baselines. In a user study, we find developers +can successfully construct complex queries that describe errors of machine +learning models. + +
+
+
+
+
+ + ♻ ☆ Analysis of functional neural codes of deep learning models + + +
+ Deep neural networks (DNNs), the agents of deep learning (DL), require a +massive number of parallel/sequential operations. This makes it difficult to +comprehend DNNs' operations and impedes proper diagnosis. Without better +knowledge of their internal process, deploying DNNs in high-stakes domains can +lead to catastrophic failures. Therefore, to build more reliable DNNs/DL to be +deployed in high-stakes real-world problems, it is imperative that we gain +insights into DNNs' internal operations underlying their decision-making. Here, +we use the self-organizing map (SOM) to analyze DL models' internal codes +associated with DNNs' decision-making. Our analyses suggest that shallow layers +close to the input layer compress features into condensed space and that deep +layers close to the output layer expand feature space. We also found evidence +indicating that compressed features may underlie DNNs' vulnerabilities to +adversarial perturbations. + +
+
+ comment: 13 pages, 8 main figures, 3 supplemental figures, 3 supplemental + tables +
+
+
+
+
+ + ♻ ☆ What Constitutes Good Contrastive Learning in Time-Series Forecasting? IJCAI'22 + + +
+ In recent years, the introduction of self-supervised contrastive learning +(SSCL) has demonstrated remarkable improvements in representation learning +across various domains, including natural language processing and computer +vision. By leveraging the inherent benefits of self-supervision, SSCL enables +the pre-training of representation models using vast amounts of unlabeled data. +Despite these advances, there remains a significant gap in understanding the +impact of different SSCL strategies on time series forecasting performance, as +well as the specific benefits that SSCL can bring. This paper aims to address +these gaps by conducting a comprehensive analysis of the effectiveness of +various training variables, including different SSCL algorithms, learning +strategies, model architectures, and their interplay. Additionally, to gain +deeper insights into the improvements brought about by SSCL in the context of +time-series forecasting, a qualitative analysis of the empirical receptive +field is performed. Through our experiments, we demonstrate that the end-to-end +training of a Transformer model using the Mean Squared Error (MSE) loss and +SSCL emerges as the most effective approach in time series forecasting. +Notably, the incorporation of the contrastive objective enables the model to +prioritize more pertinent information for forecasting, such as scale and +periodic relationships. These findings contribute to a better understanding of +the benefits of SSCL in time series forecasting and provide valuable insights +for future research in this area. Our codes are available at +https://github.com/chiyuzhang94/contrastive_learning_time-series_e2e. + +
+
+ comment: Accepted at IJCAI'22 Workshop-AI4TS: AI for Time Series Analysis +
+
+
+
+
+ + ♻ ☆ Hard-Constrained Deep Learning for Climate Downscaling + + +
+ The availability of reliable, high-resolution climate and weather data is +important to inform long-term decisions on climate adaptation and mitigation +and to guide rapid responses to extreme events. Forecasting models are limited +by computational costs and, therefore, often generate coarse-resolution +predictions. Statistical downscaling, including super-resolution methods from +deep learning, can provide an efficient method of upsampling low-resolution +data. However, despite achieving visually compelling results in some cases, +such models frequently violate conservation laws when predicting physical +variables. In order to conserve physical quantities, here we introduce methods +that guarantee statistical constraints are satisfied by a deep learning +downscaling model while also improving their performance according to +traditional metrics. We compare different constraining approaches and +demonstrate their applicability across different neural architectures as well +as a variety of climate and weather datasets. Besides enabling faster and more +accurate climate predictions through downscaling, we also show that our novel +methodologies can improve super-resolution for satellite data and standard +datasets. + +
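One simple constraining approach of the kind discussed above is a multiplicative renormalization layer that rescales each block of super-resolved pixels so that the corresponding coarse cell value is conserved exactly. The sketch below shows the idea for mean conservation and is an assumption about the general technique, not necessarily the exact layer used in the paper.

```python
# Sketch of a renormalization layer that conserves coarse-cell means exactly.
import torch
import torch.nn.functional as F

def enforce_mass_conservation(fine: torch.Tensor, coarse: torch.Tensor,
                              factor: int) -> torch.Tensor:
    """fine: (B, 1, H*factor, W*factor) raw network output (positive values),
    coarse: (B, 1, H, W) low-resolution field whose cell means must be kept."""
    block_mean = F.avg_pool2d(fine, kernel_size=factor)            # (B, 1, H, W)
    scale = coarse / (block_mean + 1e-8)
    return fine * F.interpolate(scale, scale_factor=factor, mode="nearest")

fine = torch.rand(2, 1, 32, 32)     # e.g. output of a super-resolution network
coarse = torch.rand(2, 1, 8, 8)
constrained = enforce_mass_conservation(fine, coarse, factor=4)
print(torch.allclose(F.avg_pool2d(constrained, 4), coarse, atol=1e-5))
```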
+
+
+
+
+ + ♻ ☆ CaRT: Certified Safety and Robust Tracking in Learning-based Motion + Planning for Multi-Agent Systems + + +
+ The key innovation of our analytical method, CaRT, lies in establishing a new +hierarchical, distributed architecture to guarantee the safety and robustness +of a given learning-based motion planning policy. First, in a nominal setting, +the analytical form of our CaRT safety filter formally ensures safe maneuvers +of nonlinear multi-agent systems, optimally with minimal deviation from the +learning-based policy. Second, in off-nominal settings, the analytical form of +our CaRT robust filter optimally tracks the certified safe trajectory, +generated by the previous layer in the hierarchy, the CaRT safety filter. We +show using contraction theory that CaRT guarantees safety and the exponential +boundedness of the trajectory tracking error, even under the presence of +deterministic and stochastic disturbance. Also, the hierarchical nature of CaRT +enables enhancing its robustness for safety just by its superior tracking to +the certified safe trajectory, thereby making it suitable for off-nominal +scenarios with large disturbances. This is a major distinction from +conventional safety function-driven approaches, where the robustness originates +from the stability of a safe set, which could pull the system +over-conservatively to the interior of the safe set. Our log-barrier +formulation in CaRT allows for its distributed implementation in multi-agent +settings. We demonstrate the effectiveness of CaRT in several examples of +nonlinear motion planning and control problems, including optimal, +multi-spacecraft reconfiguration. + +
+
+ comment: IEEE Conference on Decision and Control (CDC), Preprint Version, + Accepted July, 2023 +
+
+
+
+
+ + ♻ ☆ Rotation-equivariant Graph Neural Networks for Learning Glassy Liquids + Representations + + +
+ Within the glassy liquids community, the use of Machine Learning (ML) to +model particles' static structure is currently a hot topic. The state of the +art consists in Graph Neural Networks (GNNs), which have a great expressive +power but are heavy models with numerous parameters and lack interpretability. +Inspired by recent advances in the field of Machine Learning group-equivariant +representations, we build a GNN that learns a robust representation of the +glass' static structure by constraining it to preserve the roto-translation +(SE(3)) equivariance. We show that this constraint not only significantly +improves the predictive power but also improves the ability to generalize to +unseen temperatures while allowing to reduce the number of parameters. +Furthermore, interpretability is improved, as we can relate the action of our +basic convolution layer to well-known rotation-invariant expert features. +Through transfer-learning experiments we demonstrate that our network learns a +robust representation, which allows us to push forward the idea of a learned +glass structural order parameter. + +
+
+ comment: 15 pages, 9 figures plus references and appendix +
+
+
+
+
+ + ♻ ☆ Large Language Models can Implement Policy Iteration ICLR 2023 + + +
+ This work presents In-Context Policy Iteration, an algorithm for performing +Reinforcement Learning (RL), in-context, using foundation models. While the +application of foundation models to RL has received considerable attention, +most approaches rely on either (1) the curation of expert demonstrations +(either through manual design or task-specific pretraining) or (2) adaptation +to the task of interest using gradient methods (either fine-tuning or training +of adapter layers). Both of these techniques have drawbacks. Collecting +demonstrations is labor-intensive, and algorithms that rely on them do not +outperform the experts from which the demonstrations were derived. All gradient +techniques are inherently slow, sacrificing the "few-shot" quality that made +in-context learning attractive to begin with. In this work, we present an +algorithm, ICPI, that learns to perform RL tasks without expert demonstrations +or gradients. Instead we present a policy-iteration method in which the prompt +content is the entire locus of learning. ICPI iteratively updates the contents +of the prompt from which it derives its policy through trial-and-error +interaction with an RL environment. In order to eliminate the role of +in-weights learning (on which approaches like Decision Transformer rely +heavily), we demonstrate our algorithm using Codex, a language model with no +prior knowledge of the domains on which we evaluate it. + +
+
+ comment: 10 pages, 4 figures, submitted to ICLR 2023 +
+
+
+
+
+ + ♻ ☆ A Primal-Dual Algorithm for Hybrid Federated Learning + + +
+ Very few methods for hybrid federated learning, where clients only hold +subsets of both features and samples, exist. Yet, this scenario is very +important in practical settings. We provide a fast, robust algorithm for hybrid +federated learning that hinges on Fenchel Duality. We prove the convergence of +the algorithm to the same solution as if the model was trained centrally in a +variety of practical regimes. Furthermore, we provide experimental results that +demonstrate the performance improvements of the algorithm over a commonly used +method in federated learning, FedAvg. We also provide privacy considerations +and necessary steps to protect client data. + +
+
+
+
+
+ + ♻ ☆ On the Power of Gradual Network Alignment Using Dual-Perception + Similarities + + +
+ Network alignment (NA) is the task of finding the correspondence of nodes +between two networks based on the network structure and node attributes. Our +study is motivated by the fact that, since most existing NA methods have +attempted to discover all node pairs at once, they do not harness information +enriched through interim discovery of node correspondences to more accurately +find the next correspondences during the node matching. To tackle this +challenge, we propose Grad-Align, a new NA method that gradually discovers node +pairs by making full use of node pairs exhibiting strong consistency, which are +easy to discover in the early stage of gradual matching. Specifically, +Grad-Align first generates node embeddings of the two networks based on graph +neural networks along with our layer-wise reconstruction loss, a loss built +upon capturing the first-order and higher-order neighborhood structures. Then, +nodes are gradually aligned by computing dual-perception similarity measures +including the multi-layer embedding similarity as well as the Tversky +similarity, an asymmetric set similarity using the Tversky index applicable to +networks with different scales. Additionally, we incorporate an edge +augmentation module into Grad-Align to reinforce the structural consistency. +Through comprehensive experiments using real-world and synthetic datasets, we +empirically demonstrate that Grad-Align consistently outperforms +state-of-the-art NA methods. + 
+
+ comment: 16 pages, 11 figures, 4 tables; 13 pages, to appear in the IEEE + Transactions on Pattern Analysis and Machine Intelligence (Please cite our + journal version that will appear in an upcoming issue.) +
+
+
+
+
+ + ♻ ☆ Decentralized SGD and Average-direction SAM are Asymptotically + Equivalent ICML 2023 + + +
+ Decentralized stochastic gradient descent (D-SGD) allows collaborative +learning on massive devices simultaneously without the control of a central +server. However, existing theories claim that decentralization invariably +undermines generalization. In this paper, we challenge the conventional belief +and present a completely new perspective for understanding decentralized +learning. We prove that D-SGD implicitly minimizes the loss function of an +average-direction Sharpness-aware minimization (SAM) algorithm under general +non-convex non-$\beta$-smooth settings. This surprising asymptotic equivalence +reveals an intrinsic regularization-optimization trade-off and three advantages +of decentralization: (1) there exists a free uncertainty evaluation mechanism +in D-SGD to improve posterior estimation; (2) D-SGD exhibits a gradient +smoothing effect; and (3) the sharpness regularization effect of D-SGD does not +decrease as total batch size increases, which justifies the potential +generalization benefit of D-SGD over centralized SGD (C-SGD) in large-batch +scenarios. + +
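+ For context, the standard sharpness-aware minimization objective that the
+equivalence above refers to is $\min_{w} \max_{\|\epsilon\|_2 \le \rho}
+L(w+\epsilon)$; the paper's average-direction variant modifies the inner
+perturbation term (see the paper for the precise form).
+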
+
+ comment: Accepted for publication in the 40th International Conference on + Machine Learning (ICML 2023) +
+
+
+
+
+ + ♻ ☆ AlignDet: Aligning Pre-training and Fine-tuning in Object Detection ICCV 2023 + + +
+ The paradigm of large-scale pre-training followed by downstream fine-tuning +has been widely employed in various object detection algorithms. In this paper, +we reveal discrepancies in data, model, and task between the pre-training and +fine-tuning procedure in existing practices, which implicitly limit the +detector's performance, generalization ability, and convergence speed. To this +end, we propose AlignDet, a unified pre-training framework that can be adapted +to various existing detectors to alleviate the discrepancies. AlignDet +decouples the pre-training process into two stages, i.e., image-domain and +box-domain pre-training. The image-domain pre-training optimizes the detection +backbone to capture holistic visual abstraction, and box-domain pre-training +learns instance-level semantics and task-aware concepts to initialize the parts +out of the backbone. By incorporating the self-supervised pre-trained +backbones, we can pre-train all modules for various detectors in an +unsupervised paradigm. As depicted in Figure 1, extensive experiments +demonstrate that AlignDet can achieve significant improvements across diverse +protocols, such as detection algorithm, model backbone, data setting, and +training schedule. For example, AlignDet improves FCOS by 5.3 mAP, RetinaNet by +2.1 mAP, Faster R-CNN by 3.3 mAP, and DETR by 2.3 mAP under fewer epochs. + +
+
+ comment: Camera Ready Version on ICCV 2023. Code and Models are publicly + available. Project Page: https://liming-ai.github.io/AlignDet +
+
+
+
+
+ + ♻ ☆ Fine-grained Graph Learning for Multi-view Subspace Clustering + + +
+ Multi-view subspace clustering (MSC) is a popular unsupervised method that +integrates heterogeneous information to reveal the intrinsic clustering +structure hidden across views. Usually, MSC methods use graph (or affinity +matrix) fusion to learn a common structure, and further apply graph-based +approaches to clustering. Despite progress, most of the methods do not +establish the connection between graph learning and clustering. Meanwhile, +conventional graph fusion strategies assign coarse-grained weights to combine +multiple graphs, ignoring the importance of local structure. In this paper, we +propose a fine-grained graph learning framework for multi-view subspace +clustering (FGL-MSC) to address these issues. To utilize the multi-view +information sufficiently, we design a specific graph learning method by +introducing graph regularization and a local structure fusion pattern. The main +challenge is how to optimize the fine-grained fusion weights while generating +the learned graph that fits the clustering task, thus making the clustering +representation meaningful and competitive. Accordingly, an iterative algorithm +is proposed to solve the above joint optimization problem, which obtains the +learned graph, the clustering representation, and the fusion weights +simultaneously. Extensive experiments on eight real-world datasets show that +the proposed framework has comparable performance to the state-of-the-art +methods. The source code of the proposed method is available at +https://github.com/siriuslay/FGL-MSC. + +
+
+
+
+
+ + ♻ ☆ Spectral Ranking Inferences based on General Multiway Comparisons + + +
+ This paper studies the performance of the spectral method in the estimation +and uncertainty quantification of the unobserved preference scores of compared +entities in a very general and more realistic setup in which the comparison +graph consists of hyper-edges of possible heterogeneous sizes and the number of +comparisons can be as low as one for a given hyper-edge. Such a setting is +pervasive in real applications, circumventing the need to specify the graph +randomness and the restrictive homogeneous sampling assumption imposed in the +commonly-used Bradley-Terry-Luce (BTL) or Plackett-Luce (PL) models. +Furthermore, in the scenarios when the BTL or PL models are appropriate, we +unravel the relationship between the spectral estimator and the Maximum +Likelihood Estimator (MLE). We discover that a two-step spectral method, where +we apply the optimal weighting estimated from the equal weighting vanilla +spectral method, can achieve the same asymptotic efficiency as the MLE. Given +the asymptotic distributions of the estimated preference scores, we also +introduce a comprehensive framework to carry out both one-sample and two-sample +ranking inferences, applicable to both fixed and random graph settings. It is +noteworthy that it is the first time effective two-sample rank testing methods +are proposed. Finally, we substantiate our findings via comprehensive numerical +simulations and subsequently apply our developed methodologies to perform +statistical inferences on statistics journals and movie rankings. + +
+
+
+
+
+ + ♻ ☆ Q-Learning for MDPs with General Spaces: Convergence and Near Optimality + via Quantization under Weak Continuity + + +
+ Reinforcement learning algorithms often require finiteness of state and +action spaces in Markov decision processes (MDPs) (also called controlled +Markov chains) and various efforts have been made in the literature towards the +applicability of such algorithms for continuous state and action spaces. In +this paper, we show that under very mild regularity conditions (in particular, +involving only weak continuity of the transition kernel of an MDP), Q-learning +for standard Borel MDPs via quantization of states and actions (called +Quantized Q-Learning) converges to a limit, and furthermore this limit +satisfies an optimality equation that leads to near optimality, either with +explicit performance bounds or with guarantees of asymptotic optimality. Our +approach builds on (i) viewing quantization as a measurement +kernel and thus a quantized MDP as a partially observed Markov decision process +(POMDP), (ii) utilizing near optimality and convergence results of Q-learning +for POMDPs, and (iii) finally, near-optimality of finite state model +approximations for MDPs with weakly continuous kernels which we show to +correspond to the fixed point of the constructed POMDP. Thus, our paper +presents a very general convergence and approximation result for the +applicability of Q-learning for continuous MDPs. + +
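+ A minimal sketch of the quantization idea for a bounded one-dimensional state
+and action space: map each continuous state/action to its nearest grid point
+and run ordinary tabular Q-learning on the resulting finite model. The grid
+sizes, learning rate, and environment interface are illustrative assumptions,
+not the paper's construction:
+
+    import numpy as np
+
+    def quantized_q_learning(env, n_state_bins=20, n_action_bins=5,
+                             episodes=500, alpha=0.1, gamma=0.99, eps=0.1):
+        state_grid = np.linspace(env.state_low, env.state_high, n_state_bins)
+        action_grid = np.linspace(env.action_low, env.action_high, n_action_bins)
+        Q = np.zeros((n_state_bins, n_action_bins))
+
+        def quantize(x, grid):
+            return int(np.argmin(np.abs(grid - x)))   # nearest grid cell
+
+        for _ in range(episodes):
+            s = quantize(env.reset(), state_grid)
+            done = False
+            while not done:
+                a = (np.random.randint(n_action_bins) if np.random.rand() < eps
+                     else int(Q[s].argmax()))
+                next_state, r, done = env.step(action_grid[a])
+                s_next = quantize(next_state, state_grid)
+                # Standard tabular Q-learning update on the quantized MDP.
+                Q[s, a] += alpha * (r + gamma * Q[s_next].max() * (not done) - Q[s, a])
+                s = s_next
+        return Q, state_grid, action_grid
+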
+
+
+
+
+ + ♻ ☆ Tiny-PPG: A Lightweight Deep Neural Network for Real-Time Detection of + Motion Artifacts in Photoplethysmogram Signals on Edge Devices + + +
+ Photoplethysmogram (PPG) signals are easily contaminated by motion artifacts +in real-world settings, despite their widespread use in Internet-of-Things +(IoT) based wearable and smart health devices for cardiovascular health +monitoring. This study proposed a lightweight deep neural network, called +Tiny-PPG, for accurate and real-time PPG artifact segmentation on IoT edge +devices. The model was trained and tested on a public dataset, PPG DaLiA, which +featured complex artifacts with diverse lengths and morphologies during various +daily activities of 15 subjects using a watch-type device (Empatica E4). The +model structure, training method and loss function were specifically designed +to balance detection accuracy and speed for real-time PPG artifact detection in +resource-constrained embedded devices. To optimize the model size and +capability in multi-scale feature representation, the model employed depth-wise +separable convolution and atrous spatial pyramid pooling modules, respectively. +Additionally, a contrastive loss was utilized to further optimize the +feature embeddings. With additional model pruning, Tiny-PPG achieved +state-of-the-art detection accuracy of 87.4% while only having 19,726 model +parameters (0.15 megabytes), and was successfully deployed on an STM32 embedded +system for real-time PPG artifact detection. Therefore, this study provides an +effective solution for resource-constrained IoT smart health devices in PPG +artifact detection. + +
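+ For illustration, a small PyTorch sketch of the depth-wise separable 1-D
+convolution block mentioned above; the channel counts and kernel size are
+placeholders, and this is not the Tiny-PPG architecture itself:
+
+    import torch.nn as nn
+
+    class DepthwiseSeparableConv1d(nn.Module):
+        """Depth-wise conv (one filter per channel) followed by a 1x1 point-wise
+        conv; far fewer parameters than a standard Conv1d of the same shape."""
+        def __init__(self, in_ch=32, out_ch=64, kernel_size=7):
+            super().__init__()
+            self.depthwise = nn.Conv1d(in_ch, in_ch, kernel_size,
+                                       padding=kernel_size // 2, groups=in_ch)
+            self.pointwise = nn.Conv1d(in_ch, out_ch, kernel_size=1)
+            self.act = nn.ReLU()
+
+        def forward(self, x):          # x: (batch, in_ch, time)
+            return self.act(self.pointwise(self.depthwise(x)))
+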
+
+
+
+
+ + ♻ ☆ Learning Human-Human Interactions in Images from Weak Textual + Supervision ICCV 2023 + + +
+ Interactions between humans are diverse and context-dependent, but previous +works have treated them as categorical, disregarding the heavy tail of possible +interactions. We propose a new paradigm of learning human-human interactions as +free text from a single still image, allowing for flexibility in modeling the +unlimited space of situations and relationships between people. To overcome the +absence of data labelled specifically for this task, we use knowledge +distillation applied to synthetic caption data produced by a large language +model without explicit supervision. We show that the pseudo-labels produced by +this procedure can be used to train a captioning model to effectively +understand human-human interactions in images, as measured by a variety of +metrics that measure textual and semantic faithfulness and factual groundedness +of our predictions. We further show that our approach outperforms SOTA image +captioning and situation recognition models on this task. We will release our +code and pseudo-labels along with Waldo and Wenda, a manually-curated test set +for still image human-human interaction understanding. + +
+
+ comment: To be presented at ICCV 2023. Project webpage: + https://learning-interactions.github.io +
+
+
+
+
+ + ♻ ☆ UDTIRI: An Open-Source Intelligent Road Inspection Benchmark Suite + + +
+ There is enormous potential to leverage powerful deep learning methods in the +emerging field of urban digital twins, particularly in the area of intelligent +road inspection, where research and data are currently limited. To facilitate +progress in this field, we have developed a well-labeled road pothole dataset +named Urban Digital Twins Intelligent Road Inspection (UDTIRI). We hope this +dataset will enable the use of powerful deep learning methods in urban road +inspection, providing algorithms with a more comprehensive understanding of the +scene and maximizing their potential. Our dataset comprises 1000 images of +potholes, captured in various scenarios with different lighting and humidity +conditions. Our intention is to employ this dataset for object detection, +semantic segmentation, and instance segmentation tasks. Our team has devoted +significant effort to conducting a detailed statistical analysis, and +benchmarking a selection of representative algorithms from recent years. We +also provide a multi-task platform for researchers to fully exploit the +performance of various algorithms with the support of the UDTIRI dataset. + +
+
+ comment: Database webpage: https://www.udtiri.com/, Kaggle webpage: + https://www.kaggle.com/datasets/jiahangli617/udtiri +
+
+
+
+
+ + ♻ ☆ Permutation Decision Trees + + +
+ Decision Tree is a well understood Machine Learning model that is based on +minimizing impurities in the internal nodes. The most common impurity measures +are Shannon entropy and Gini impurity. These impurity measures are insensitive +to the order of training data and hence the final tree obtained is invariant to +any permutation of the data. This leads to a serious limitation in modeling +data instances that have order dependencies. In this work, we propose the use +of Effort-To-Compress (ETC) - a complexity measure, for the first time, as an +impurity measure. Unlike Shannon entropy and Gini impurity, structural impurity +based on ETC is able to capture order dependencies in the data, thus obtaining +potentially different decision trees for different permutations of the same +data instances (Permutation Decision Trees). We then introduce the notion of +Permutation Bagging achieved using permutation decision trees without the need +for random feature selection and sub-sampling. We compare the performance of +the proposed permutation bagged decision trees with Random Forests. Our model +does not assume that the data instances are independent and identically +distributed. Potential applications include scenarios where a temporal order +present in the data instances is to be respected. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Approximation and Non-parametric Estimation of ResNet-type Convolutional + Neural Networks + + +
+ Convolutional neural networks (CNNs) have been shown to achieve optimal +approximation and estimation error rates (in the minimax sense) in several +function classes. However, previously analyzed optimal CNNs are unrealistically +wide and difficult to obtain via optimization due to sparse constraints in +important function classes, including the H\"older class. We show a ResNet-type +CNN can attain the minimax optimal error rates in these classes in more +plausible situations -- it can be dense, and its width, channel size, and +filter size are constant with respect to sample size. The key idea is that we +can replicate the learning ability of fully-connected neural networks (FNNs) by +tailored CNNs, as long as the FNNs have \textit{block-sparse} structures. Our +theory is general in the sense that we can automatically translate any +approximation rate achieved by block-sparse FNNs into that by CNNs. As an +application, we derive approximation and estimation error rates of the +aforementioned type of CNNs for the Barron and H\"older classes with the same +strategy. + +
+
+ comment: Version 4: Fixed the constant B^{(fc)} in Theorems 1, 5 and the norm + upper bound of w^{(l)}_m in Lemma 1. 8 pages + References 2 pages + + Supplemental material 18 pages +
+
+
+
+
+ + ♻ ☆ Harmonic (Quantum) Neural Networks + + +
+ Harmonic functions are abundant in nature, appearing in limiting cases of +Maxwell's, Navier-Stokes equations, the heat and the wave equation. +Consequently, there are many applications of harmonic functions from industrial +process optimisation to robotic path planning and the calculation of first exit +times of random walks. Despite their ubiquity and relevance, there have been +few attempts to incorporate inductive biases towards harmonic functions in +machine learning contexts. In this work, we demonstrate effective means of +representing harmonic functions in neural networks and extend such results also +to quantum neural networks to demonstrate the generality of our approach. We +benchmark our approaches against (quantum) physics-informed neural networks, +where we show favourable performance. + +
+
+ comment: 12 pages (main), 7 pages (supplementary), 7 figures +
+
+
+
+
+ + ♻ ☆ Nexus sine qua non: Essentially Connected Networks for Traffic + Forecasting + + +
+ Spatiotemporal graph neural networks (STGNNs) have emerged as a leading +approach for learning representations and forecasting on traffic datasets with +underlying topological and correlational structures. However, current STGNNs +use intricate techniques with high complexities to capture these structures, +making them difficult to understand and scale. The existence of simple yet +efficient architectures remains an open question. Upon closer examination, we +find what lies at the core of STGNN's representations are certain forms of +spatiotemporal contextualization. In light of this, we design Nexus sine qua +non (NexuSQN), an essentially connected network built on an efficient +message-passing backbone. NexuSQN simply uses learnable "where" and "when" +locators for the aforementioned contextualization and omits any intricate +components such as RNNs, Transformers, and diffusion convolutions. Results show +that NexuSQN outperforms intricately designed benchmarks in terms of size, +computational efficiency, and accuracy. This suggests a promising future for +developing simple yet efficient neural predictors. + +
+
+
+
+
+ + ♻ ☆ Model-Based Safe Reinforcement Learning with Time-Varying State and + Control Constraints: An Application to Intelligent Vehicles + + +
+ Recently, safe reinforcement learning (RL) with the actor-critic structure +for continuous control tasks has received increasing attention. It is still +challenging to learn a near-optimal control policy with safety and convergence +guarantees. Also, few works have addressed the safe RL algorithm design under +time-varying safety constraints. This paper proposes a safe RL algorithm for +optimal control of nonlinear systems with time-varying state and control +constraints. In the proposed approach, we construct a novel barrier force-based +control policy structure to guarantee control safety. A multi-step policy +evaluation mechanism is proposed to predict the policy's safety risk under +time-varying safety constraints and guide the policy to update safely. +Theoretical results on stability and robustness are proven. Also, the +convergence of the actor-critic implementation is analyzed. The performance of +the proposed algorithm outperforms several state-of-the-art RL algorithms in +the simulated Safety Gym environment. Furthermore, the approach is applied to +the integrated path following and collision avoidance problem for two +real-world intelligent vehicles. A differential-drive vehicle and an +Ackermann-drive one are used to verify offline deployment and online learning +performance, respectively. Our approach shows an impressive sim-to-real +transfer capability and a satisfactory online control performance in the +experiment. + +
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Multi-view Graph Convolutional Networks with Differentiable Node + Selection + + +
+ Multi-view data containing complementary and consensus information can +facilitate representation learning by exploiting the intact integration of +multi-view features. Because most objects in real world often have underlying +connections, organizing multi-view data as heterogeneous graphs is beneficial +to extracting latent information among different objects. Due to the powerful +capability to gather information of neighborhood nodes, in this paper, we apply +Graph Convolutional Network (GCN) to cope with heterogeneous-graph data +originating from multi-view data, which is still under-explored in the field of +GCN. In order to improve the quality of network topology and alleviate the +interference of noises yielded by graph fusion, some methods undertake sorting +operations before the graph convolution procedure. These GCN-based methods +generally sort and select the most confident neighborhood nodes for each +vertex, such as picking the top-k nodes according to pre-defined confidence +values. Nonetheless, this is problematic due to the non-differentiable sorting +operators and inflexible graph embedding learning, which may result in blocked +gradient computations and undesired performance. To cope with these issues, we +propose a joint framework dubbed Multi-view Graph Convolutional Network with +Differentiable Node Selection (MGCN-DNS), which is constituted of an adaptive +graph fusion layer, a graph learning module and a differentiable node selection +schema. MGCN-DNS accepts multi-channel graph-structural data as inputs and aims +to learn more robust graph fusion through a differentiable neural network. The +effectiveness of the proposed method is verified by rigorous comparisons with +considerable state-of-the-art approaches in terms of multi-view semi-supervised +classification tasks. + +
+
+
+
+
+ + ♻ ☆ SmartGD: A GAN-Based Graph Drawing Framework for Diverse Aesthetic Goals + + +
+ While a multitude of studies have been conducted on graph drawing, many +existing methods only focus on optimizing a single aesthetic aspect of graph +layouts, which can lead to sub-optimal results. There are a few existing +methods that have attempted to develop a flexible solution for optimizing +different aesthetic aspects measured by different aesthetic criteria. +Furthermore, thanks to the significant advance in deep learning techniques, +several deep learning-based layout methods were proposed recently. These +methods have demonstrated the advantages of deep learning approaches for graph +drawing. However, none of these existing methods can be directly applied to +optimizing non-differentiable criteria without special accommodation. In this +work, we propose a novel Generative Adversarial Network (GAN) based deep +learning framework for graph drawing, called SmartGD, which can optimize +different quantitative aesthetic goals, regardless of their differentiability. +To demonstrate the effectiveness and efficiency of SmartGD, we conducted +experiments on minimizing stress, minimizing edge crossing, maximizing crossing +angle, maximizing shape-based metrics, and a combination of multiple +aesthetics. Compared with several popular graph drawing algorithms, the +experimental results show that SmartGD achieves good performance both +quantitatively and qualitatively. + +
+
+
+
+
+ + ♻ ☆ Kairos: Practical Intrusion Detection and Investigation using + Whole-system Provenance + + +
+ Provenance graphs are structured audit logs that describe the history of a +system's execution. Recent studies have explored a variety of techniques to +analyze provenance graphs for automated host intrusion detection, focusing +particularly on advanced persistent threats. Sifting through their design +documents, we identify four common dimensions that drive the development of +provenance-based intrusion detection systems (PIDSes): scope (can PIDSes detect +modern attacks that infiltrate across application boundaries?), attack +agnosticity (can PIDSes detect novel attacks without a priori knowledge of +attack characteristics?), timeliness (can PIDSes efficiently monitor host +systems as they run?), and attack reconstruction (can PIDSes distill attack +activity from large provenance graphs so that sysadmins can easily understand +and quickly respond to system intrusion?). We present KAIROS, the first PIDS +that simultaneously satisfies the desiderata in all four dimensions, whereas +existing approaches sacrifice at least one and struggle to achieve comparable +detection performance. + Kairos leverages a novel graph neural network-based encoder-decoder +architecture that learns the temporal evolution of a provenance graph's +structural changes to quantify the degree of anomalousness for each system +event. Then, based on this fine-grained information, Kairos reconstructs attack +footprints, generating compact summary graphs that accurately describe +malicious activity over a stream of system audit logs. Using state-of-the-art +benchmark datasets, we demonstrate that Kairos outperforms previous approaches. + +
+
+ comment: 23 pages, 16 figures, to appear in the 45th IEEE Symposium on + Security and Privacy (S&P'24) +
+
+
+
+
+ + ♻ ☆ Overparameterized random feature regression with nearly orthogonal data + + +
+ We investigate the properties of random feature ridge regression (RFRR) given +by a two-layer neural network with random Gaussian initialization. We study the +non-asymptotic behaviors of the RFRR with nearly orthogonal deterministic +unit-length input data vectors in the overparameterized regime, where the width +of the first layer is much larger than the sample size. Our analysis shows +high-probability non-asymptotic concentration results for the training errors, +cross-validations, and generalization errors of RFRR centered around their +respective values for a kernel ridge regression (KRR). This KRR is derived from +an expected kernel generated by a nonlinear random feature map. We then +approximate the performance of the KRR by a polynomial kernel matrix obtained +from the Hermite polynomial expansion of the activation function, whose degree +only depends on the orthogonality among different data points. This polynomial +kernel determines the asymptotic behavior of the RFRR and the KRR. Our results +hold for a wide variety of activation functions and input data sets that +exhibit nearly orthogonal properties. Based on these approximations, we obtain +a lower bound for the generalization error of the RFRR for a nonlinear +student-teacher model. + +
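+ A small NumPy sketch of random feature ridge regression as described above:
+the Gaussian first layer is frozen at initialization and only the ridge
+regression on the resulting features is fitted. The width, ReLU activation,
+and ridge parameter are illustrative assumptions:
+
+    import numpy as np
+
+    def rfrr_fit_predict(X_train, y_train, X_test, width=2048, ridge=1e-3, seed=0):
+        rng = np.random.default_rng(seed)
+        d = X_train.shape[1]
+        W = rng.normal(0.0, 1.0 / np.sqrt(d), size=(d, width))  # random, never trained
+
+        def features(X):
+            return np.maximum(X @ W, 0.0)                        # ReLU random features
+
+        Phi = features(X_train)
+        # Ridge regression on the random features (only the second layer is learned).
+        A = Phi.T @ Phi + ridge * np.eye(width)
+        coef = np.linalg.solve(A, Phi.T @ y_train)
+        return features(X_test) @ coef
+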
+
+ comment: 39 pages. A condition on the activation function is added in + Assumption 2.2 +
+
+
+
+
+ + ♻ ☆ Review of medical data analysis based on spiking neural networks + + +
+ Medical data mainly includes various types of biomedical signals and medical +images, which can be used by professional doctors to make judgments on +patients' health conditions. However, interpreting medical data requires +substantial human effort and is prone to misjudgment, so many scholars use +neural networks and deep learning to classify and study medical data, which can +improve doctors' efficiency and accuracy and enable early detection and +diagnosis of diseases. This approach therefore has a wide range of application +prospects. However, traditional neural networks have disadvantages such as high +energy consumption and high latency (slow computation speed). This paper +presents recent research on signal classification and disease diagnosis based +on a third-generation neural network, the spiking neural network, using medical +data including EEG signals, ECG signals, EMG signals and MRI images. The +advantages and disadvantages of spiking neural networks compared with +traditional networks are summarized, and directions for their future +development are discussed. + +
+
+
+
+
+ + ♻ ☆ Slice Transformer and Self-supervised Learning for 6DoF Localization in + 3D Point Cloud Maps ICRA + + +
+ Precise localization is critical for autonomous vehicles. We present a +self-supervised learning method that employs Transformers for the first time +for the task of outdoor localization using LiDAR data. We propose a pre-text +task that reorganizes the slices of a $360^\circ$ LiDAR scan to leverage its +axial properties. Our model, called Slice Transformer, employs multi-head +attention while systematically processing the slices. To the best of our +knowledge, this is the first instance of leveraging multi-head attention for +outdoor point clouds. We additionally introduce the Perth-WA dataset, which +provides a large-scale LiDAR map of Perth city in Western Australia, covering +$\sim$4km$^2$ area. Localization annotations are provided for Perth-WA. The +proposed localization method is thoroughly evaluated on Perth-WA and +Appollo-SouthBay datasets. We also establish the efficacy of our +self-supervised learning approach for the common downstream task of object +classification using ModelNet40 and ScanNN datasets. The code and Perth-WA data +will be publicly released. + +
+
+ comment: Accepted in IEEE International Conference on Robotics and Automation + (ICRA), 2023 +
+
+
+
+
+ + ♻ ☆ Scalable Decision-Focused Learning in Restless Multi-Armed Bandits with + Application to Maternal and Child Health + + +
+ This paper studies restless multi-armed bandit (RMAB) problems with unknown +arm transition dynamics but with known correlated arm features. The goal is to +learn a model to predict transition dynamics given features, where the Whittle +index policy solves the RMAB problems using predicted transitions. However, +prior works often learn the model by maximizing the predictive accuracy instead +of final RMAB solution quality, causing a mismatch between training and +evaluation objectives. To address this shortcoming, we propose a novel approach +for decision-focused learning in RMAB that directly trains the predictive model +to maximize the Whittle index solution quality. We present three key +contributions: (i) we establish differentiability of the Whittle index policy +to support decision-focused learning; (ii) we significantly improve the +scalability of decision-focused learning approaches in sequential problems, +specifically RMAB problems; (iii) we apply our algorithm to a previously +collected dataset of maternal and child health to demonstrate its performance. +Indeed, our algorithm is the first for decision-focused learning in RMAB that +scales to real-world problem sizes. + +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ UGC Quality Assessment: Exploring the Impact of Saliency in Deep + Feature-Based Quality Assessment + + +
+ The volume of User Generated Content (UGC) has increased in recent years. The +challenge with this type of content is assessing its quality. So far, +state-of-the-art metrics do not exhibit a very high correlation with +perceptual quality. In this paper, we explore state-of-the-art metrics that +extract/combine natural scene statistics and deep neural network features. We +experiment with these by introducing saliency maps to improve perceptibility. +We train and test our models using public datasets, namely, YouTube-UGC and +KoNViD-1k. Preliminary results indicate that high correlations are achieved by +using only deep features while adding saliency does not always boost +performance. Our results and code will be made publicly available to serve as a +benchmark for the research community and can be found on our project page: +https://github.com/xinyiW915/SPIE-2023-Supplementary. + +
+
+
+
+
+ + ☆ CLE Diffusion: Controllable Light Enhancement Diffusion Model + + +
+ Low light enhancement has gained increasing importance with the rapid +development of visual creation and editing. However, most existing enhancement +algorithms are designed to homogeneously increase the brightness of images to a +pre-defined extent, limiting the user experience. To address this issue, we +propose Controllable Light Enhancement Diffusion Model, dubbed CLE Diffusion, a +novel diffusion framework to provide users with rich controllability. Built +with a conditional diffusion model, we introduce an illumination embedding to +let users control their desired brightness level. Additionally, we incorporate +the Segment-Anything Model (SAM) to enable user-friendly region +controllability, where users can click on objects to specify the regions they +wish to enhance. Extensive experiments demonstrate that CLE Diffusion achieves +competitive performance regarding quantitative metrics, qualitative results, +and versatile controllability. Project page: +\url{https://yuyangyin.github.io/CLEDiffusion/} + +
+
+
+
+
+ + ☆ MACO: A Modality Adversarial and Contrastive Framework for + Modality-missing Multi-modal Knowledge Graph Completion NLPCC 2023 + + +
+ Recent years have seen significant advancements in multi-modal knowledge +graph completion (MMKGC). MMKGC enhances knowledge graph completion (KGC) by +integrating multi-modal entity information, thereby facilitating the discovery +of unobserved triples in the large-scale knowledge graphs (KGs). Nevertheless, +existing methods emphasize the design of elegant KGC models to facilitate +modality interaction, neglecting the real-life problem of missing modalities in +KGs. The missing modality information impedes modal interaction, consequently +undermining the model's performance. In this paper, we propose a modality +adversarial and contrastive framework (MACO) to solve the modality-missing +problem in MMKGC. MACO trains a generator and discriminator adversarially to +generate missing modality features that can be incorporated into the MMKGC +model. Meanwhile, we design a cross-modal contrastive loss to improve the +performance of the generator. Experiments on public benchmarks with further +explorations demonstrate that MACO could achieve state-of-the-art results and +serve as a versatile framework to bolster various MMKGC models. Our code and +benchmark data are available at https://github.com/zjukg/MACO. + +
+
+ comment: This is the ArXiv version of our paper accepted by NLPCC 2023. The + code will be released soon +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 30 + +
+
+
+ + ☆ Bio-SIEVE: Exploring Instruction Tuning Large Language Models for + Systematic Review Automation + + +
+ Medical systematic reviews can be very costly and resource intensive. We +explore how Large Language Models (LLMs) can support and be trained to perform +literature screening when provided with a detailed set of selection criteria. +Specifically, we instruction tune LLaMA and Guanaco models to perform abstract +screening for medical systematic reviews. Our best model, Bio-SIEVE, +outperforms both ChatGPT and trained traditional approaches, and generalises +better across medical domains. However, there remains the challenge of adapting +the model to safety-first scenarios. We also explore the impact of multi-task +training with Bio-SIEVE-Multi, including tasks such as PICO extraction and +exclusion reasoning, but find that it is unable to match single-task +Bio-SIEVE's performance. We see Bio-SIEVE as an important step towards +specialising LLMs for the biomedical systematic review process and explore its +future developmental opportunities. We release our models, code and a list of +DOIs to reconstruct our dataset for reproducibility. + +
+
+
+
+
+ + ☆ VisIT-Bench: A Benchmark for Vision-Language Instruction Following + Inspired by Real-World Use + + +
+ We introduce VisIT-Bench (Visual InsTruction Benchmark), a benchmark for +evaluation of instruction-following vision-language models for real-world use. +Our starting point is curating 70 'instruction families' that we envision +instruction-tuned vision-language models should be able to address. Extending +beyond evaluations like VQAv2 and COCO, tasks range from basic recognition to +game playing and creative generation. Following curation, our dataset comprises +592 test queries, each with a human-authored instruction-conditioned caption. +These descriptions surface instruction-specific factors, e.g., for an +instruction asking about the accessibility of a storefront for wheelchair +users, the instruction-conditioned caption describes ramps/potential obstacles. +These descriptions enable 1) collecting human-verified reference outputs for +each instance; and 2) automatic evaluation of candidate multimodal generations +using a text-only LLM, aligning with human judgment. We quantify quality gaps +between models and references using both human and automatic evaluations; e.g., +the top-performing instruction-following model wins against the GPT-4 reference +in just 27% of comparisons. VisIT-Bench is dynamic: to participate, +practitioners simply submit their model's responses on the project website. +Data, code and the leaderboard are available at visit-bench.github.io. + +
+
+
+
+
+ + ☆ MT4CrossOIE: Multi-stage Tuning for Cross-lingual Open Information + Extraction + + +
+ Cross-lingual open information extraction aims to extract structured +information from raw text across multiple languages. Previous work uses a +shared cross-lingual pre-trained model to handle the different languages but +underuses the potential of the language-specific representation. In this paper, +we propose an effective multi-stage tuning framework called MT4CrossIE, +designed for enhancing cross-lingual open information extraction by injecting +language-specific knowledge into the shared model. Specifically, the +cross-lingual pre-trained model is first tuned in a shared semantic space +(e.g., embedding matrix) in the fixed encoder and then other components are +optimized in the second stage. After sufficient training, we freeze the +pre-trained model and tune the multiple extra low-rank language-specific +modules using mixture-of-LoRAs for model-based cross-lingual transfer. In +addition, we leverage two-stage prompting to encourage the large language model +(LLM) to annotate the multi-lingual raw data for data-based cross-lingual +transfer. The model is trained with multi-lingual objectives on our proposed +dataset OpenIE4++ by combining the model-based and data-based transfer +techniques. Experimental results on various benchmarks emphasize the importance +of aggregating multiple plug-in-and-play language-specific modules and +demonstrate the effectiveness of MT4CrossIE in cross-lingual +OIE\footnote{\url{https://github.com/CSJianYang/Multilingual-Multimodal-NLP}}. + +
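+ For reference, a minimal PyTorch sketch of a single low-rank adapter (LoRA) of
+the kind mixed per language above; the rank, scaling, and placement are
+illustrative assumptions, and the mixture-of-LoRAs routing is not shown:
+
+    import torch
+    import torch.nn as nn
+
+    class LoRALinear(nn.Module):
+        """Frozen shared linear layer plus a trainable low-rank update
+        W x + (alpha / r) * B A x."""
+        def __init__(self, base: nn.Linear, r=8, alpha=16):
+            super().__init__()
+            self.base = base
+            for p in self.base.parameters():
+                p.requires_grad = False               # shared weights stay fixed
+            self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
+            self.B = nn.Parameter(torch.zeros(base.out_features, r))
+            self.scale = alpha / r
+
+        def forward(self, x):
+            return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)
+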
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Alternative Pseudo-Labeling for Semi-Supervised Automatic Speech + Recognition + + +
+ When labeled data is insufficient, semi-supervised learning with the +pseudo-labeling technique can significantly improve the performance of +automatic speech recognition. However, pseudo-labels are often noisy, +containing numerous incorrect tokens. Taking noisy labels as ground-truth in +the loss function results in suboptimal performance. Previous works attempted +to mitigate this issue by either filtering out the noisiest pseudo-labels or +improving the overall quality of pseudo-labels. While these methods are +effective to some extent, it is unrealistic to entirely eliminate incorrect +tokens in pseudo-labels. In this work, we propose a novel framework named +alternative pseudo-labeling to tackle the issue of noisy pseudo-labels from the +perspective of the training objective. The framework comprises several +components. Firstly, a generalized CTC loss function is introduced to handle +noisy pseudo-labels by accepting alternative tokens in the positions of +incorrect tokens. Applying this loss function in pseudo-labeling requires +detecting incorrect tokens in the predicted pseudo-labels. In this work, we +adopt a confidence-based error detection method that identifies the incorrect +tokens by comparing their confidence scores with a given threshold, thus +necessitating the confidence score to be discriminative. Hence, the second +proposed technique is the contrastive CTC loss function that widens the +confidence gap between the correctly and incorrectly predicted tokens, thereby +improving the error detection ability. Additionally, obtaining satisfactory +performance with confidence-based error detection typically requires extensive +threshold tuning. Instead, we propose an automatic thresholding method that +uses labeled data as a proxy for determining the threshold, thus sparing the +effort of manual tuning. + +
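+ A rough sketch of the confidence-based error detection and automatic
+thresholding described above; the F1-style selection criterion and data
+structures are illustrative assumptions, not the paper's exact procedure:
+
+    import numpy as np
+
+    def pick_threshold(confidences, is_correct, grid=np.linspace(0.1, 0.9, 17)):
+        """Choose, on labeled proxy data, the confidence threshold that best
+        separates correct from incorrect tokens."""
+        def f1_at(t):
+            pred_wrong, truly_wrong = confidences < t, ~is_correct
+            tp = np.sum(pred_wrong & truly_wrong)
+            prec = tp / max(pred_wrong.sum(), 1)
+            rec = tp / max(truly_wrong.sum(), 1)
+            return 2 * prec * rec / max(prec + rec, 1e-8)
+        return max(grid, key=f1_at)
+
+    def flag_incorrect_tokens(pseudo_label_confidences, threshold):
+        # Tokens below the threshold are treated as incorrect; the generalized
+        # CTC loss may then accept alternative tokens at those positions.
+        return [c < threshold for c in pseudo_label_confidences]
+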
+
+ comment: Accepted by IEEE/ACM Transactions on Audio, Speech and Language + Processing (TASLP), 2023 +
+
+
+
+
+ + ☆ MC-DRE: Multi-Aspect Cross Integration for Drug Event/Entity Extraction + + +
+ Extracting meaningful drug-related information chunks, such as adverse drug +events (ADE), is crucial for preventing morbidity and saving many lives. Most +ADE are reported via unstructured conversations in a medical context. Hence, +applying a general entity recognition approach is not sufficient. The key is +how to integrate and align multiple crucial aspects to detect drug event +information, including drug event semantics, syntactic structures, and medical +domain terminology. In this paper, we propose a new multi-aspect +cross-integration framework for drug entity/event detection by capturing and +aligning different context/language/knowledge properties from drug-related +documents. We first construct multi-aspect encoders to describe semantic, +syntactic, and medical document contextual information by conducting the slot +tagging tasks of main drug entity/event detection, part-of-speech tagging, and +general medical named entity recognition. Then, each encoder conducts cross +integration and alignment with other contextual information in three ways, +including the key-value cross, attention cross, and feedforward cross, so that +the multiple encoders are integrated in depth. Finally, we perform extensive +experiments on two widely used drug-related entity recognition downstream +tasks, flat entity detection and discontinuous event extraction. Our model +significantly outperforms all twelve recent state-of-the-art models. The +implementation code will be released at~\url{https://github.com/adlnlp/mc-dre}. + +
+
+
+
+
+ + ☆ With a Little Help from the Authors: Reproducing Human Evaluation of an + MT Error Detector + + +
+ This work presents our efforts to reproduce the results of the human +evaluation experiment presented in the paper of Vamvas and Sennrich (2022), +which evaluated an automatic system detecting over- and undertranslations +(translations containing more or less information than the original) in machine +translation (MT) outputs. Despite the high quality of the documentation and +code provided by the authors, we discuss some problems we found in reproducing +the exact experimental setup and offer recommendations for improving +reproducibility. Our replicated results generally confirm the conclusions of +the original study, but in some cases, statistically significant differences +were observed, suggesting a high variability of human annotation. + +
+
+ comment: Submitted to + https://www.aclweb.org/portal/content/repronlp-shared-task-reproducibility-evaluations-nlp-2023 +
+
+
+
+
+ + ☆ HyperFormer: Enhancing Entity and Relation Interaction for + Hyper-Relational Knowledge Graph Completion CIKM'23 + + +
+ Hyper-relational knowledge graphs (HKGs) extend standard knowledge graphs by +associating attribute-value qualifiers to triples, which effectively represent +additional fine-grained information about their associated triples. +Hyper-relational knowledge graph completion (HKGC) aims at inferring unknown +triples while considering their qualifiers. Most existing approaches to HKGC +exploit a global-level graph structure to encode hyper-relational knowledge +into the graph convolution message passing process. However, the addition of +multi-hop information might bring noise into the triple prediction process. To +address this problem, we propose HyperFormer, a model that considers +local-level sequential information, which encodes the content of the entities, +relations and qualifiers of a triple. More precisely, HyperFormer is composed +of three different modules: an entity neighbor aggregator module that +integrates information from an entity's neighbors to capture different +perspectives of it; a relation qualifier aggregator module that integrates +hyper-relational knowledge into the corresponding relation to refine the +representation of relational content; and a convolution-based bidirectional +interaction module that captures pairwise bidirectional interactions of +entity-relation, entity-qualifier, and relation-qualifier, realizing a deep +perception of the content related to the current statement. Furthermore, we +introduce a Mixture-of-Experts strategy into the feed-forward layers of +HyperFormer to strengthen its representation capabilities while reducing the +amount of model parameters and computation. Extensive experiments on three +well-known datasets with four different conditions demonstrate HyperFormer's +effectiveness. Datasets and code are available at +https://github.com/zhiweihu1103/HKGC-HyperFormer. + +
+
+ comment: Accepted at CIKM'23 +
+
+
+
+
+ + ☆ AutoConv: Automatically Generating Information-seeking Conversations + with Large Language Models ACL 2023 + + +
+ Information-seeking conversation, which aims to help users gather information +through conversation, has achieved great progress in recent years. However, the +research is still stymied by the scarcity of training data. To alleviate this +problem, we propose AutoConv for synthetic conversation generation, which takes +advantage of the few-shot learning ability and generation capacity of large +language models (LLM). Specifically, we formulate the conversation generation +problem as a language modeling task, then finetune an LLM with a few human +conversations to capture the characteristics of the information-seeking process +and use it for generating synthetic conversations with high quality. +Experimental results on two frequently-used datasets verify that AutoConv has +substantial improvements over strong baselines and alleviates the dependence on +human annotation. In addition, we also provide several analysis studies to +promote future research. + +
+
+ comment: Accepted to ACL 2023 Main Conference (Short) +
+
+
+
+
+ + ☆ Three Ways of Using Large Language Models to Evaluate Chat + + +
+ This paper describes the systems submitted by team6 for ChatEval, the DSTC 11 +Track 4 competition. We present three different approaches to predicting +turn-level qualities of chatbot responses based on large language models +(LLMs). We report improvement over the baseline using dynamic few-shot examples +from a vector store for the prompts for ChatGPT. We also analyze the +performance of the other two approaches and report needed improvements for +future work. We developed the three systems over just two weeks, showing the +potential of LLMs for this task. An ablation study conducted after the +challenge deadline shows that the new Llama 2 models are closing the +performance gap between ChatGPT and open-source LLMs. However, we find that the +Llama 2 models do not benefit from few-shot examples in the same way as +ChatGPT. + +
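+ A sketch of the dynamic few-shot idea described above: retrieve the most
+similar annotated turns from a vector store and place them in the prompt. The
+embedding interface and prompt wording are illustrative assumptions:
+
+    import numpy as np
+
+    def build_scoring_prompt(query_turn, example_turns, example_scores,
+                             example_embs, embed_fn, k=4):
+        """Pick the k stored examples closest to the query and format them as
+        few-shot demonstrations for an LLM-based quality scorer."""
+        q = embed_fn(query_turn)
+        sims = example_embs @ q / (np.linalg.norm(example_embs, axis=1)
+                                   * np.linalg.norm(q) + 1e-8)
+        top = np.argsort(sims)[::-1][:k]
+        shots = "\n\n".join(f"Response: {example_turns[i]}\nQuality: {example_scores[i]}"
+                            for i in top)
+        return f"{shots}\n\nResponse: {query_turn}\nQuality:"
+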
+
+ comment: Accepted to DSTC11 workshop https://dstc11.dstc.community/ +
+
+
+
+
+ + ☆ NewsDialogues: Towards Proactive News Grounded Conversation ACL 2023 + + +
+ Hot news is one of the most popular topics in daily conversations. However, +news grounded conversation has long been stymied by the lack of well-designed +task definition and scarce data. In this paper, we propose a novel task, +Proactive News Grounded Conversation, in which a dialogue system can +proactively lead the conversation based on some key topics of the news. In +addition, both information-seeking and chit-chat scenarios are included +realistically, where the user may ask a series of questions about the news +details or express their opinions and be eager to chat. To further develop this +novel task, we collect a human-to-human Chinese dialogue dataset +\ts{NewsDialogues}, which includes 1K conversations with a total of 14.6K +utterances and detailed annotations for target topics and knowledge spans. +Furthermore, we propose a method named Predict-Generate-Rank, consisting of a +generator for grounded knowledge prediction and response generation, and a +ranker for the ranking of multiple responses to alleviate the exposure bias. We +conduct comprehensive experiments to demonstrate the effectiveness of the +proposed method and further present several key findings and challenges to +prompt future research. + +
+
+ comment: Accepted to ACL 2023 Conference (Long Paper; Findings) +
+
+
+
+
+ + ☆ Generating Faithful Text From a Knowledge Graph with Noisy Reference + Text + + +
+ Knowledge Graph (KG)-to-Text generation aims at generating fluent +natural-language text that accurately represents the information of a given +knowledge graph. While significant progress has been made in this task by +exploiting the power of pre-trained language models (PLMs) with appropriate +graph structure-aware modules, existing models still fall short of generating +faithful text, especially when the ground-truth natural-language text contains +additional information that is not present in the graph. In this paper, we +develop a KG-to-text generation model that can generate faithful +natural-language text from a given graph, in the presence of noisy reference +text. Our framework incorporates two core ideas: Firstly, we utilize +contrastive learning to enhance the model's ability to differentiate between +faithful and hallucinated information in the text, thereby encouraging the +decoder to generate text that aligns with the input graph. Secondly, we empower +the decoder to control the level of hallucination in the generated text by +employing a controllable text generation technique. We evaluate our model's +performance through the standard quantitative metrics as well as a +ChatGPT-based quantitative and qualitative analysis. Our evaluation +demonstrates the superior performance of our model over state-of-the-art +KG-to-text models on faithfulness. + +
+
+
+
+
+ + ☆ GPT-4 Is Too Smart To Be Safe: Stealthy Chat with LLMs via Cipher + + +
+ Safety lies at the core of the development of Large Language Models (LLMs). +There is ample work on aligning LLMs with human ethics and preferences, +including data filtering in pretraining, supervised fine-tuning, reinforcement +learning from human feedback, and red teaming, etc. In this study, we discover +that chat in cipher can bypass the safety alignment techniques of LLMs, which +are mainly conducted in natural languages. We propose a novel framework +CipherChat to systematically examine the generalizability of safety alignment +to non-natural languages -- ciphers. CipherChat enables humans to chat with +LLMs through cipher prompts topped with system role descriptions and few-shot +enciphered demonstrations. We use CipherChat to assess state-of-the-art LLMs, +including ChatGPT and GPT-4 for different representative human ciphers across +11 safety domains in both English and Chinese. Experimental results show that +certain ciphers succeed almost 100% of the time to bypass the safety alignment +of GPT-4 in several safety domains, demonstrating the necessity of developing +safety alignment for non-natural languages. Notably, we identify that LLMs seem +to have a ''secret cipher'', and propose a novel SelfCipher that uses only role +play and several demonstrations in natural language to evoke this capability. +SelfCipher surprisingly outperforms existing human ciphers in almost all cases. +Our code and data will be released at https://github.com/RobustNLP/CipherChat. + +
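+ For illustration, a tiny sketch of wrapping a query in one representative
+human cipher (a Caesar shift of three); the system-role wording and the
+handling of demonstrations are illustrative assumptions, not the paper's exact
+prompts:
+
+    def caesar(text, shift=3):
+        out = []
+        for ch in text:
+            if ch.isalpha():
+                base = ord('a') if ch.islower() else ord('A')
+                out.append(chr((ord(ch) - base + shift) % 26 + base))
+            else:
+                out.append(ch)
+        return "".join(out)
+
+    def cipher_prompt(user_query, demos, shift=3):
+        system = ("You are an expert on the Caesar cipher. "
+                  "We communicate only in Caesar cipher with a shift of three.")
+        enciphered = "\n".join(caesar(d, shift) for d in demos)
+        return f"{system}\n{enciphered}\n{caesar(user_query, shift)}"
+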
+
+ comment: 13 pages, 4 figures, 9 tables +
+
+
+
+
+ + ☆ Text-to-Video: a Two-stage Framework for Zero-shot Identity-agnostic + Talking-head Generation + + +
+ The advent of ChatGPT has introduced innovative methods for information +gathering and analysis. However, the information provided by ChatGPT is limited +to text, and the visualization of this information remains constrained. +Previous research has explored zero-shot text-to-video (TTV) approaches to +transform text into videos. However, these methods lacked control over the +identity of the generated audio, i.e., they were not identity-agnostic, which +hindered their effectiveness. To address this limitation, we propose a novel +two-stage framework for person-agnostic video cloning, specifically focusing on +TTV generation. In the first stage, we leverage pretrained zero-shot models to +achieve text-to-speech (TTS) conversion. In the second stage, an audio-driven +talking head generation method is employed to produce compelling videos from +the audio generated in the first stage. This paper presents a comparative +analysis of different TTS and audio-driven talking head generation methods, +identifying the most promising approach for future research and development. +Some audio and video samples can be found at the following link: +https://github.com/ZhichaoWang970201/Text-to-Video/tree/main. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ Demonstration-based learning for few-shot biomedical named entity + recognition under machine reading comprehension + + +
+ Although deep learning techniques have shown significant achievements, they +frequently depend on extensive amounts of hand-labeled data and tend to perform +inadequately in few-shot scenarios. The objective of this study is to devise a +strategy that can improve the model's capability to recognize biomedical +entities in scenarios of few-shot learning. By redefining biomedical named +entity recognition (BioNER) as a machine reading comprehension (MRC) problem, +we propose a demonstration-based learning method to address few-shot BioNER, +which involves constructing appropriate task demonstrations. In assessing our +proposed method, we compared the proposed method with existing advanced methods +using six benchmark datasets, including BC4CHEMD, BC5CDR-Chemical, +BC5CDR-Disease, NCBI-Disease, BC2GM, and JNLPBA. We examined the models' +efficacy by reporting F1 scores from both the 25-shot and 50-shot learning +experiments. In 25-shot learning, we observed 1.1% improvements in the average +F1 scores compared to the baseline method, reaching 61.7%, 84.1%, 69.1%, 70.1%, +50.6%, and 59.9% on six datasets, respectively. In 50-shot learning, we further +improved the average F1 scores by 1.0% compared to the baseline method, +reaching 73.1%, 86.8%, 76.1%, 75.6%, 61.7%, and 65.4%, respectively. We +reported that in the realm of few-shot learning BioNER, MRC-based language +models are much more proficient in recognizing biomedical entities compared to +the sequence labeling approach. Furthermore, our MRC-language models can +compete successfully with fully-supervised learning methodologies that rely +heavily on the availability of abundant annotated data. These results highlight +possible pathways for future advancements in few-shot BioNER methodologies. + +
+
+
+
+
+ + ☆ Simple Model Also Works: A Novel Emotion Recognition Network in Textual + Conversation Based on Curriculum Learning Strategy + + +
+ Emotion Recognition in Conversation (ERC) has emerged as a research hotspot +in domains such as conversational robots and question-answer systems. How to +efficiently and adequately retrieve contextual emotional cues has been one of +the key challenges in the ERC task. Existing efforts do not fully model the +context and employ complex network structures, resulting in excessive +computational resource overhead without substantial performance improvement. In +this paper, we propose a novel Emotion Recognition Network based on Curriculum +Learning strategy (ERNetCL). The proposed ERNetCL primarily consists of +Temporal Encoder (TE), Spatial Encoder (SE), and Curriculum Learning (CL) loss. +We utilize TE and SE to combine the strengths of previous methods in a +simplistic manner to efficiently capture temporal and spatial contextual +information in the conversation. To simulate the way humans learn curriculum +from easy to hard, we apply the idea of CL to the ERC task to progressively +optimize the network parameters of ERNetCL. At the beginning of training, we +assign lower learning weights to difficult samples. As the epoch increases, the +learning weights for these samples are gradually raised. Extensive experiments +on four datasets exhibit that our proposed method is effective and dramatically +beats other baseline models. + +
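+ A small sketch of the easy-to-hard loss weighting described above; the
+difficulty scores and the linear schedule are illustrative assumptions, not
+ERNetCL's exact formulation:
+
+    def curriculum_weighted_loss(per_sample_loss, difficulty, epoch, total_epochs):
+        """Down-weight hard samples early in training and raise their weight as
+        training proceeds (inputs are arrays/tensors of equal length;
+        difficulty is in [0, 1], with 1 the hardest)."""
+        progress = epoch / max(total_epochs - 1, 1)      # 0 -> 1 over training
+        weights = 1.0 - difficulty * (1.0 - progress)    # hard samples start low
+        return (weights * per_sample_loss).mean()
+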
+
+ comment: 12 pages,9 figures +
+
+
+
+
+ + ☆ Performance Prediction for Multi-hop Questions + + +
+ We study the problem of Query Performance Prediction (QPP) for open-domain +multi-hop Question Answering (QA), where the task is to estimate the difficulty +of evaluating a multi-hop question over a corpus. Despite the extensive +research on predicting the performance of ad-hoc and QA retrieval models, there +has been a lack of study on the estimation of the difficulty of multi-hop +questions. The problem is challenging due to the multi-step nature of the +retrieval process, potential dependency of the steps and the reasoning +involved. To tackle this challenge, we propose multHP, a novel pre-retrieval +method for predicting the performance of open-domain multi-hop questions. Our +extensive evaluation on the largest multi-hop QA dataset using several modern +QA systems shows that the proposed model is a strong predictor of the +performance, outperforming traditional single-hop QPP models. Additionally, we +demonstrate that our approach can be effectively used to optimize the +parameters of QA systems, such as the number of documents to be retrieved, +resulting in improved overall retrieval performance. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Emergent communication for AR + + +
+ Mobile augmented reality (MAR) is widely acknowledged as one of the
+ubiquitous interfaces to the digital twin and Metaverse, demanding unparalleled
+levels of latency, computational power, and energy efficiency. The existing
+solutions for realizing MAR combine multiple technologies like edge, cloud
+computing, and fifth-generation (5G) networks. However, the inherent
+communication latency of visual data imposes apparent limitations on the
+quality of experience (QoE). To address the challenge, we propose an emergent
+semantic communication framework to learn the communication protocols in MAR.
+Specifically, we train two agents through a modified Lewis signaling game so
+that a discrete communication protocol emerges spontaneously. Based on this
+protocol, the two agents can communicate the abstract idea of visual data
+through messages with extremely small data sizes over a noisy channel, which
+may introduce message errors. To better simulate real-world scenarios, we
+incorporate channel uncertainty into our training process. Experiments show
+that the proposed scheme generalizes better to unseen objects than the
+traditional object recognition used in MAR and can effectively enhance
+communication efficiency through the use of small-size messages.
+
+
+
+
+
+ + ♻ ☆ Learning Semantic Text Similarity to rank Hypernyms of Financial Terms + + +
+ Over the years, there has been a paradigm shift in how users access financial
+services. With the advancement of digitalization, more users prefer to perform
+financial activities online. This has led to the generation of a huge volume of
+financial content. Most investors prefer to go through this content before
+making decisions. Every industry has terms that are specific to the domain it
+operates in, and Banking and Financial Services are no exception. In order to
+fully comprehend this content, one needs to have a thorough understanding of
+the financial terms. Getting a basic idea about a term becomes easy when it is
+explained with the help of the broad category to which it belongs. This broad
+category is referred to as a hypernym. For example, "bond" is a hypernym of the
+financial term "alternative debenture". In this paper, we propose a system
+capable of extracting and ranking hypernyms for a given financial term. The
+system has been trained with financial text corpora obtained from various
+sources like DBpedia [4], Investopedia, Financial Industry Business Ontology
+(FIBO), prospectuses and so on. Embeddings of these terms have been extracted
+using FinBERT [3], FinISH [1] and fine-tuned using SentenceBERT [54]. A novel
+approach has been used to augment the training set with negative samples, using
+the hierarchy present in FIBO. Finally, we benchmark the system's performance
+against that of existing systems. We establish that it performs better than the
+existing ones and is also scalable.
+
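A rough sketch of embedding-based hypernym ranking, assuming a sentence-transformers model as a stand-in encoder; the model name and the candidate list are placeholders rather than the paper's actual FinBERT/FinISH setup.

```python
from sentence_transformers import SentenceTransformer, util

# Rough sketch of embedding-based hypernym ranking: encode the query term and
# candidate hypernyms, then rank candidates by cosine similarity. The model
# name and candidate list are placeholders, not the paper's exact setup.
model = SentenceTransformer("all-MiniLM-L6-v2")

term = "alternative debenture"
candidates = ["bond", "equity", "insurance", "derivative", "loan"]

term_emb = model.encode(term, convert_to_tensor=True)
cand_embs = model.encode(candidates, convert_to_tensor=True)

scores = util.cos_sim(term_emb, cand_embs)[0]
ranking = sorted(zip(candidates, scores.tolist()), key=lambda x: -x[1])
print(ranking)  # candidate hypernyms ordered by similarity to the query term
```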
+
+ comment: Our code base: + https://github.com/sohomghosh/FinSim_Financial_Hypernym_detection +
+
+
+
+
+ + ♻ ☆ Breaking Common Sense: WHOOPS! A Vision-and-Language Benchmark of + Synthetic and Compositional Images ICCV 2023 + + +
+ Weird, unusual, and uncanny images pique the curiosity of observers because +they challenge commonsense. For example, an image released during the 2022 +world cup depicts the famous soccer stars Lionel Messi and Cristiano Ronaldo +playing chess, which playfully violates our expectation that their competition +should occur on the football field. Humans can easily recognize and interpret +these unconventional images, but can AI models do the same? We introduce +WHOOPS!, a new dataset and benchmark for visual commonsense. The dataset is +comprised of purposefully commonsense-defying images created by designers using +publicly-available image generation tools like Midjourney. We consider several +tasks posed over the dataset. In addition to image captioning, cross-modal +matching, and visual question answering, we introduce a difficult explanation +generation task, where models must identify and explain why a given image is +unusual. Our results show that state-of-the-art models such as GPT3 and BLIP2 +still lag behind human performance on WHOOPS!. We hope our dataset will inspire +the development of AI models with stronger visual commonsense reasoning +abilities. Data, models and code are available at the project website: +whoops-benchmark.github.io + +
+
+ comment: Accepted to ICCV 2023. Website: whoops-benchmark.github.io +
+
+
+
+
+ + ♻ ☆ Efficient Guided Generation for Large Language Models + + +
+ In this article we show how the problem of neural text generation can be +constructively reformulated in terms of transitions between the states of a +finite-state machine. This framework leads to an efficient approach to guiding +text generation with regular expressions and context-free grammars by allowing +the construction of an index over a language model's vocabulary. The approach +is model agnostic, allows one to enforce domain-specific knowledge and +constraints, and enables the construction of reliable interfaces by +guaranteeing the structure of the generated text. It adds little overhead to +the token sequence generation process and significantly outperforms existing +solutions. An implementation is provided in the open source Python library +Outlines + +
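The indexing idea can be illustrated with a toy hand-built finite-state machine: compile the constraint into states and precompute, for every state, the vocabulary tokens that keep the machine alive. This is only a simplified sketch of the approach described above, not the Outlines implementation.

```python
# Toy illustration of the FSM indexing idea behind guided generation: compile a
# constraint into states, then precompute, for every state, which vocabulary
# tokens keep the machine in a live state. A real implementation derives the
# automaton from a regular expression or grammar; this one is hand-built.

# DFA for strings matching \d+("."\d+)? : 0=start, 1=integer part, 2=dot seen, 3=fraction
TRANSITIONS = {
    (0, "digit"): 1, (1, "digit"): 1, (1, "dot"): 2,
    (2, "digit"): 3, (3, "digit"): 3,
}

def step(state, token):
    """Advance the DFA over a token; return None if the constraint is violated."""
    for ch in token:
        kind = "digit" if ch.isdigit() else "dot" if ch == "." else "other"
        state = TRANSITIONS.get((state, kind))
        if state is None:
            return None
    return state

VOCAB = ["1", "23", ".", "3.5", "abc", "007"]

# Index: for each DFA state, the tokens that do not kill the machine.
index = {s: [t for t in VOCAB if step(s, t) is not None] for s in range(4)}
print(index[0])  # tokens allowed at the start: ['1', '23', '3.5', '007']
print(index[1])  # tokens allowed after some digits: ['1', '23', '.', '3.5', '007']
```

During decoding, the generator would mask the logits of every token not listed for the current state, which is what makes the approach cheap at generation time.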
+
+
+
+
+ + ♻ ☆ MathBERT: A Pre-trained Language Model for General NLP Tasks in + Mathematics Education NeurIPS 2021 + + +
+ Since the introduction of the original BERT (i.e., BASE BERT), researchers
+have developed various customized BERT models with improved performance for
+specific domains and tasks by exploiting the benefits of transfer learning. Due
+to the nature of mathematical texts, which often use domain-specific vocabulary
+along with equations and math symbols, we posit that the development of a new
+BERT model for mathematics would be useful for many mathematical downstream
+tasks. In this resource paper, we introduce our multi-institutional effort
+(i.e., two learning platforms and three academic institutions in the US) toward
+this need: MathBERT, a model created by pre-training the BASE BERT model on a
+large mathematical corpus ranging from pre-kindergarten (pre-k), to
+high-school, to college graduate level mathematical content. In addition, we
+select three general NLP tasks that are often used in mathematics education:
+prediction of knowledge component, auto-grading open-ended Q&A, and knowledge
+tracing, to demonstrate the superiority of MathBERT over BASE BERT. Our
+experiments show that MathBERT outperforms prior best methods by 1.2-22% and
+BASE BERT by 2-8% on these tasks. In addition, we build a mathematics-specific
+vocabulary 'mathVocab' to train with MathBERT. We discover that MathBERT
+pre-trained with 'mathVocab' outperforms MathBERT trained with the BASE BERT
+vocabulary (i.e., 'origVocab'). MathBERT is currently being adopted at the
+participating learning platforms: Stride, Inc., a commercial educational
+resource provider, and ASSISTments.org, a free online educational platform. We
+release MathBERT for public usage at: https://github.com/tbs17/MathBERT.
+
+
+ comment: Accepted by NeurIPS 2021 MATHAI4ED Workshop (Best Paper) +
+
+
+
+
+ + ♻ ☆ A Stitch in Time Saves Nine: Detecting and Mitigating Hallucinations of + LLMs by Validating Low-Confidence Generation + + +
+ Recently developed large language models have achieved remarkable success in +generating fluent and coherent text. However, these models often tend to +'hallucinate' which critically hampers their reliability. In this work, we +address this crucial problem and propose an approach that actively detects and +mitigates hallucinations during the generation process. Specifically, we first +identify the candidates of potential hallucination leveraging the model's logit +output values, check their correctness through a validation procedure, mitigate +the detected hallucinations, and then continue with the generation process. +Through extensive experiments with GPT-3.5 (text-davinci-003) on the 'article +generation task', we first demonstrate the individual efficacy of our detection +and mitigation techniques. Specifically, the detection technique achieves a +recall of ~88% and the mitigation technique successfully mitigates 57.6% of the +correctly detected hallucinations. Importantly, our mitigation technique does +not introduce new hallucinations even in the case of incorrectly detected +hallucinations, i.e., false positives. Then, we show that the proposed active +detection and mitigation approach successfully reduces the hallucinations of +the GPT-3.5 model from 47.5% to 14.5% on average. We further demonstrate the +effectiveness and wide applicability of our approach through additional studies +including performance on different types of questions (multi-hop and false +premise questions) and with another LLM from a different model family (Vicuna). +In summary, our work contributes to improving the reliability and +trustworthiness of large language models, a crucial step en route to enabling +their widespread adoption in real-world applications. + +
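A minimal sketch of the detection step, assuming access to the per-token probabilities of the generator: tokens whose probability falls below a threshold are flagged as hallucination candidates to be validated before generation continues. The threshold and toy numbers are illustrative.

```python
import torch

def low_confidence_positions(token_ids: torch.Tensor, logits: torch.Tensor, threshold: float = 0.5):
    """Return positions of generated tokens whose probability is below `threshold`."""
    probs = torch.softmax(logits, dim=-1)                       # (seq_len, vocab_size)
    token_probs = probs[torch.arange(token_ids.shape[0]), token_ids]
    return (token_probs < threshold).nonzero(as_tuple=True)[0], token_probs

# Toy example: 3 generated tokens over a vocabulary of 5.
logits = torch.tensor([[4.0, 0.0, 0.0, 0.0, 0.0],
                       [0.1, 0.2, 0.0, 0.1, 0.0],
                       [0.0, 0.0, 5.0, 0.0, 0.0]])
token_ids = torch.tensor([0, 1, 2])
positions, probs = low_confidence_positions(token_ids, logits)
print(positions)   # position 1 is a low-confidence candidate to validate before continuing
```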
+
+ comment: update to include additional experiments +
+
+
+
+
+ + ♻ ☆ Mismatching-Aware Unsupervised Translation Quality Estimation For + Low-Resource Languages + + +
+ Translation Quality Estimation (QE) is the task of predicting the quality of +machine translation (MT) output without any reference. This task has gained +increasing attention as an important component in the practical applications of +MT. In this paper, we first propose XLMRScore, which is a cross-lingual +counterpart of BERTScore computed via the XLM-RoBERTa (XLMR) model. This metric +can be used as a simple unsupervised QE method, while employing it results in +two issues: firstly, the untranslated tokens leading to unexpectedly high +translation scores, and secondly, the issue of mismatching errors between +source and hypothesis tokens when applying the greedy matching in XLMRScore. To +mitigate these issues, we suggest replacing untranslated words with the unknown +token and the cross-lingual alignment of the pre-trained model to represent +aligned words closer to each other, respectively. We evaluate the proposed +method on four low-resource language pairs of WMT21 QE shared task, as well as +a new English-Farsi test dataset introduced in this paper. Experiments show +that our method could get comparable results with the supervised baseline for +two zero-shot scenarios, i.e., with less than 0.01 difference in Pearson +correlation, while outperforming unsupervised rivals in all the low-resource +language pairs for above 8%, on average. + +
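For illustration, a BERTScore-style score with greedy matching might look like the sketch below, where random vectors stand in for XLM-RoBERTa token embeddings; the real XLMRScore additionally handles untranslated tokens and cross-lingual alignment as described above.

```python
import numpy as np

# Sketch of a BERTScore-style score with greedy matching: each hypothesis token
# embedding is matched to its most similar source token embedding and the
# similarities are averaged. Random vectors stand in for XLM-R embeddings here.
def greedy_match_score(src_emb: np.ndarray, hyp_emb: np.ndarray) -> float:
    src = src_emb / np.linalg.norm(src_emb, axis=1, keepdims=True)
    hyp = hyp_emb / np.linalg.norm(hyp_emb, axis=1, keepdims=True)
    sim = hyp @ src.T                     # (hyp_len, src_len) cosine similarities
    return float(sim.max(axis=1).mean())  # greedily match each hypothesis token

rng = np.random.default_rng(0)
source_tokens = rng.normal(size=(7, 768))      # e.g. source-sentence token embeddings
hypothesis_tokens = rng.normal(size=(5, 768))  # e.g. MT-output token embeddings
print(greedy_match_score(source_tokens, hypothesis_tokens))
```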
+
+ comment: Submitted to Language Resources and Evaluation +
+
+
+
+
+ + ♻ ☆ Prompting the Hidden Talent of Web-Scale Speech Models for Zero-Shot + Task Generalization + + +
+ We investigate the emergent abilities of the recently proposed web-scale +speech model Whisper, by adapting it to unseen tasks with prompt engineering. +We selected three tasks: audio-visual speech recognition (AVSR), code-switched +speech recognition (CS-ASR), and speech translation (ST) on unseen language +pairs. We design task-specific prompts, by either leveraging another +large-scale model, or simply manipulating the special tokens in the default +prompts. Experiments show that compared to the default prompts, our proposed +prompts improve performance by 10% to 45% on the three zero-shot tasks, and +even outperform SotA supervised models on some datasets. In addition, our +experiments reveal many interesting properties of Whisper, including its +robustness to prompts, bias on accents, and the multilingual understanding in +its latent space. Code is available at +https://github.com/jasonppy/PromptingWhisper + +
+
+ comment: Interspeech 2023 +
+
+
+
+
+ + ♻ ☆ A Bi-directional Multi-hop Inference Model for Joint Dialog Sentiment + Classification and Act Recognition NLPCC 2023 + + +
+ The joint task of Dialog Sentiment Classification (DSC) and Act Recognition
+(DAR) aims to predict the sentiment label and act label for each utterance in a
+dialog simultaneously. However, current methods encode the dialog context in
+only one direction, which limits their ability to thoroughly comprehend the
+context. Moreover, these methods overlook the explicit correlations between
+sentiment and act labels, which leads to an insufficient ability to capture
+rich sentiment and act clues and hinders effective and accurate reasoning. To
+address these issues, we propose a Bi-directional Multi-hop Inference Model
+(BMIM) that leverages a feature selection network and a bi-directional
+multi-hop inference network to iteratively extract and integrate rich sentiment
+and act clues in a bi-directional manner. We also employ contrastive learning
+and dual learning to explicitly model the correlations of sentiment and act
+labels. Our experiments on two widely-used datasets show that BMIM outperforms
+state-of-the-art baselines by at least 2.6% on F1 score in DAR and 1.4% on F1
+score in DSC. Additionally, our proposed model not only improves the
+performance but also enhances the interpretability of the joint sentiment and
+act prediction task.
+
+
+ comment: Accepted by NLPCC 2023 +
+
+
+
+
+ + ♻ ☆ DialogRE^C+: An Extension of DialogRE to Investigate How Much + Coreference Helps Relation Extraction in Dialogs NLPCC 2023 + + +
+ Dialogue relation extraction (DRE), which identifies the relations between
+argument pairs in dialogue text, suffers greatly from the frequent occurrence
+of personal pronouns and entity and speaker coreference. This work introduces a
+new benchmark dataset, DialogRE^C+, which brings coreference resolution into
+the DRE scenario. With the aid of high-quality coreference knowledge, the
+reasoning over argument relations is expected to be enhanced. In the
+DialogRE^C+ dataset, we manually annotate a total of 5,068 coreference chains
+over 36,369 argument mentions based on the existing DialogRE data, where four
+different coreference chain types, namely speaker, person, location and
+organization chains, are explicitly marked. We further develop 4
+coreference-enhanced graph-based DRE models, which learn effective coreference
+representations for improving the DRE task. We also train a coreference
+resolution model based on our annotations and evaluate the effect of
+automatically extracted coreference chains, demonstrating the practicality of
+our dataset and its potential for other domains and tasks.
+
+
+ comment: Accepted by NLPCC 2023 +
+
+
+
+
+ + ♻ ☆ Revisiting Disentanglement and Fusion on Modality and Context in + Conversational Multimodal Emotion Recognition ACM MM 2023 + + +
+ It has been a hot research topic to enable machines to understand human +emotions in multimodal contexts under dialogue scenarios, which is tasked with +multimodal emotion analysis in conversation (MM-ERC). MM-ERC has received +consistent attention in recent years, where a diverse range of methods has been +proposed for securing better task performance. Most existing works treat MM-ERC +as a standard multimodal classification problem and perform multimodal feature +disentanglement and fusion for maximizing feature utility. Yet after revisiting +the characteristic of MM-ERC, we argue that both the feature multimodality and +conversational contextualization should be properly modeled simultaneously +during the feature disentanglement and fusion steps. In this work, we target +further pushing the task performance by taking full consideration of the above +insights. On the one hand, during feature disentanglement, based on the +contrastive learning technique, we devise a Dual-level Disentanglement +Mechanism (DDM) to decouple the features into both the modality space and +utterance space. On the other hand, during the feature fusion stage, we propose +a Contribution-aware Fusion Mechanism (CFM) and a Context Refusion Mechanism +(CRM) for multimodal and context integration, respectively. They together +schedule the proper integrations of multimodal and context features. +Specifically, CFM explicitly manages the multimodal feature contributions +dynamically, while CRM flexibly coordinates the introduction of dialogue +contexts. On two public MM-ERC datasets, our system achieves new +state-of-the-art performance consistently. Further analyses demonstrate that +all our proposed mechanisms greatly facilitate the MM-ERC task by making full +use of the multimodal and context features adaptively. Note that our proposed +methods have the great potential to facilitate a broader range of other +conversational multimodal tasks. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Constructing Holistic Spatio-Temporal Scene Graph for Video Semantic + Role Labeling ACM MM 2023 + + +
+ Video Semantic Role Labeling (VidSRL) aims to detect the salient events in
+given videos, by recognizing the predicate-argument event structures and the
+interrelationships between events. While recent endeavors have put forth
+methods for VidSRL, they mostly suffer from two key drawbacks: the lack of
+fine-grained spatial scene perception and the insufficient modeling of video
+temporality. Towards this end, this work explores a novel holistic
+spatio-temporal scene graph (namely HostSG) representation based on the
+existing dynamic scene graph structures, which models both the fine-grained
+spatial semantics and the temporal dynamics of videos for VidSRL. Built upon
+the HostSG, we present a niche-targeting VidSRL framework. A scene-event
+mapping mechanism is first designed to bridge the gap between the underlying
+scene structure and the high-level event semantic structure, resulting in an
+overall hierarchical scene-event (termed ICE) graph structure. We further
+perform iterative structure refinement to optimize the ICE graph, such that the
+overall structure representation can best coincide with the end-task demands.
+Finally, three subtask predictions of VidSRL are jointly decoded, where the
+end-to-end paradigm effectively avoids error propagation. On the benchmark
+dataset, our framework improves significantly over the current best-performing
+model. Further analyses are provided for a better understanding of the
+advantages of our method.
+
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Summaries as Captions: Generating Figure Captions for Scientific + Documents with Automated Text Summarization + + +
+ Good figure captions help paper readers understand complex scientific +figures. Unfortunately, even published papers often have poorly written +captions. Automatic caption generation could aid paper writers by providing +good starting captions that can be refined for better quality. Prior work often +treated figure caption generation as a vision-to-language task. In this paper, +we show that it can be more effectively tackled as a text summarization task in +scientific documents. We fine-tuned PEGASUS, a pre-trained abstractive +summarization model, to specifically summarize figure-referencing paragraphs +(e.g., "Figure 3 shows...") into figure captions. Experiments on large-scale +arXiv figures show that our method outperforms prior vision methods in both +automatic and human evaluations. We further conducted an in-depth investigation +focused on two key challenges: (i) the common presence of low-quality +author-written captions and (ii) the lack of clear standards for good captions. +Our code and data are available at: +https://github.com/Crowd-AI-Lab/Generating-Figure-Captions-as-a-Text-Summarization-Task. + +
+
+ comment: Accepted by INLG-2023 +
+
+
+
+
+ + ♻ ☆ Answering Unseen Questions With Smaller Language Models Using Rationale + Generation and Dense Retrieval + + +
+ When provided with sufficient explanatory context, smaller Language Models +have been shown to exhibit strong reasoning ability on challenging short-answer +question-answering tasks where the questions are unseen in training. We +evaluate two methods for further improvement in this setting. Both methods +focus on combining rationales generated by a larger Language Model with longer +contexts created from a multi-hop dense retrieval system. The first method +($\textit{RR}$) involves training a Rationale Ranking model to score both +generated rationales and retrieved contexts with respect to relevance and +truthfulness. We then use the scores to derive combined contexts from both +knowledge sources using a number of combinatory strategies. For the second +method ($\textit{RATD}$) we train a smaller Reasoning model using +retrieval-augmented training datasets such that it becomes proficient at +utilising relevant information from longer text sequences that may be only +partially evidential and frequently contain many irrelevant sentences. +Generally we find that both methods are effective but that the $\textit{RATD}$ +method is more straightforward to apply and produces the strongest results in +the unseen setting on which we focus. Our single best Reasoning model using +only 440 million parameters materially improves upon strong comparable prior +baselines for unseen evaluation datasets (StrategyQA 58.9 $\rightarrow$ 61.7 +acc., CommonsenseQA 63.6 $\rightarrow$ 72.7 acc., ARC-DA 31.6 $\rightarrow$ +52.1 F1, IIRC 25.5 $\rightarrow$ 27.3 F1) and a version utilising our prior +knowledge of each type of question in selecting a context combination strategy +does even better. Our proposed models also generally outperform direct prompts +against much larger models (BLOOM 175B and StableVicuna 13B) in both few-shot +chain-of-thought and few-shot answer-only settings. + +
+
+
+
+
+
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ Contrastive Learning for Cross-modal Artist Retrieval + + +
+ Music retrieval and recommendation applications often rely on content +features encoded as embeddings, which provide vector representations of items +in a music dataset. Numerous complementary embeddings can be derived from +processing items originally represented in several modalities, e.g., audio +signals, user interaction data, or editorial data. However, data of any given +modality might not be available for all items in any music dataset. In this +work, we propose a method based on contrastive learning to combine embeddings +from multiple modalities and explore the impact of the presence or absence of +embeddings from diverse modalities in an artist similarity task. Experiments on +two datasets suggest that our contrastive method outperforms single-modality +embeddings and baseline algorithms for combining modalities, both in terms of +artist retrieval accuracy and coverage. Improvements with respect to other +methods are particularly significant for less popular query artists. We +demonstrate our method successfully combines complementary information from +diverse modalities, and is more robust to missing modality data (i.e., it +better handles the retrieval of artists with different modality embeddings than +the query artist's). + +
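A common way to realize such cross-modal combination is a symmetric InfoNCE-style contrastive loss that pulls together embeddings of the same artist coming from two modalities; the sketch below assumes this formulation, with illustrative dimensions and temperature rather than the paper's exact objective.

```python
import torch
import torch.nn.functional as F

# Sketch of a symmetric InfoNCE-style contrastive loss for pulling together
# embeddings of the same artist from two modalities (e.g. audio and editorial
# metadata) while pushing apart embeddings of different artists.
def contrastive_loss(emb_a: torch.Tensor, emb_b: torch.Tensor, temperature: float = 0.07):
    a = F.normalize(emb_a, dim=-1)
    b = F.normalize(emb_b, dim=-1)
    logits = a @ b.T / temperature                   # (batch, batch) similarity matrix
    targets = torch.arange(a.shape[0])               # matching pairs lie on the diagonal
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.T, targets))

audio_emb = torch.randn(8, 128)       # 8 artists, audio-derived embeddings
metadata_emb = torch.randn(8, 128)    # the same 8 artists, metadata-derived embeddings
print(contrastive_loss(audio_emb, metadata_emb))
```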
+
+
+
+
+ + ☆ Context-aware Event Forecasting via Graph Disentanglement KDD 2023 + + +
+ Event forecasting has been a demanding and challenging task throughout human
+history. It plays a pivotal role in crisis alerting and disaster prevention
+across many aspects of society. The task of event forecasting aims to model the
+relational and temporal patterns of historical events and to forecast what will
+happen in the future. Most existing studies on event forecasting formulate it
+as a problem of link prediction on temporal event graphs. However, such a
+purely structured formulation suffers from two main limitations: 1) most events
+fall into general and high-level types in the event ontology, and therefore
+they tend to be coarse-grained and offer little utility, which inevitably harms
+forecasting accuracy; and 2) the events defined by a fixed ontology are unable
+to retain out-of-ontology contextual information. To address these limitations,
+we propose a novel task of context-aware event forecasting which incorporates
+auxiliary contextual information. First, the categorical context provides
+supplementary fine-grained information to the coarse-grained events. Second and
+more importantly, the context provides additional information about the
+specific situation and conditions, which is crucial or even decisive for what
+will happen next. However, it is challenging to properly integrate context into
+the event forecasting framework, considering the complex patterns in the
+multi-context scenario. Towards this end, we design a novel framework named
+Separation and Collaboration Graph Disentanglement (short as SeCoGD) for
+context-aware event forecasting. Since there is no available dataset for this
+novel task, we construct three large-scale datasets based on GDELT.
+Experimental results demonstrate that our model outperforms a list of SOTA
+methods.
+
+
+ comment: KDD 2023, 9 pages, 7 figures, 4 tables +
+
+
+
+
+ + ☆ Performance Prediction for Multi-hop Questions + + +
+ We study the problem of Query Performance Prediction (QPP) for open-domain +multi-hop Question Answering (QA), where the task is to estimate the difficulty +of evaluating a multi-hop question over a corpus. Despite the extensive +research on predicting the performance of ad-hoc and QA retrieval models, there +has been a lack of study on the estimation of the difficulty of multi-hop +questions. The problem is challenging due to the multi-step nature of the +retrieval process, potential dependency of the steps and the reasoning +involved. To tackle this challenge, we propose multHP, a novel pre-retrieval +method for predicting the performance of open-domain multi-hop questions. Our +extensive evaluation on the largest multi-hop QA dataset using several modern +QA systems shows that the proposed model is a strong predictor of the +performance, outperforming traditional single-hop QPP models. Additionally, we +demonstrate that our approach can be effectively used to optimize the +parameters of QA systems, such as the number of documents to be retrieved, +resulting in improved overall retrieval performance. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Stationary Algorithmic Balancing For Dynamic Email Re-Ranking Problem KDD'23 + + +
+ Email platforms need to generate personalized rankings of emails that satisfy +user preferences, which may vary over time. We approach this as a +recommendation problem based on three criteria: closeness (how relevant the +sender and topic are to the user), timeliness (how recent the email is), and +conciseness (how brief the email is). We propose MOSR (Multi-Objective +Stationary Recommender), a novel online algorithm that uses an adaptive control +model to dynamically balance these criteria and adapt to preference changes. We +evaluate MOSR on the Enron Email Dataset, a large collection of real emails, +and compare it with other baselines. The results show that MOSR achieves better +performance, especially under non-stationary preferences, where users value +different criteria more or less over time. We also test MOSR's robustness on a +smaller down-sampled dataset that exhibits high variance in email +characteristics, and show that it maintains stable rankings across different +samples. Our work offers novel insights into how to design email re-ranking +systems that account for multiple objectives impacting user satisfaction. + +
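A simplified sketch of multi-objective re-ranking: each email is scored as a weighted blend of closeness, timeliness, and conciseness, and the weights can be adjusted as preferences drift. The static weights below are an illustrative placeholder for MOSR's adaptive control model.

```python
from dataclasses import dataclass

# Simplified sketch of multi-objective email re-ranking. The fixed weights here
# stand in for an adaptive controller that would update them over time.
@dataclass
class Email:
    subject: str
    closeness: float    # relevance of sender/topic to the user, in [0, 1]
    timeliness: float   # recency, in [0, 1]
    conciseness: float  # brevity, in [0, 1]

def rank(emails, weights):
    score = lambda e: (weights["closeness"] * e.closeness
                       + weights["timeliness"] * e.timeliness
                       + weights["conciseness"] * e.conciseness)
    return sorted(emails, key=score, reverse=True)

inbox = [
    Email("Quarterly report", 0.9, 0.2, 0.3),
    Email("Lunch today?", 0.4, 0.9, 0.9),
    Email("Newsletter", 0.1, 0.7, 0.2),
]
weights = {"closeness": 0.5, "timeliness": 0.3, "conciseness": 0.2}
print([e.subject for e in rank(inbox, weights)])
```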
+
+ comment: Published in KDD'23 +
+
+
+
+
+ + ♻ ☆ Social4Rec: Distilling User Preference from Social Graph for Video + Recommendation in Tencent + + +
+ Although recommender systems play a key role in network content platforms,
+mining users' interests is still a significant challenge. Existing works
+predict user interest by utilizing user behaviors, i.e., clicks, views, etc.,
+but current solutions are ineffective when users perform unsettled activities.
+The latter include new users, who have few activities of any kind, and sparse
+users, who have low-frequency behaviors. We uniformly describe both these user
+types as "cold users", which are very common but often neglected in network
+content platforms. To address this issue, we enhance the representation of user
+interest by combining social interests, e.g., friendships, followed bloggers,
+interest groups, etc., with activity behaviors. Thus, in this work, we present
+a novel algorithm entitled SocialNet, which adopts a two-stage method to
+progressively extract coarse-grained and fine-grained social interests. Our
+technique then concatenates SocialNet's output with the original user
+representation to get the final user representation that combines behavior
+interests and social interests. Offline experiments on Tencent video's
+recommender system demonstrate its superiority over the baseline behavior-based
+model. The online experiment also shows a significant performance improvement
+in clicks and view time in the real-world recommendation system. The source
+code is available at https://github.com/Social4Rec/SocialNet.
+
+
+
+
+
+
+
+
+ + Multimedia 1 + +
+
+
+ + ☆ A One-dimensional HEVC video steganalysis method using the Optimality of + Predicted Motion Vectors + + +
+ Among steganalysis techniques, detection of motion vector (MV) domain-based
+video steganography in the High Efficiency Video Coding (HEVC) standard remains
+a hot and challenging issue. For the purpose of improving the detection
+performance, this paper proposes a one-dimensional steganalysis feature based
+on the optimality of predicted MVs. Firstly, we point out that the motion
+vector prediction (MVP) of a prediction unit (PU) encoded using the Advanced
+Motion Vector Prediction (AMVP) technique satisfies local optimality in the
+cover video. Secondly, we show that in HEVC video, message embedding using
+either the MVP index or motion vector differences (MVD) may destroy the above
+optimality of the MVP. We then define the optimal rate of MVP in HEVC video as
+a steganalysis feature. Finally, we conduct steganalysis detection experiments
+on two general datasets for three popular steganography methods and compare the
+performance with four state-of-the-art steganalysis methods. The experimental
+results show that the proposed optimal rate of MVP for all cover videos is
+100\%, while the optimal rate of MVP for all stego videos is less than 100\%.
+Therefore, the proposed steganalysis scheme can accurately distinguish between
+cover videos and stego videos, and it can be efficiently applied to practical
+scenarios, requiring no model training and having low computational complexity.
+
+
+ comment: Submitted to TCSVT +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 49 + +
+
+
+ + ☆ Self-Alignment with Instruction Backtranslation + + +
+ We present a scalable method to build a high quality instruction following +language model by automatically labelling human-written text with corresponding +instructions. Our approach, named instruction backtranslation, starts with a +language model finetuned on a small amount of seed data, and a given web +corpus. The seed model is used to construct training examples by generating +instruction prompts for web documents (self-augmentation), and then selecting +high quality examples from among these candidates (self-curation). This data is +then used to finetune a stronger model. Finetuning LLaMa on two iterations of +our approach yields a model that outperforms all other LLaMa-based models on +the Alpaca leaderboard not relying on distillation data, demonstrating highly +effective self-alignment. + +
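At a high level, the self-augmentation and self-curation loop might be organized as in the sketch below, where `generate`, `score`, and `finetune` are placeholder hooks for model calls rather than a concrete API.

```python
# High-level sketch of the instruction-backtranslation loop: the seed model
# generates candidate instructions for unlabelled web documents
# (self-augmentation) and then scores the resulting pairs so that only the best
# ones are kept for finetuning (self-curation). `generate`, `score`, and
# `finetune` are illustrative placeholders, not a concrete API.
def instruction_backtranslation(seed_model, web_documents, iterations=2, threshold=4.0):
    model = seed_model
    for _ in range(iterations):
        # self-augmentation: predict an instruction that each document answers
        candidates = [(model.generate("Write an instruction for which this is "
                                      f"the answer:\n{doc}"), doc)
                      for doc in web_documents]
        # self-curation: keep only pairs the current model rates highly
        curated = [(inst, doc) for inst, doc in candidates
                   if model.score(f"Rate this instruction/answer pair:\n{inst}\n{doc}") >= threshold]
        # finetune a stronger model on the curated pairs and iterate
        model = model.finetune(curated)
    return model
```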
+
+
+
+
+ + ☆ KETM:A Knowledge-Enhanced Text Matching method IJCNN 2023 + + +
+ Text matching is the task of matching two texts and determining the
+relationship between them, which has extensive applications in natural language
+processing tasks such as reading comprehension and question-answering systems.
+The mainstream approach is to compute text representations or to interact with
+the text through an attention mechanism, which is effective in text matching
+tasks. However, the performance of these models is insufficient for texts that
+require commonsense knowledge-based reasoning. To this end, in this paper, we
+introduce a new model for text matching called the Knowledge Enhanced Text
+Matching model (KETM), to enrich contextual representations with real-world
+common-sense knowledge from external knowledge sources to enhance our model's
+understanding and reasoning. First, we use Wiktionary to retrieve the text word
+definitions as our external knowledge. Second, we feed text and knowledge to
+the text matching module to extract their feature vectors. The text matching
+module is used as an interaction module by integrating the encoder layer, the
+co-attention layer, and the aggregation layer. Specifically, the interaction
+process is iterated several times to obtain in-depth interaction information
+and extract the feature vectors of text and knowledge by multi-angle pooling.
+Then, we fuse text and knowledge using a gating mechanism to learn the ratio of
+text and knowledge fusion via a neural network, which prevents noise introduced
+by the knowledge. After that, experimental validation is carried out on four
+datasets, and the results show that our proposed model performs well on all
+four datasets and improves over the base model without external knowledge,
+which validates the effectiveness of our proposed method. The code is available
+at https://github.com/1094701018/KETM
+
+
+ comment: Accepted to IJCNN 2023 +
+
+
+
+
+ + ☆ A Large Language Model Enhanced Conversational Recommender System + + +
+ Conversational recommender systems (CRSs) aim to recommend high-quality items
+to users through a dialogue interface. They usually contain multiple sub-tasks,
+such as user preference elicitation, recommendation, explanation, and item
+information search. To develop effective CRSs, there are some challenges: 1)
+how to properly manage sub-tasks; 2) how to effectively solve different
+sub-tasks; and 3) how to correctly generate responses that interact with users.
+Recently, Large Language Models (LLMs) have exhibited an unprecedented ability
+to reason and generate, presenting a new opportunity to develop more powerful
+CRSs. In this work, we propose a new LLM-based CRS, referred to as LLMCRS, to
+address the above challenges. For sub-task management, we leverage the
+reasoning ability of the LLM to effectively manage sub-tasks. For sub-task
+solving, we pair the LLM with expert models for the different sub-tasks to
+achieve enhanced performance. For response generation, we utilize the
+generation ability of the LLM as a language interface to better interact with
+users. Specifically, LLMCRS divides the workflow into four stages: sub-task
+detection, model matching, sub-task execution, and response generation. LLMCRS
+also designs schema-based instruction, demonstration-based instruction, dynamic
+sub-task and model matching, and summary-based generation to instruct the LLM
+to generate desired results in the workflow. Finally, to adapt the LLM to
+conversational recommendations, we also propose to fine-tune the LLM with
+reinforcement learning from CRS performance feedback, referred to as RLPF.
+Experimental results on benchmark datasets show that LLMCRS with RLPF
+outperforms the existing methods.
+
+
+
+
+
+ + ☆ Thinking Like an Expert:Multimodal Hypergraph-of-Thought (HoT) Reasoning + to boost Foundation Modals + + +
+ Reasoning ability is one of the most crucial capabilities of a foundation +model, signifying its capacity to address complex reasoning tasks. +Chain-of-Thought (CoT) technique is widely regarded as one of the effective +methods for enhancing the reasoning ability of foundation models and has +garnered significant attention. However, the reasoning process of CoT is +linear, step-by-step, similar to personal logical reasoning, suitable for +solving general and slightly complicated problems. On the contrary, the +thinking pattern of an expert owns two prominent characteristics that cannot be +handled appropriately in CoT, i.e., high-order multi-hop reasoning and +multimodal comparative judgement. Therefore, the core motivation of this paper +is transcending CoT to construct a reasoning paradigm that can think like an +expert. The hyperedge of a hypergraph could connect various vertices, making it +naturally suitable for modelling high-order relationships. Inspired by this, +this paper innovatively proposes a multimodal Hypergraph-of-Thought (HoT) +reasoning paradigm, which enables the foundation models to possess the +expert-level ability of high-order multi-hop reasoning and multimodal +comparative judgement. Specifically, a textual hypergraph-of-thought is +constructed utilizing triple as the primary thought to model higher-order +relationships, and a hyperedge-of-thought is generated through multi-hop +walking paths to achieve multi-hop inference. Furthermore, we devise a visual +hypergraph-of-thought to interact with the textual hypergraph-of-thought via +Cross-modal Co-Attention Graph Learning for multimodal comparative +verification. Experimentations on the ScienceQA benchmark demonstrate the +proposed HoT-based T5 outperforms CoT-based GPT3.5 and chatGPT, which is on par +with CoT-based GPT4 with a lower model size. + +
+
+
+
+
+ + ☆ Weakly Supervised Text Classification on Free Text Comments in + Patient-Reported Outcome Measures + + +
+ Free text comments (FTC) in patient-reported outcome measures (PROMs) data +are typically analysed using manual methods, such as content analysis, which is +labour-intensive and time-consuming. Machine learning analysis methods are +largely unsupervised, necessitating post-analysis interpretation. Weakly +supervised text classification (WSTC) can be a valuable method of analysis to +classify domain-specific text data in which there is limited labelled data. In +this paper, we apply five WSTC techniques to FTC in PROMs data to identify +health-related quality of life (HRQoL) themes reported by colorectal cancer +patients. The WSTC methods label all the themes mentioned in the FTC. The +results showed moderate performance on the PROMs data, mainly due to the +precision of the models, and variation between themes. Evaluation of the +classification performance illustrated the potential and limitations of keyword +based WSTC to label PROMs FTC when labelled data is limited. + +
+
+ comment: Accepted and presented at Health Text Analytics conference 2023 (UK) +
+
+
+
+
+ + ☆ Assessing Guest Nationality Composition from Hotel Reviews + + +
+ Many hotels target guest acquisition efforts to specific markets in order to +best anticipate individual preferences and needs of their guests. Likewise, +such strategic positioning is a prerequisite for efficient marketing budget +allocation. Official statistics report on the number of visitors from different +countries, but no fine-grained information on the guest composition of +individual businesses exists. There is, however, growing interest in such data +from competitors, suppliers, researchers and the general public. We demonstrate +how machine learning can be leveraged to extract references to guest +nationalities from unstructured text reviews in order to dynamically assess and +monitor the dynamics of guest composition of individual businesses. In +particular, we show that a rather simple architecture of pre-trained embeddings +and stacked LSTM layers provides a better performance-runtime tradeoff than +more complex state-of-the-art language models. + +
+
+
+
+
+ + ☆ Task Conditioned BERT for Joint Intent Detection and Slot-filling + + +
+ Dialogue systems need to deal with the unpredictability of user intents to
+track dialogue state and the heterogeneity of slots to understand user
+preferences. In this paper we investigate the hypothesis that solving these
+challenges with one unified model will allow the transfer of parameter support
+data across the different tasks. The proposed principled model is based on a
+Transformer encoder, trained on multiple tasks, and leverages a rich input that
+conditions the model on the target inferences. Conditioning the Transformer
+encoder on multiple target inferences over the same corpus, i.e., intent and
+multiple slot types, allows learning richer language interactions than a
+single-task model would be able to. In fact, experimental results demonstrate
+that conditioning the model on an increasing number of dialogue inference tasks
+leads to improved results: on the MultiWOZ dataset, joint intent and slot
+detection can be improved by 3.2\% by conditioning on intent, 10.8\% by
+conditioning on slot and 14.4\% by conditioning on both intent and slots.
+Moreover, on real conversations with Farfetch customers, the proposed
+conditioned BERT can achieve high joint-goal and intent detection performance
+throughout a dialogue.
+
+
+
+
+
+ + ☆ Identification of the Relevance of Comments in Codes Using Bag of Words + and Transformer Based Models + + +
+ The Forum for Information Retrieval (FIRE) started a shared task this year
+for the classification of comments on different code segments. This is a
+binary text classification task where the objective is to identify whether
+comments given for certain code segments are relevant or not. The
+BioNLP-IISERB group at the Indian Institute of Science Education and Research
+Bhopal (IISERB) participated in this task and submitted five runs for five
+different models. This paper presents an overview of the models and other
+significant findings on the training corpus. The methods involve different
+feature engineering schemes and text classification techniques. The performance
+of the classical bag-of-words model and transformer-based models was explored
+to identify significant features from the given training corpus. We explored
+different classifiers, viz. random forest, support vector machine and logistic
+regression, using the bag-of-words model. Furthermore, pre-trained
+transformer-based models like BERT, RoBERTa and ALBERT were also used by
+fine-tuning them on the given training corpus. The performance of these models
+on the training corpus was reported, and the best five models were applied to
+the given test corpus. The empirical results show that the bag-of-words model
+outperforms the transformer-based models; however, the performance of our runs
+is not particularly strong on either the training or the test corpus. This
+paper also addresses the limitations of the models and the scope for further
+improvement.
+
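A minimal sketch of the bag-of-words baseline described above, using scikit-learn; the toy comments and labels are illustrative only, not the shared-task data.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Minimal bag-of-words baseline: TF-IDF features over the comment text fed into
# a linear classifier. The tiny toy corpus below is only for illustration.
comments = [
    "increment counter by one",          # relevant to its code segment
    "TODO remove this later",            # not relevant
    "computes the average of the list",  # relevant
    "asdf temporary",                    # not relevant
]
labels = [1, 0, 1, 0]

clf = make_pipeline(TfidfVectorizer(ngram_range=(1, 2)), LogisticRegression())
clf.fit(comments, labels)
print(clf.predict(["returns the sum of two numbers"]))
```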
+
+
+
+
+ + ☆ Improving Joint Speech-Text Representations Without Alignment + + +
+ The last year has seen astonishing progress in text-prompted image generation +premised on the idea of a cross-modal representation space in which the text +and image domains are represented jointly. In ASR, this idea has found +application as joint speech-text encoders that can scale to the capacities of +very large parameter models by being trained on both unpaired speech and text. +While these methods show promise, they have required special treatment of the +sequence-length mismatch inherent in speech and text, either by up-sampling +heuristics or an explicit alignment model. In this work, we offer evidence that +joint speech-text encoders naturally achieve consistent representations across +modalities by disregarding sequence length, and argue that consistency losses +could forgive length differences and simply assume the best alignment. We show +that such a loss improves downstream WER in both a large-parameter monolingual +and multilingual system. + +
+
+
+
+
+ + ☆ Lip2Vec: Efficient and Robust Visual Speech Recognition via + Latent-to-Latent Visual to Audio Representation Mapping + + +
+ Visual Speech Recognition (VSR) differs from the common perception tasks as +it requires deeper reasoning over the video sequence, even by human experts. +Despite the recent advances in VSR, current approaches rely on labeled data to +fully train or finetune their models predicting the target speech. This hinders +their ability to generalize well beyond the training set and leads to +performance degeneration under out-of-distribution challenging scenarios. +Unlike previous works that involve auxiliary losses or complex training +procedures and architectures, we propose a simple approach, named Lip2Vec that +is based on learning a prior model. Given a robust visual speech encoder, this +network maps the encoded latent representations of the lip sequence to their +corresponding latents from the audio pair, which are sufficiently invariant for +effective text decoding. The generated audio representation is then decoded to +text using an off-the-shelf Audio Speech Recognition (ASR) model. The proposed +model compares favorably with fully-supervised learning methods on the LRS3 +dataset achieving 26 WER. Unlike SoTA approaches, our model keeps a reasonable +performance on the VoxCeleb test set. We believe that reprogramming the VSR as +an ASR task narrows the performance gap between the two and paves the way for +more flexible formulations of lip reading. + +
+
+
+
+
+ + ☆ Improving Zero-Shot Text Matching for Financial Auditing with Large + Language Models + + +
+ Auditing financial documents is a very tedious and time-consuming process. As +of today, it can already be simplified by employing AI-based solutions to +recommend relevant text passages from a report for each legal requirement of +rigorous accounting standards. However, these methods need to be fine-tuned +regularly, and they require abundant annotated data, which is often lacking in +industrial environments. Hence, we present ZeroShotALI, a novel recommender +system that leverages a state-of-the-art large language model (LLM) in +conjunction with a domain-specifically optimized transformer-based +text-matching solution. We find that a two-step approach of first retrieving a +number of best matching document sections per legal requirement with a custom +BERT-based model and second filtering these selections using an LLM yields +significant performance improvements over existing approaches. + +
+
+ comment: 4 pages, 1 figure, 2 tables +
+
+
+
+
+ + ☆ Neural Conversation Models and How to Rein Them in: A Survey of Failures + and Fixes + + +
+ Recent conditional language models are able to continue any kind of text +source in an often seemingly fluent way. This fact encouraged research in the +area of open-domain conversational systems that are based on powerful language +models and aim to imitate an interlocutor by generating appropriate +contributions to a written dialogue. From a linguistic perspective, however, +the complexity of contributing to a conversation is high. In this survey, we +interpret Grice's maxims of cooperative conversation from the perspective of +this specific research area and systematize the literature under the aspect of +what makes a contribution appropriate: A neural conversation model has to be +fluent, informative, consistent, coherent, and follow social norms. In order to +ensure these qualities, recent approaches try to tame the underlying language +models at various intervention points, such as data, training regime or +decoding. Sorted by these categories and intervention points, we discuss +promising attempts and suggest novel ways for future research. + +
+
+ comment: Represents the state of the field in 2022; partially based on the + first authors 2022 PhD thesis +
+
+
+
+
+ + ☆ Fly-Swat or Cannon? Cost-Effective Language Model Choice via + Meta-Modeling + + +
+ Generative language models (LMs) have become omnipresent across data science. +For a wide variety of tasks, inputs can be phrased as natural language prompts +for an LM, from whose output the solution can then be extracted. LM performance +has consistently been increasing with model size - but so has the monetary cost +of querying the ever larger models. Importantly, however, not all inputs are +equally hard: some require larger LMs for obtaining a satisfactory solution, +whereas for others smaller LMs suffice. Based on this fact, we design a +framework for Cost-Effective Language Model Choice (CELMOC). Given a set of +inputs and a set of candidate LMs, CELMOC judiciously assigns each input to an +LM predicted to do well on the input according to a so-called meta-model, +aiming to achieve high overall performance at low cost. The cost-performance +trade-off can be flexibly tuned by the user. Options include, among others, +maximizing total expected performance (or the number of processed inputs) while +staying within a given cost budget, or minimizing total cost while processing +all inputs. We evaluate CELMOC on 14 datasets covering five natural language +tasks, using four candidate LMs of vastly different size and cost. With CELMOC, +we match the performance of the largest available LM while achieving a cost +reduction of 63%. Via our publicly available library, researchers as well as +practitioners can thus save large amounts of money without sacrificing +performance. + +
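A simplified sketch of the routing idea: a meta-model predicts how well each candidate LM would handle an input, and the cheapest LM whose predicted quality clears a bar is chosen. The candidate names, costs, and toy predictor below are illustrative placeholders, not the CELMOC implementation.

```python
# Simplified sketch of meta-model routing between LMs of different sizes and
# costs. Candidate names, costs, and the toy quality predictor are placeholders.
CANDIDATES = [                 # (name, cost per 1k tokens in arbitrary units)
    ("small-lm", 0.2),
    ("medium-lm", 1.0),
    ("large-lm", 6.0),
]

def route(text: str, predict_quality, min_quality: float = 0.8) -> str:
    for name, _cost in sorted(CANDIDATES, key=lambda c: c[1]):   # cheapest first
        if predict_quality(text, name) >= min_quality:
            return name
    return CANDIDATES[-1][0]   # fall back to the largest model

# Toy meta-model: pretend longer inputs are harder and need bigger models.
def toy_predictor(text, model_name):
    capacity = {"small-lm": 0.85, "medium-lm": 0.92, "large-lm": 0.97}[model_name]
    difficulty = min(len(text.split()) / 50, 1.0)
    return capacity * (1.0 - 0.3 * difficulty)

print(route("What is 2 + 2?", toy_predictor))        # short input -> small-lm
print(route("explain " * 60, toy_predictor))         # long input -> large-lm fallback
```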
+
+
+
+
+ + ☆ A Case Study on Context Encoding in Multi-Encoder based Document-Level + Neural Machine Translation + + +
+ Recent studies have shown that multi-encoder models are agnostic to the
+choice of context, and that the context encoder generates noise which helps
+improve the models in terms of BLEU score. In this paper, we further explore
+this idea by evaluating multi-encoder models on a context-aware pronoun
+translation test set, training them on three different context settings, viz.
+the previous two sentences, two random sentences, and a mix of both as context.
+Specifically, we evaluate the models on the ContraPro test set to study how
+different contexts affect pronoun translation accuracy. The results show that
+the model can perform well on the ContraPro test set even when the context is
+random. We also analyze the source representations to study whether the context
+encoder generates noise. Our analysis shows that the context encoder provides
+sufficient information to learn discourse-level information. Additionally, we
+observe that mixing the selected context (the previous two sentences in this
+case) with the random context is generally better than the other settings.
+
+
+ comment: Accepted to MT Summit 2023 (oral) +
+
+
+
+
+ + ☆ Learning to Guide Human Experts via Personalized Large Language Models + + +
+ In learning to defer, a predictor identifies risky decisions and defers them +to a human expert. One key issue with this setup is that the expert may end up +over-relying on the machine's decisions, due to anchoring bias. At the same +time, whenever the machine chooses the deferral option the expert has to take +decisions entirely unassisted. As a remedy, we propose learning to guide (LTG), +an alternative framework in which -- rather than suggesting ready-made +decisions -- the machine provides guidance useful to guide decision-making, and +the human is entirely responsible for coming up with a decision. We also +introduce SLOG, an LTG implementation that leverages (a small amount of) human +supervision to convert a generic large language model into a module capable of +generating textual guidance, and present preliminary but promising results on a +medical diagnosis task. + +
+
+
+
+
+ + ☆ Evidence of Human-Like Visual-Linguistic Integration in Multimodal Large + Language Models During Predictive Language Processing + + +
+ The advanced language processing abilities of large language models (LLMs) +have stimulated debate over their capacity to replicate human-like cognitive +processes. One differentiating factor between language processing in LLMs and +humans is that language input is often grounded in more than one perceptual +modality, whereas most LLMs process solely text-based information. Multimodal +grounding allows humans to integrate - e.g. visual context with linguistic +information and thereby place constraints on the space of upcoming words, +reducing cognitive load and improving perception and comprehension. Recent +multimodal LLMs (mLLMs) combine visual and linguistic embedding spaces with a +transformer type attention mechanism for next-word prediction. To what extent +does predictive language processing based on multimodal input align in mLLMs +and humans? To answer this question, 200 human participants watched short +audio-visual clips and estimated the predictability of an upcoming verb or +noun. The same clips were processed by the mLLM CLIP, with predictability +scores based on a comparison of image and text feature vectors. Eye-tracking +was used to estimate what visual features participants attended to, and CLIP's +visual attention weights were recorded. We find that human estimates of +predictability align significantly with CLIP scores, but not for a unimodal LLM +of comparable parameter size. Further, alignment vanished when CLIP's visual +attention weights were perturbed, and when the same input was fed to a +multimodal model without attention. Analysing attention patterns, we find a +significant spatial overlap between CLIP's visual attention weights and human +eye-tracking data. Results suggest that comparable processes of integrating +multimodal information, guided by attention to relevant visual features, +supports predictive language processing in mLLMs and humans. + +
+
+ comment: 13 pages, 4 figures, submitted to journal +
+
+
+
+
+ + ☆ Large Language Models in Cryptocurrency Securities Cases: Can ChatGPT + Replace Lawyers? + + +
+ Large Language Models (LLMs) could enhance access to the legal system. +However, empirical research on their effectiveness in conducting legal tasks is +scant. We study securities cases involving cryptocurrencies as one of numerous +contexts where AI could support the legal process, studying LLMs' legal +reasoning and drafting capabilities. We examine whether a) an LLM can +accurately determine which laws are potentially being violated from a fact +pattern, and b) whether there is a difference in juror decision-making based on +complaints written by a lawyer compared to an LLM. We feed fact patterns from +real-life cases to GPT-3.5 and evaluate its ability to determine correct +potential violations from the scenario and exclude spurious violations. Second, +we had mock jurors assess complaints written by the LLM and lawyers. GPT-3.5's +legal reasoning skills proved weak, though we expect improvement in future +models, particularly given the violations it suggested tended to be correct (it +merely missed additional, correct violations). GPT-3.5 performed better at +legal drafting, and jurors' decisions were not statistically significantly +associated with the author of the document upon which they based their +decisions. Because LLMs cannot satisfactorily conduct legal reasoning tasks, +they would be unable to replace lawyers at this stage. However, their drafting +skills (though, perhaps, still inferior to lawyers), could provide access to +justice for more individuals by reducing the cost of legal services. Our +research is the first to systematically study LLMs' legal drafting and +reasoning capabilities in litigation, as well as in securities law and +cryptocurrency-related misconduct. + +
+
+
+
+
+ + ☆ Optimizing transformer-based machine translation model for single GPU + training: a hyperparameter ablation study + + +
+ In machine translation tasks, the relationship between model complexity and +performance is often presumed to be linear, driving an increase in the number +of parameters and consequent demands for computational resources like multiple +GPUs. To explore this assumption, this study systematically investigates the +effects of hyperparameters through ablation on a sequence-to-sequence machine +translation pipeline, utilizing a single NVIDIA A100 GPU. Contrary to +expectations, our experiments reveal that combinations with the most parameters +were not necessarily the most effective. This unexpected insight prompted a +careful reduction in parameter sizes, uncovering "sweet spots" that enable +training sophisticated models on a single GPU without compromising translation +quality. The findings demonstrate an intricate relationship between +hyperparameter selection, model size, and computational resource needs. The +insights from this study contribute to the ongoing efforts to make machine +translation more accessible and cost-effective, emphasizing the importance of +precise hyperparameter tuning over mere scaling. + +
+
+ comment: 12 pages, 15 figures, 1 Table +
+
+
+
+
+ + ☆ Tweet Sentiment Extraction using Viterbi Algorithm with Transfer + Learning + + +
+ Tweet sentiment extraction identifies the most significant portion of a
+sentence, determining whether the sentiment is positive or negative. This
+research aims to identify the part of tweet sentences that conveys the emotion.
+To reach this objective, we continue improving the Viterbi algorithm previously
+modified by the author so that it can receive pre-trained model parameters. We
+introduce the confidence score and vector as two indicators responsible for
+evaluating the model internally before assessing the final results. We then
+present a method to fine-tune this nonparametric model. We find that the model
+becomes highly explainable, as the confidence score vector reveals precisely
+where the least confident predicted states are, and whether the approved
+modifications improve the confidence score or whether the tuning is going in
+the wrong direction.
+
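+ For intuition, the sketch below shows a standard log-space Viterbi decoder
+that also returns a per-step confidence vector, here defined as the normalised
+probability mass of the chosen state at each step. This margin-style definition
+is an illustrative assumption; the paper's exact confidence score may differ.
+
+# Hedged sketch: Viterbi decoding with a per-step confidence vector.
+import numpy as np
+
+def viterbi_with_confidence(log_start, log_trans, log_emit):
+    """log_start: (S,), log_trans: (S, S), log_emit: (T, S) log-probabilities."""
+    T, S = log_emit.shape
+    delta = np.zeros((T, S))
+    psi = np.zeros((T, S), dtype=int)
+    delta[0] = log_start + log_emit[0]
+    for t in range(1, T):
+        scores = delta[t - 1][:, None] + log_trans   # (prev state, current state)
+        psi[t] = scores.argmax(axis=0)
+        delta[t] = scores.max(axis=0) + log_emit[t]
+    path = np.zeros(T, dtype=int)
+    path[-1] = delta[-1].argmax()
+    for t in range(T - 2, -1, -1):
+        path[t] = psi[t + 1, path[t + 1]]
+    # Confidence: normalised score of the chosen state at each time step.
+    probs = np.exp(delta - delta.max(axis=1, keepdims=True))
+    probs /= probs.sum(axis=1, keepdims=True)
+    confidence = probs[np.arange(T), path]
+    return path, confidence, float(confidence.mean())
+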
+
+
+
+
+ + ☆ LittleMu: Deploying an Online Virtual Teaching Assistant via + Heterogeneous Sources Integration and Chain of Teach Prompts CIKM 23 + + +
+ Teaching assistants have played essential roles in the long history of
+education. However, few MOOC platforms provide human or virtual teaching
+assistants to support learning for massive online students, due to the
+complexity of real-world online education scenarios and the lack of training
+data. In this paper, we present LittleMu, a virtual MOOC teaching assistant
+that requires minimal labeled training data, to provide question answering and
+chit-chat services. Consisting of two interactive modules, heterogeneous
+retrieval and language model prompting, LittleMu first integrates structured,
+semi-structured and unstructured knowledge sources to support accurate answers
+for a wide range of questions. Then, we design carefully crafted demonstrations
+named "Chain of Teach" prompts to exploit the large-scale pre-trained model to
+handle complex uncollected questions. Beyond question answering, we develop
+other educational services such as knowledge-grounded chit-chat. We test the
+system's performance via both offline evaluation and online deployment. Since
+May 2020, our LittleMu system has served over 80,000 users with over 300,000
+queries from over 500 courses on the XuetangX MOOC platform, continuously
+contributing to more convenient and fair education. Our code, services, and
+dataset will be available at https://github.com/THU-KEG/VTA.
+
+
+ comment: 7 pages, 3 figures, Accepted by CIKM 23 +
+
+
+
+
+ + ☆ PIPPA: A Partially Synthetic Conversational Dataset + + +
+ With the emergence of increasingly powerful large language models, there is a +burgeoning interest in leveraging these models for casual conversation and +role-play applications. However, existing conversational and role-playing +datasets often fail to capture the diverse and nuanced interactions typically +exhibited by real-world role-play participants. To address this limitation and +contribute to the rapidly growing field, we introduce a partially-synthetic +dataset named PIPPA (Personal Interaction Pairs between People and AI). PIPPA +is a result of a community-driven crowdsourcing effort involving a group of +role-play enthusiasts. The dataset comprises over 1 million utterances that are +distributed across 26,000 conversation sessions and provides a rich resource +for researchers and AI developers to explore and refine conversational AI +systems in the context of role-play scenarios. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Dynamic Planning with a LLM + + +
+ While Large Language Models (LLMs) can solve many NLP tasks in zero-shot +settings, applications involving embodied agents remain problematic. In +particular, complex plans that require multi-step reasoning become difficult +and too costly as the context window grows. Planning requires understanding the +likely effects of one's actions and identifying whether the current environment +satisfies the goal state. While symbolic planners find optimal solutions +quickly, they require a complete and accurate representation of the planning +problem, severely limiting their use in practical scenarios. In contrast, +modern LLMs cope with noisy observations and high levels of uncertainty when +reasoning about a task. Our work presents LLM Dynamic Planner (LLM-DP): a +neuro-symbolic framework where an LLM works hand-in-hand with a traditional +planner to solve an embodied task. Given action-descriptions, LLM-DP solves +Alfworld faster and more efficiently than a naive LLM ReAct baseline. + +
+
+
+
+
+ + ☆ ZYN: Zero-Shot Reward Models with Yes-No Questions + + +
+ In this work, we address the problem of directing the text generations of an
+LLM towards a desired behavior, aligning the generated text with the
+preferences of the human operator. We propose using another language model as a
+zero-shot critic/reward model, prompted with a Yes-No question that represents
+the user's preferences, without requiring further labeled data. This zero-shot
+reward model provides the learning signal to further fine-tune the base LLM
+using reinforcement learning, as in RLAIF; yet our approach is also compatible
+with other contexts such as quality-diversity search. Extensive evidence of the
+capabilities of the proposed ZYN framework is provided through experiments in
+different domains related to text generation, including detoxification;
+optimizing the sentiment of movie reviews, or any other attribute; steering the
+opinion the model may have about a particular topic; and personalizing prompt
+generators for text-to-image tasks. Code to be released at
+\url{https://github.com/vicgalle/zero-shot-reward-models/}.
+
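+ The core idea (a reward derived from the probability of answering "Yes" to a
+preference question) can be sketched in a few lines. The critic model, question
+wording, and token handling below are illustrative assumptions, not the paper's
+exact setup.
+
+# Hedged sketch of a zero-shot Yes/No reward model in the spirit of ZYN.
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+critic_name = "gpt2"  # placeholder critic; a stronger LLM would be used in practice
+tok = AutoTokenizer.from_pretrained(critic_name)
+critic = AutoModelForCausalLM.from_pretrained(critic_name)
+
+def zyn_reward(text, question="Is the following text non-toxic?"):
+    prompt = f"{question}\n\nText: {text}\n\nAnswer (Yes or No):"
+    ids = tok(prompt, return_tensors="pt").input_ids
+    with torch.no_grad():
+        logits = critic(ids).logits[0, -1]            # next-token distribution
+    yes_id = tok(" Yes", add_special_tokens=False).input_ids[0]
+    no_id = tok(" No", add_special_tokens=False).input_ids[0]
+    yes_no = torch.softmax(logits[[yes_id, no_id]], dim=-1)
+    return yes_no[0].item()                           # reward in [0, 1]
+
+# The scalar reward can then drive RL fine-tuning of the base LLM (e.g. PPO).
+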
+
+
+
+
+ + ☆ Large Language Models and Knowledge Graphs: Opportunities and Challenges + + +
+ Large Language Models (LLMs) have taken Knowledge Representation -- and the +world -- by storm. This inflection point marks a shift from explicit knowledge +representation to a renewed focus on the hybrid representation of both explicit +knowledge and parametric knowledge. In this position paper, we will discuss +some of the common debate points within the community on LLMs (parametric +knowledge) and Knowledge Graphs (explicit knowledge) and speculate on +opportunities and visions that the renewed focus brings, as well as related +research topics and challenges. + +
+
+ comment: 30 pages +
+
+
+
+
+ + ☆ Large Language Models to Identify Social Determinants of Health in + Electronic Health Records + + +
+ Social determinants of health (SDoH) have an important impact on patient
+outcomes but are incompletely collected from the electronic health records
+(EHR). This study researched the ability of large language models to extract
+SDoH from free text in EHRs, where they are most commonly documented, and
+explored the role of synthetic clinical text for improving the extraction of
+these scarcely documented, yet extremely valuable, clinical data. 800 patient
+notes were annotated for SDoH categories, and several transformer-based models
+were evaluated. The study also experimented with synthetic data generation and
+assessed for algorithmic bias. Our best-performing models were fine-tuned
+Flan-T5 XL (macro-F1 0.71) for any SDoH, and Flan-T5 XXL (macro-F1 0.70). The
+benefit of augmenting fine-tuning with synthetic data varied across model
+architecture and size, with smaller Flan-T5 models (base and large) showing the
+greatest improvements in performance (delta F1 +0.12 to +0.23). Model
+performance was similar on the in-hospital system dataset but worse on the
+MIMIC-III dataset. Our best-performing fine-tuned models outperformed zero- and
+few-shot performance of ChatGPT-family models for both tasks. These fine-tuned
+models were less likely than ChatGPT to change their prediction when
+race/ethnicity and gender descriptors were added to the text, suggesting less
+algorithmic bias (p<0.05). At the patient level, our models identified 93.8% of
+patients with adverse SDoH, while ICD-10 codes captured 2.0%. Our method can
+effectively extract SDoH information from clinical notes, performing better
+compared to GPT zero- and few-shot settings. These models could enhance
+real-world evidence on SDoH and aid in identifying patients needing social
+support.
+
+
+ comment: 38 pages, 5 figures, 5 tables in main, submitted for review +
+
+
+
+
+ + ☆ Bilingual Streaming ASR with Grapheme units and Auxiliary Monolingual + Loss + + +
+ We introduce a bilingual solution to support English as secondary locale for +most primary locales in hybrid automatic speech recognition (ASR) settings. Our +key developments constitute: (a) pronunciation lexicon with grapheme units +instead of phone units, (b) a fully bilingual alignment model and subsequently +bilingual streaming transformer model, (c) a parallel encoder structure with +language identification (LID) loss, (d) parallel encoder with an auxiliary loss +for monolingual projections. We conclude that in comparison to LID loss, our +proposed auxiliary loss is superior in specializing the parallel encoders to +respective monolingual locales, and that contributes to stronger bilingual +learning. We evaluate our work on large-scale training and test tasks for +bilingual Spanish (ES) and bilingual Italian (IT) applications. Our bilingual +models demonstrate strong English code-mixing capability. In particular, the +bilingual IT model improves the word error rate (WER) for a code-mix IT task +from 46.5% to 13.8%, while also achieving a close parity (9.6%) with the +monolingual IT model (9.5%) over IT tests. + +
+
+
+
+
+ + ☆ Learning Deductive Reasoning from Synthetic Corpus based on Formal Logic + + +
+ We study a synthetic corpus-based approach for language models (LMs) to +acquire logical deductive reasoning ability. The previous studies generated +deduction examples using specific sets of deduction rules. However, these rules +were limited or otherwise arbitrary. This can limit the generalizability of +acquired deductive reasoning ability. We rethink this and adopt a well-grounded +set of deduction rules based on formal logic theory, which can derive any other +deduction rules when combined in a multistep way. We empirically verify that +LMs trained on the proposed corpora, which we name $\textbf{FLD}$ +($\textbf{F}$ormal $\textbf{L}$ogic $\textbf{D}$eduction), acquire more +generalizable deductive reasoning ability. Furthermore, we identify the aspects +of deductive reasoning ability on which deduction corpora can enhance LMs and +those on which they cannot. Finally, on the basis of these results, we discuss +the future directions for applying deduction corpora or other approaches for +each aspect. We release the code, data, and models. + +
+
+
+
+
+ + ♻ ☆ RT-1: Robotics Transformer for Real-World Control at Scale + + +
+ By transferring knowledge from large, diverse, task-agnostic datasets, modern +machine learning models can solve specific downstream tasks either zero-shot or +with small task-specific datasets to a high level of performance. While this +capability has been demonstrated in other fields such as computer vision, +natural language processing or speech recognition, it remains to be shown in +robotics, where the generalization capabilities of the models are particularly +critical due to the difficulty of collecting real-world robotic data. We argue +that one of the keys to the success of such general robotic models lies with +open-ended task-agnostic training, combined with high-capacity architectures +that can absorb all of the diverse, robotic data. In this paper, we present a +model class, dubbed Robotics Transformer, that exhibits promising scalable +model properties. We verify our conclusions in a study of different model +classes and their ability to generalize as a function of the data size, model +size, and data diversity based on a large-scale data collection on real robots +performing real-world tasks. The project's website and videos can be found at +robotics-transformer1.github.io + +
+
+ comment: See website at robotics-transformer1.github.io +
+
+
+
+
+ + ♻ ☆ ML-SUPERB: Multilingual Speech Universal PERformance Benchmark + + +
+ Speech processing Universal PERformance Benchmark (SUPERB) is a leaderboard +to benchmark the performance of Self-Supervised Learning (SSL) models on +various speech processing tasks. However, SUPERB largely considers English +speech in its evaluation. This paper presents multilingual SUPERB (ML-SUPERB), +covering 143 languages (ranging from high-resource to endangered), and +considering both automatic speech recognition and language identification. +Following the concept of SUPERB, ML-SUPERB utilizes frozen SSL features and +employs a simple framework for multilingual tasks by learning a shallow +downstream model. Similar to the SUPERB benchmark, we find speech SSL models +can significantly improve performance compared to FBANK features. Furthermore, +we find that multilingual models do not always perform better than their +monolingual counterparts. We will release ML-SUPERB as a challenge with +organized datasets and reproducible training scripts for future multilingual +representation research. + +
+
+ comment: Accepted by Interspeech +
+
+
+
+
+ + ♻ ☆ CLASSLA-Stanza: The Next Step for Linguistic Processing of South Slavic + Languages + + +
+ We present CLASSLA-Stanza, a pipeline for automatic linguistic annotation of +the South Slavic languages, which is based on the Stanza natural language +processing pipeline. We describe the main improvements in CLASSLA-Stanza with +respect to Stanza, and give a detailed description of the model training +process for the latest 2.1 release of the pipeline. We also report performance +scores produced by the pipeline for different languages and varieties. +CLASSLA-Stanza exhibits consistently high performance across all the supported +languages and outperforms or expands its parent pipeline Stanza at all the +supported tasks. We also present the pipeline's new functionality enabling +efficient processing of web data and the reasons that led to its +implementation. + +
+
+ comment: 17 pages, 14 tables, 1 figure; Typos corrected +
+
+
+
+
+ + ♻ ☆ Ontology Enrichment from Texts: A Biomedical Dataset for Concept + Discovery and Placement CIKM 2023 + + +
+ Mentions of new concepts appear regularly in texts and require automated
+approaches to harvest and place them into Knowledge Bases (KB), e.g.,
+ontologies and taxonomies. Existing datasets suffer from three issues: (i) they
+mostly assume that a new concept is pre-discovered and cannot support
+out-of-KB mention discovery; (ii) they only use the concept label as the input
+along with the KB and thus lack the contexts of a concept label; and (iii) they
+mostly focus on concept placement w.r.t. a taxonomy of atomic concepts, instead
+of complex concepts, i.e., those with logical operators. To address these
+issues, we propose a new benchmark, adapting the MedMentions dataset (PubMed
+abstracts) with SNOMED CT versions in 2014 and 2017 under the Diseases
+sub-category and the broader categories of Clinical finding, Procedure, and
+Pharmaceutical / biologic product. We show how the dataset can be used to
+evaluate out-of-KB mention discovery and concept placement, adapting recent
+Large Language Model based methods.
+
+
+ comment: 5 pages, 1 figure, accepted for CIKM 2023. The dataset, data + construction scripts, and baseline implementation are available at + https://zenodo.org/record/8228005 (Zenodo) and + https://github.com/KRR-Oxford/OET (GitHub) +
+
+
+
+
+ + ♻ ☆ Reveal the Unknown: Out-of-Knowledge-Base Mention Discovery with Entity + Linking CIKM 2023 + + +
+ Discovering entity mentions that are out of a Knowledge Base (KB) from texts +plays a critical role in KB maintenance, but has not yet been fully explored. +The current methods are mostly limited to the simple threshold-based approach +and feature-based classification, and the datasets for evaluation are +relatively rare. We propose BLINKout, a new BERT-based Entity Linking (EL) +method which can identify mentions that do not have corresponding KB entities +by matching them to a special NIL entity. To better utilize BERT, we propose +new techniques including NIL entity representation and classification, with +synonym enhancement. We also apply KB Pruning and Versioning strategies to +automatically construct out-of-KB datasets from common in-KB EL datasets. +Results on five datasets of clinical notes, biomedical publications, and +Wikipedia articles in various domains show the advantages of BLINKout over +existing methods to identify out-of-KB mentions for the medical ontologies, +UMLS, SNOMED CT, and the general KB, WikiData. + +
+
+ comment: 11 pages, 3 figures, accepted for CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Cross-modal Contrastive Learning for Multimodal Fake News Detection + + +
+ Automatic detection of multimodal fake news has gained widespread attention
+recently. Many existing approaches seek to fuse unimodal features to produce
+multimodal news representations. However, the potential of powerful cross-modal
+contrastive learning methods for fake news detection has not been well
+exploited. Besides, how to aggregate features from different modalities to
+boost the performance of the decision-making process is still an open question.
+To address this, we propose COOLANT, a cross-modal contrastive learning
+framework for multimodal fake news detection, aiming to achieve more accurate
+image-text alignment. To further improve the alignment precision, we leverage
+an auxiliary task to soften the loss term of negative samples during the
+contrast process. A cross-modal fusion module is developed to learn the
+cross-modality correlations. An attention mechanism with an attention guidance
+module is implemented to help effectively and interpretably aggregate the
+aligned unimodal representations and the cross-modality correlations. Finally,
+we evaluate COOLANT and conduct a comparative study on two widely used
+datasets, Twitter and Weibo. The experimental results demonstrate that COOLANT
+outperforms previous approaches by a large margin and achieves new
+state-of-the-art results on the two datasets.
+
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ 3D-EX : A Unified Dataset of Definitions and Dictionary Examples + + +
+ Definitions are a fundamental building block in lexicography, linguistics and
+computational semantics. In NLP, they have been used for retrofitting word
+embeddings or augmenting contextual representations in language models.
+However, lexical resources containing definitions exhibit a wide range of
+properties, which has implications in the behaviour of models trained and
+evaluated on them. In this paper, we introduce 3D-EX, a dataset that aims to
+fill this gap by combining well-known English resources into one centralized
+knowledge repository in the form of triples. 3D-EX is a unified evaluation
+framework with carefully pre-computed train/validation/test splits to prevent
+memorization. We report experimental results that suggest that this dataset
+could be effectively leveraged in downstream NLP tasks. Code and data are
+available at https://github.com/F-Almeman/3D-EX.
+
+
+ comment: 11 pages (including references pages), 9 tables, and 1 figure. This + paper is submitted to RANLP2023 +
+
+
+
+
+ + ♻ ☆ Transformers are Short Text Classifiers: A Study of Inductive Short Text + Classifiers on Benchmarks and Real-world Datasets + + +
+ Short text classification is a crucial and challenging aspect of Natural +Language Processing. For this reason, there are numerous highly specialized +short text classifiers. However, in recent short text research, State of the +Art (SOTA) methods for traditional text classification, particularly the pure +use of Transformers, have been unexploited. In this work, we examine the +performance of a variety of short text classifiers as well as the top +performing traditional text classifier. We further investigate the effects on +two new real-world short text datasets in an effort to address the issue of +becoming overly dependent on benchmark datasets with a limited number of +characteristics. Our experiments unambiguously demonstrate that Transformers +achieve SOTA accuracy on short text classification tasks, raising the question +of whether specialized short text techniques are necessary. + +
+
+ comment: Accepted at CD-MAKE 2023 +
+
+
+
+
+ + ♻ ☆ Constraining Linear-chain CRFs to Regular Languages + + +
+ A major challenge in structured prediction is to represent the +interdependencies within output structures. When outputs are structured as +sequences, linear-chain conditional random fields (CRFs) are a widely used +model class which can learn \textit{local} dependencies in the output. However, +the CRF's Markov assumption makes it impossible for CRFs to represent +distributions with \textit{nonlocal} dependencies, and standard CRFs are unable +to respect nonlocal constraints of the data (such as global arity constraints +on output labels). We present a generalization of CRFs that can enforce a broad +class of constraints, including nonlocal ones, by specifying the space of +possible output structures as a regular language $\mathcal{L}$. The resulting +regular-constrained CRF (RegCCRF) has the same formal properties as a standard +CRF, but assigns zero probability to all label sequences not in $\mathcal{L}$. +Notably, RegCCRFs can incorporate their constraints during training, while +related models only enforce constraints during decoding. We prove that +constrained training is never worse than constrained decoding, and show +empirically that it can be substantially better in practice. Additionally, we +demonstrate a practical benefit on downstream tasks by incorporating a RegCCRF +into a deep neural model for semantic role labeling, exceeding state-of-the-art +results on a standard dataset. + +
+
+
+
+
+ + ♻ ☆ Personalised Language Modelling of Screen Characters Using Rich Metadata + Annotations + + +
+ Language models that are sensitive to external context can more effectively +capture the speaking patterns of individuals with specific characteristics or +in particular environments. However, obtaining and leveraging such annotations +can be challenging. In this work, we show how to leverage rich character and +film annotations to personalise language models in a scalable manner. Our best +model can reduce perplexity by up to 6.5% compared to a parameter-matched +language model. Our approach performs on par with speaker-specific fine-tuning +when the fine-tuning data (i.e. past dialogue) for individual speakers is +available. On top of that, it also generalises well to a scenario with no such +data, relying on combinations of demographic characteristics expressed via +metadata. Our findings are consistent across two corpora, one of which is also +a contribution of this paper: Cornell-rich contains rich manual annotations for +863 speaking characters from the Cornell Movie Dialog Corpus, including +features such as characteristic quotes and character descriptions, along with +six automatically extracted metadata features for over 95% of the featured +films. Finally, we also present a cost-benefit analysis highlighting which +annotations are most cost-effective in reducing perplexity. + +
+
+ comment: 9 pages; 4 figures; 6 tables. Preprint +
+
+
+
+
+ + ♻ ☆ Verifying the Robustness of Automatic Credibility Assessment + + +
+ Text classification methods have been widely investigated as a way to detect
+content of low credibility: fake news, social media bots, propaganda, etc.
+Quite accurate models (likely based on deep neural networks) help in moderating
+public electronic platforms and often cause content creators to face rejection
+of their submissions or removal of already published texts. Having the
+incentive to evade further detection, content creators try to come up with a
+slightly modified version of the text (known as an attack with an adversarial
+example) that exploits the weaknesses of classifiers and results in a different
+output. Here we systematically test the robustness of popular text classifiers
+against available attacking techniques and discover that, indeed, in some cases
+insignificant changes in input text can mislead the models. We also introduce
+BODEGA: a benchmark for testing both victim models and attack methods on four
+misinformation detection tasks in an evaluation framework designed to simulate
+real use-cases of content moderation. Finally, we manually analyse a subset of
+adversarial examples and check what kinds of modifications are used in
+successful attacks. The BODEGA code and data are openly shared in the hope of
+enhancing the comparability and replicability of further research in this area.
+
+
+
+
+
+ + ♻ ☆ Towards Automatic Boundary Detection for Human-AI Collaborative Hybrid + Essay in Education + + +
+ The recent large language models (LLMs), e.g., ChatGPT, have been able to +generate human-like and fluent responses when provided with specific +instructions. While admitting the convenience brought by technological +advancement, educators also have concerns that students might leverage LLMs to +complete their writing assignments and pass them off as their original work. +Although many AI content detection studies have been conducted as a result of +such concerns, most of these prior studies modeled AI content detection as a +classification problem, assuming that a text is either entirely human-written +or entirely AI-generated. In this study, we investigated AI content detection +in a rarely explored yet realistic setting where the text to be detected is +collaboratively written by human and generative LLMs (i.e., hybrid text). We +first formalized the detection task as identifying the transition points +between human-written content and AI-generated content from a given hybrid text +(boundary detection). Then we proposed a two-step approach where we (1) +separated AI-generated content from human-written content during the encoder +training process; and (2) calculated the distances between every two adjacent +prototypes and assumed that the boundaries exist between the two adjacent +prototypes that have the furthest distance from each other. Through extensive +experiments, we observed the following main findings: (1) the proposed approach +consistently outperformed the baseline methods across different experiment +settings; (2) the encoder training process can significantly boost the +performance of the proposed approach; (3) when detecting boundaries for +single-boundary hybrid essays, the proposed approach could be enhanced by +adopting a relatively large prototype size, leading to a 22% improvement in the +In-Domain evaluation and an 18% improvement in the Out-of-Domain evaluation. + +
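+ As a concrete illustration of the second step described above, the sketch
+below places the boundary between the two adjacent prototype embeddings that
+are furthest apart. The encoder producing the prototypes and the Euclidean
+distance metric are illustrative assumptions.
+
+# Hedged sketch: boundary detection from an ordered sequence of prototypes.
+import numpy as np
+
+def predict_boundary(prototypes):
+    """prototypes: (n_segments, dim) array in document order.
+    Returns index i such that the boundary lies between segment i and i + 1."""
+    diffs = prototypes[1:] - prototypes[:-1]
+    dists = np.linalg.norm(diffs, axis=1)   # distance of each adjacent pair
+    return int(dists.argmax())
+
+# Hypothetical usage with four 768-dimensional segment embeddings:
+# boundary = predict_boundary(np.random.rand(4, 768))
+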
+
+ comment: 9 pages including references, 2 figures +
+
+
+
+
+ + ♻ ☆ Efficient Domain Adaptation of Sentence Embeddings Using Adapters + + +
+ Sentence embeddings enable us to capture the semantic similarity of short +texts. Most sentence embedding models are trained for general semantic textual +similarity (STS) tasks. Therefore, to use sentence embeddings in a particular +domain, the model must be adapted to it in order to achieve good results. +Usually, this is done by fine-tuning the entire sentence embedding model for +the domain of interest. While this approach yields state-of-the-art results, +all of the model's weights are updated during fine-tuning, making this method +resource-intensive. Therefore, instead of fine-tuning entire sentence embedding +models for each target domain individually, we propose to train lightweight +adapters. These domain-specific adapters do not require fine-tuning all +underlying sentence embedding model parameters. Instead, we only train a small +number of additional parameters while keeping the weights of the underlying +sentence embedding model fixed. Training domain-specific adapters allows always +using the same base model and only exchanging the domain-specific adapters to +adapt sentence embeddings to a specific domain. We show that using adapters for +parameter-efficient domain adaptation of sentence embeddings yields competitive +performance within 1% of a domain-adapted, entirely fine-tuned sentence +embedding model while only training approximately 3.6% of the parameters. + +
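+ A minimal sketch of the idea, assuming a frozen base sentence encoder and a
+single bottleneck adapter trained on top of its pooled output; the hidden and
+bottleneck dimensions and the adapter placement are illustrative assumptions,
+not the paper's exact configuration.
+
+# Hedged sketch: parameter-efficient domain adaptation with a bottleneck adapter.
+import torch
+import torch.nn as nn
+
+class Adapter(nn.Module):
+    def __init__(self, hidden_dim=768, bottleneck=48):
+        super().__init__()
+        self.down = nn.Linear(hidden_dim, bottleneck)
+        self.up = nn.Linear(bottleneck, hidden_dim)
+        self.act = nn.GELU()
+
+    def forward(self, x):
+        return x + self.up(self.act(self.down(x)))    # residual connection
+
+class AdaptedEncoder(nn.Module):
+    def __init__(self, base_encoder, hidden_dim=768):
+        super().__init__()
+        self.base = base_encoder
+        for p in self.base.parameters():
+            p.requires_grad = False                    # freeze the base model
+        self.adapter = Adapter(hidden_dim)
+
+    def forward(self, **inputs):
+        emb = self.base(**inputs)                      # assumed to return sentence embeddings
+        return self.adapter(emb)
+
+# Only the adapter's parameters (a few percent of the total) are updated during
+# domain-specific training; the same base model can be reused across domains by
+# swapping adapters.
+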
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ♻ ☆ LLM As DBA + + +
+ Database administrators (DBAs) play a crucial role in managing, maintaining
+and optimizing a database system to ensure data availability, performance, and
+reliability. However, it is hard and tedious for DBAs to manage a large number
+of database instances (e.g., millions of instances on cloud databases).
+Recently, large language models (LLMs) have shown great potential to understand
+valuable documents and accordingly generate reasonable answers. Thus, we
+propose D-Bot, an LLM-based database administrator that can continuously
+acquire database maintenance experience from textual sources, and provide
+reasonable, well-founded, in-time diagnosis and optimization advice for target
+databases. This paper presents a revolutionary LLM-centric framework for
+database maintenance, including (i) database maintenance knowledge detection
+from documents and tools, (ii) tree-of-thought reasoning for root cause
+analysis, and (iii) collaborative diagnosis among multiple LLMs. Our
+preliminary experimental results show that D-Bot can efficiently and
+effectively diagnose the root causes, and our code is available at
+github.com/TsinghuaDatabaseGroup/DB-GPT.
+
+
+
+
+
+ + ♻ ☆ Evaluating the Generation Capabilities of Large Chinese Language Models + + +
+ This paper presents CG-Eval, the first comprehensive evaluation of the +generation capabilities of large Chinese language models across a wide range of +academic disciplines. The models' performance was assessed based on their +ability to generate accurate and relevant responses to different types of +questions in six disciplines, namely, Science and Engineering, Humanities and +Social Sciences, Mathematical Calculations, Medical Practitioner Qualification +Examination, Judicial Examination, and Certified Public Accountant Examination. +This paper also presents Gscore, a composite index derived from the weighted +sum of multiple metrics to measure the quality of model's generation against a +reference. The test data and test results can be found at +http://cgeval.besteasy.com/. + +
+
+
+
+
+ + ♻ ☆ Kuaipedia: a Large-scale Multi-modal Short-video Encyclopedia + + +
+ Online encyclopedias, such as Wikipedia, have been well-developed and
+researched in the last two decades. One can find any attributes or other
+information of a wiki item on a wiki page edited by a community of volunteers.
+However, traditional text, images and tables can hardly express some aspects of
+a wiki item. For example, when we talk about ``Shiba Inu'', one may care more
+about ``How to feed it'' or ``How to train it not to protect its food''.
+Currently, short-video platforms have become a hallmark in the online world.
+Whether you're on TikTok, Instagram, Kuaishou, or YouTube Shorts, short-video
+apps have changed how we consume and create content today. Beyond producing
+short videos for entertainment, more and more authors share insightful
+knowledge widely across all walks of life. These short videos, which we call
+knowledge videos, can easily express any aspects (e.g. hair or how-to-feed)
+consumers want to know about an item (e.g. Shiba Inu), and they can be
+systematically analyzed and organized like an online encyclopedia. In this
+paper, we propose Kuaipedia, a large-scale multi-modal encyclopedia consisting
+of items, aspects, and short videos linked to them, which was extracted from
+billions of videos of Kuaishou (Kwai), a well-known short-video platform in
+China. We first collected items from multiple sources and mined user-centered
+aspects from millions of users' queries to build an item-aspect tree. Then we
+propose a new task called ``multi-modal item-aspect linking'' as an expansion
+of ``entity linking'' to link short videos into item-aspect pairs and build the
+whole short-video encyclopedia. Intrinsic evaluations show that our
+encyclopedia is of large scale and highly accurate. We also conduct extensive
+extrinsic experiments to show how Kuaipedia can help fundamental applications
+such as entity typing and entity linking.
+
+
+
+
+
+ + ♻ ☆ Exploring Machine Learning and Transformer-based Approaches for + Deceptive Text Classification: A Comparative Analysis + + +
+ Deceptive text classification is a critical task in natural language
+processing that aims to identify deceptive or fraudulent content. This study
+presents a comparative analysis of machine learning and transformer-based
+approaches for deceptive text classification. We investigate the effectiveness
+of traditional machine learning algorithms and state-of-the-art transformer
+models, such as BERT, XLNET, DistilBERT, and RoBERTa, in detecting deceptive
+text. A labeled dataset consisting of deceptive and non-deceptive texts is used
+for training and evaluation purposes. Through extensive experimentation, we
+compare the performance metrics, including accuracy, precision, recall, and F1
+score, of the different approaches. The results of this study shed light on the
+strengths and limitations of machine learning and transformer-based methods for
+deceptive text classification, enabling researchers and practitioners to make
+informed decisions when dealing with deceptive content.
+
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Towards Generalist Foundation Model for Radiology + + +
+ In this study, we aim to initiate the development of the Radiology Foundation
+Model, termed RadFM. We consider the construction of foundational models from
+the perspectives of data, model design, and evaluation thoroughly. Our
+contributions can be summarized as follows: (i) we construct a large-scale
+Medical Multi-modal Dataset, MedMD, consisting of 16M 2D and 3D medical scans.
+To the best of our knowledge, this is the first multi-modal dataset containing
+3D medical scans. (ii) We propose an architecture that enables visually
+conditioned generative pre-training, allowing for the integration of text input
+interleaved with 2D or 3D medical scans to generate responses for diverse
+radiologic tasks. The model was initially pre-trained on MedMD and subsequently
+fine-tuned on the domain-specific RadMD, a cleaned radiologic subset of MedMD
+containing 3M radiologic visual-language pairs. (iii) We propose a new
+evaluation benchmark that comprises five tasks, aiming to comprehensively
+assess the capability of foundation models in handling practical clinical
+problems. Our experimental results confirm that RadFM significantly outperforms
+existing multi-modal foundation models. The codes, data, and model checkpoint
+will all be made publicly available to promote further research and development
+in the field.
+
+
+
+
+
+ + ♻ ☆ Trained Transformers Learn Linear Models In-Context + + +
+ Attention-based neural networks such as transformers have demonstrated a +remarkable ability to exhibit in-context learning (ICL): Given a short prompt +sequence of tokens from an unseen task, they can formulate relevant per-token +and next-token predictions without any parameter updates. By embedding a +sequence of labeled training data and unlabeled test data as a prompt, this +allows for transformers to behave like supervised learning algorithms. Indeed, +recent work has shown that when training transformer architectures over random +instances of linear regression problems, these models' predictions mimic those +of ordinary least squares. + Towards understanding the mechanisms underlying this phenomenon, we +investigate the dynamics of ICL in transformers with a single linear +self-attention layer trained by gradient flow on linear regression tasks. We +show that despite non-convexity, gradient flow with a suitable random +initialization finds a global minimum of the objective function. At this global +minimum, when given a test prompt of labeled examples from a new prediction +task, the transformer achieves prediction error competitive with the best +linear predictor over the test prompt distribution. We additionally +characterize the robustness of the trained transformer to a variety of +distribution shifts and show that although a number of shifts are tolerated, +shifts in the covariate distribution of the prompts are not. Motivated by this, +we consider a generalized ICL setting where the covariate distributions can +vary across prompts. We show that although gradient flow succeeds at finding a +global minimum in this setting, the trained transformer is still brittle under +mild covariate shifts. We complement this finding with experiments on large, +nonlinear transformer architectures which we show are more robust under +covariate shifts. + +
+
+ comment: 50 pages, experiments added, reference added, typo corrected +
+
+
+
+
+ + ♻ ☆ What Can Transformers Learn In-Context? A Case Study of Simple Function + Classes + + +
+ In-context learning refers to the ability of a model to condition on a prompt +sequence consisting of in-context examples (input-output pairs corresponding to +some task) along with a new query input, and generate the corresponding output. +Crucially, in-context learning happens only at inference time without any +parameter updates to the model. While large language models such as GPT-3 +exhibit some ability to perform in-context learning, it is unclear what the +relationship is between tasks on which this succeeds and what is present in the +training data. To make progress towards understanding in-context learning, we +consider the well-defined problem of training a model to in-context learn a +function class (e.g., linear functions): that is, given data derived from some +functions in the class, can we train a model to in-context learn "most" +functions from this class? We show empirically that standard Transformers can +be trained from scratch to perform in-context learning of linear functions -- +that is, the trained model is able to learn unseen linear functions from +in-context examples with performance comparable to the optimal least squares +estimator. In fact, in-context learning is possible even under two forms of +distribution shift: (i) between the training data of the model and +inference-time prompts, and (ii) between the in-context examples and the query +input during inference. We also show that we can train Transformers to +in-context learn more complex function classes -- namely sparse linear +functions, two-layer neural networks, and decision trees -- with performance +that matches or exceeds task-specific learning algorithms. Our code and models +are available at https://github.com/dtsip/in-context-learning . + +
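+ The linear-regression ICL setup above is easy to reproduce in a few lines:
+sample a random linear function, build a prompt of (x, y) pairs plus a query,
+and compare against the optimal least-squares estimator. The trained
+transformer itself (here simply assumed to exist) would consume the serialised
+prompt; only the data generation and the baseline are sketched.
+
+# Hedged sketch: in-context learning data for linear functions plus OLS baseline.
+import numpy as np
+
+def sample_icl_prompt(dim=20, n_examples=40, seed=0):
+    rng = np.random.default_rng(seed)
+    w = rng.standard_normal(dim)                     # unseen linear function
+    X = rng.standard_normal((n_examples + 1, dim))   # last row is the query
+    y = X @ w
+    return X[:-1], y[:-1], X[-1], y[-1]
+
+def least_squares_prediction(X, y, x_query):
+    w_hat, *_ = np.linalg.lstsq(X, y, rcond=None)    # optimal baseline
+    return float(x_query @ w_hat)
+
+X, y, x_q, y_true = sample_icl_prompt()
+print("OLS prediction:", least_squares_prediction(X, y, x_q), "target:", y_true)
+# A trained in-context learner receives (X, y, x_q) serialised as a prompt and,
+# per the paper, approaches the OLS prediction error.
+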
+
+
+
+
+ + ♻ ☆ Single-Sentence Reader: A Novel Approach for Addressing Answer Position + Bias + + +
+ Machine Reading Comprehension (MRC) models tend to take advantage of spurious +correlations (also known as dataset bias or annotation artifacts in the +research community). Consequently, these models may perform the MRC task +without fully comprehending the given context and question, which is +undesirable since it may result in low robustness against distribution shift. +This paper delves into the concept of answer-position bias, where a significant +percentage of training questions have answers located solely in the first +sentence of the context. We propose a Single-Sentence Reader as a new approach +for addressing answer position bias in MRC. We implement this approach using +six different models and thoroughly analyze their performance. Remarkably, our +proposed Single-Sentence Readers achieve results that nearly match those of +models trained on conventional training sets, proving their effectiveness. Our +study also discusses several challenges our Single-Sentence Readers encounter +and proposes a potential solution. + +
+
+ comment: 11 pages, 5 tables, 2 figures +
+
+
+
+
+ + ♻ ☆ Structured Chain-of-Thought Prompting for Code Generation + + +
+ Large Language Models (LLMs) (e.g., ChatGPT) have shown impressive
+performance in code generation. LLMs take prompts as inputs, and
+Chain-of-Thought (CoT) prompting is the state-of-the-art prompting technique.
+CoT prompting asks LLMs first to generate CoTs (i.e., intermediate natural
+language reasoning steps) and then output the code. However, CoT prompting is
+designed for natural language generation and has low accuracy in code
+generation.
+ In this paper, we propose Structured CoTs (SCoTs) and present a novel
+prompting technique for code generation, named SCoT prompting. Our motivation
+is that source code contains rich structural information and any code can be
+composed of three program structures (i.e., sequence, branch, and loop
+structures). Intuitively, structured intermediate reasoning steps make for
+structured source code. Thus, we ask LLMs to use program structures to build
+CoTs, obtaining SCoTs. Then, LLMs generate the final code based on SCoTs.
+Compared to CoT prompting, SCoT prompting explicitly constrains LLMs to think
+about how to solve requirements from the view of source code and further
+improves the performance of LLMs in code generation. We apply SCoT prompting to
+two LLMs (i.e., ChatGPT and Codex) and evaluate it on three benchmarks (i.e.,
+HumanEval, MBPP, and MBCPP). (1) SCoT prompting outperforms the
+state-of-the-art baseline (CoT prompting) by up to 13.79% in Pass@1. (2) Human
+evaluation shows human developers prefer programs from SCoT prompting. (3) SCoT
+prompting is robust to examples and achieves substantial improvements.
+
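+ A minimal sketch of what an SCoT-style prompt could look like: the model is
+first asked to lay out its plan using only sequence, branch, and loop
+structures, and only then to emit code. The exact wording and demonstration
+format used in the paper are assumptions here.
+
+# Hedged sketch: building an SCoT-style two-step prompt for a code LLM.
+def build_scot_prompt(requirement):
+    return (
+        "You will solve a programming task in two steps.\n"
+        "Step 1: Write a structured chain of thought using only three program\n"
+        "structures: sequence, branch (if/else), and loop (for/while).\n"
+        "Step 2: Write the final code that follows your structured plan.\n\n"
+        f"Task: {requirement}\n\n"
+        "Structured chain of thought:\n"
+    )
+
+print(build_scot_prompt("Return the indices of two numbers that add up to a target."))
+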
+
+ comment: arXiv admin note: text overlap with arXiv:2303.17780 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 91 + +
+
+
+ + ☆ FunnyBirds: A Synthetic Vision Dataset for a Part-Based Analysis of + Explainable AI Methods ICCV 2023 + + +
+ The field of explainable artificial intelligence (XAI) aims to uncover the +inner workings of complex deep neural models. While being crucial for +safety-critical domains, XAI inherently lacks ground-truth explanations, making +its automatic evaluation an unsolved problem. We address this challenge by +proposing a novel synthetic vision dataset, named FunnyBirds, and accompanying +automatic evaluation protocols. Our dataset allows performing semantically +meaningful image interventions, e.g., removing individual object parts, which +has three important implications. First, it enables analyzing explanations on a +part level, which is closer to human comprehension than existing methods that +evaluate on a pixel level. Second, by comparing the model output for inputs +with removed parts, we can estimate ground-truth part importances that should +be reflected in the explanations. Third, by mapping individual explanations +into a common space of part importances, we can analyze a variety of different +explanation types in a single common framework. Using our tools, we report +results for 24 different combinations of neural models and XAI methods, +demonstrating the strengths and weaknesses of the assessed methods in a fully +automatic and systematic manner. + +
+
+ comment: Accepted at ICCV 2023. Code: https://github.com/visinf/funnybirds +
+
+
+
+
+ + ☆ Continual Face Forgery Detection via Historical Distribution Preserving + + +
+ Face forgery techniques have advanced rapidly and pose serious security +threats. Existing face forgery detection methods try to learn generalizable +features, but they still fall short of practical application. Additionally, +finetuning these methods on historical training data is resource-intensive in +terms of time and storage. In this paper, we focus on a novel and challenging +problem: Continual Face Forgery Detection (CFFD), which aims to efficiently +learn from new forgery attacks without forgetting previous ones. Specifically, +we propose a Historical Distribution Preserving (HDP) framework that reserves +and preserves the distributions of historical faces. To achieve this, we use +universal adversarial perturbation (UAP) to simulate historical forgery +distribution, and knowledge distillation to maintain the distribution variation +of real faces across different models. We also construct a new benchmark for +CFFD with three evaluation protocols. Our extensive experiments on the +benchmarks show that our method outperforms the state-of-the-art competitors. + +
+
+
+
+
+ + ☆ Exploring Predicate Visual Context in Detecting of Human-Object + Interactions ICCV2023 + + +
+ Recently, the DETR framework has emerged as the dominant approach for +human--object interaction (HOI) research. In particular, two-stage +transformer-based HOI detectors are amongst the most performant and +training-efficient approaches. However, these often condition HOI +classification on object features that lack fine-grained contextual +information, eschewing pose and orientation information in favour of visual +cues about object identity and box extremities. This naturally hinders the +recognition of complex or ambiguous interactions. In this work, we study these +issues through visualisations and carefully designed experiments. Accordingly, +we investigate how best to re-introduce image features via cross-attention. +With an improved query design, extensive exploration of keys and values, and +box pair positional embeddings as spatial guidance, our model with enhanced +predicate visual context (PViC) outperforms state-of-the-art methods on the +HICO-DET and V-COCO benchmarks, while maintaining low training cost. + +
+
+ comment: To appear in ICCV2023 +
+
+
+
+
+ + ☆ DIG In: Evaluating Disparities in Image Generations with Indicators for + Geographic Diversity + + +
+ The unprecedented photorealistic results achieved by recent text-to-image +generative systems and their increasing use as plug-and-play content creation +solutions make it crucial to understand their potential biases. In this work, +we introduce three indicators to evaluate the realism, diversity and +prompt-generation consistency of text-to-image generative systems when prompted +to generate objects from across the world. Our indicators complement +qualitative analysis of the broader impact of such systems by enabling +automatic and efficient benchmarking of geographic disparities, an important +step towards building responsible visual content creation systems. We use our +proposed indicators to analyze potential geographic biases in state-of-the-art +visual content creation systems and find that: (1) models have less realism and +diversity of generations when prompting for Africa and West Asia than Europe, +(2) prompting with geographic information comes at a cost to prompt-consistency +and diversity of generated images, and (3) models exhibit more region-level +disparities for some objects than others. Perhaps most interestingly, our +indicators suggest that progress in image generation quality has come at the +cost of real-world geographic representation. Our comprehensive evaluation +constitutes a crucial step towards ensuring a positive experience of visual +content creation for everyone. + +
+
+
+
+
+ + ☆ Complex Facial Expression Recognition Using Deep Knowledge Distillation + of Basic Features + + +
+ Complex emotion recognition is a cognitive task that has so far eluded the
+excellent performance achieved on other tasks that are at or above the level of
+human cognition. Emotion recognition through facial expressions is particularly
+difficult due to the complexity of emotions expressed by the human face. For a
+machine to approach the same level of performance in this domain as a human, it
+may need to synthesise knowledge and understand new concepts in real-time as
+humans do. Humans are able to learn new concepts using only a few examples, by
+distilling the important information from memories and discarding the rest.
+Similarly, continual learning methods learn new classes whilst retaining the
+knowledge of known classes, and few-shot learning methods are able to learn new
+classes using very few training examples. We propose a novel continual learning
+method inspired by human cognition and learning that can accurately recognise
+new compound expression classes using few training samples, by building on and
+retaining its knowledge of basic expression classes. Using GradCAM
+visualisations, we demonstrate the relationship between basic and compound
+facial expressions, which our method leverages through knowledge distillation
+and a novel Predictive Sorting Memory Replay. Our method achieves the current
+state-of-the-art in continual learning for complex facial expression
+recognition with 74.28% Overall Accuracy on new classes. We also demonstrate
+that using continual learning for complex facial expression recognition
+achieves far better performance than non-continual learning methods, improving
+on state-of-the-art non-continual learning methods by 13.95%. To the best of
+our knowledge, our work is also the first to apply few-shot learning to complex
+facial expression recognition, achieving the state-of-the-art with 100%
+accuracy using a single training sample for each expression class.
+
+
+ comment: 17 pages, 9 figures, 6 tables. Code available at + https://github.com/AngusMaiden/complex-FER +
+
+
+
+
+ + ☆ Physical Adversarial Attacks For Camera-based Smart Systems: Current + Trends, Categorization, Applications, Research Challenges, and Future Outlook + + +
+ In this paper, we present a comprehensive survey of the current trends +focusing specifically on physical adversarial attacks. We aim to provide a +thorough understanding of the concept of physical adversarial attacks, +analyzing their key characteristics and distinguishing features. Furthermore, +we explore the specific requirements and challenges associated with executing +attacks in the physical world. Our article delves into various physical +adversarial attack methods, categorized according to their target tasks in +different applications, including classification, detection, face recognition, +semantic segmentation and depth estimation. We assess the performance of these +attack methods in terms of their effectiveness, stealthiness, and robustness. +We examine how each technique strives to ensure the successful manipulation of +DNNs while mitigating the risk of detection and withstanding real-world +distortions. Lastly, we discuss the current challenges and outline potential +future research directions in the field of physical adversarial attacks. We +highlight the need for enhanced defense mechanisms, the exploration of novel +attack strategies, the evaluation of attacks in different application domains, +and the establishment of standardized benchmarks and evaluation criteria for +physical adversarial attacks. Through this comprehensive survey, we aim to +provide a valuable resource for researchers, practitioners, and policymakers to +gain a holistic understanding of physical adversarial attacks in computer +vision and facilitate the development of robust and secure DNN-based systems. + +
+
+
+
+
+ + ☆ Rethinking the Localization in Weakly Supervised Object Localization + + +
+ Weakly supervised object localization (WSOL) is one of the most popular and +challenging tasks in computer vision. This task is to localize the objects in +the images given only the image-level supervision. Recently, dividing WSOL into +two parts (class-agnostic object localization and object classification) has +become the state-of-the-art pipeline for this task. However, existing solutions +under this pipeline usually suffer from the following drawbacks: 1) they are +not flexible since they can only localize one object for each image due to the +adopted single-class regression (SCR) for localization; 2) the generated pseudo +bounding boxes may be noisy, but the negative impact of such noise is not well +addressed. To remedy these drawbacks, we first propose to replace SCR with a +binary-class detector (BCD) for localizing multiple objects, where the detector +is trained by discriminating the foreground and background. Then we design a +weighted entropy (WE) loss using the unlabeled data to reduce the negative +impact of noisy bounding boxes. Extensive experiments on the popular +CUB-200-2011 and ImageNet-1K datasets demonstrate the effectiveness of our +method. + +
+
+ comment: Accepted by ACM International Conference on Multimedia 2023 +
+
+
+
+
+ + ☆ DatasetDM: Synthesizing Data with Perception Annotations Using Diffusion + Models + + +
+ Current deep networks are very data-hungry and benefit from training on
+large-scale datasets, which are often time-consuming to collect and annotate.
+By contrast, synthetic data can be generated infinitely using generative models
+such as DALL-E and diffusion models, with minimal effort and cost. In this
+paper, we present DatasetDM, a generic dataset generation model that can
+produce diverse synthetic images and the corresponding high-quality perception
+annotations (e.g., segmentation masks and depth). Our method builds upon the
+pre-trained diffusion model and extends text-guided image synthesis to
+perception data generation. We show that the rich latent code of the diffusion
+model can be effectively decoded as accurate perception annotations using a
+decoder module. Training the decoder requires less than 1% (around 100)
+manually labeled images, enabling the generation of an infinitely large
+annotated dataset. Then these synthetic data can be used for training various
+perception models for downstream tasks. To showcase the power of the proposed
+approach, we generate datasets with rich dense pixel-wise labels for a wide
+range of downstream tasks, including semantic segmentation, instance
+segmentation, and depth estimation. Notably, it achieves 1) state-of-the-art
+results on semantic segmentation and instance segmentation; 2) significantly
+better robustness in domain generalization than using real data alone, as well
+as state-of-the-art results in the zero-shot segmentation setting; and 3)
+flexibility for efficient application and novel task composition (e.g., image
+editing). The project website and code can be found at
+https://weijiawu.github.io/DatasetDM_page/ and
+https://github.com/showlab/DatasetDM, respectively.
+
+
+
+
+
+ + ☆ Efficient Large-scale AUV-based Visual Seafloor Mapping + + +
+ Driven by the increasing number of marine data science applications, there is
+a growing interest in surveying and exploring the vast, uncharted terrain of
+the deep sea with robotic platforms. Despite impressive results achieved by
+many on-land visual mapping algorithms in the past decades, transferring these
+methods from land to the deep sea remains a challenge due to harsh
+environmental conditions. Typically, deep-sea exploration involves the use of
+autonomous underwater vehicles (AUVs) equipped with high-resolution cameras and
+artificial illumination systems. However, images obtained in this manner often
+suffer from heterogeneous illumination and quality degradation due to
+attenuation and scattering, on top of refraction of light rays. Together, these
+effects often cause on-land SLAM approaches to fail underwater, or make
+Structure-from-Motion approaches drift or omit difficult images, resulting in
+gaps, jumps or weakly registered areas. In this work, we present a system that
+incorporates recent developments in underwater imaging and visual mapping to
+facilitate automated robotic 3D reconstruction of hectares of seafloor. Our
+approach is efficient in that it detects and reconsiders difficult, weakly
+registered areas, to avoid omitting images and to make better use of limited
+dive time; it is also computationally efficient, leveraging a hybrid approach
+that combines benefits from SLAM and Structure-from-Motion and runs much faster
+than incremental reconstructions while achieving at least on-par performance.
+The proposed system has been extensively tested and evaluated during several
+research cruises, demonstrating its robustness and practicality in real-world
+conditions.
+
+
+ comment: 27 pages, 21 figures +
+
+
+
+
+ + ☆ CompTLL-UNet: Compressed Domain Text-Line Localization in Challenging + Handwritten Documents using Deep Feature Learning from JPEG Coefficients + + +
+ Automatic localization of text-lines in handwritten documents is still an
+open and challenging research problem. Writing issues such as uneven spacing
+between lines, oscillating and touching text, and the presence of skew become
+much more challenging when complex handwritten document images are segmented
+directly in their compressed representation. Conventionally, compressed
+documents are processed after decompression; in this paper, we instead propose
+to employ deep feature learning directly on the JPEG compressed coefficients,
+without full decompression, to accomplish text-line localization in the JPEG
+compressed domain. A modified U-Net architecture known as Compressed Text-Line
+Localization Network (CompTLL-UNet) is designed to accomplish it. The model is
+trained and tested with JPEG compressed versions of benchmark datasets
+including ICDAR2017 (cBAD) and ICDAR2019 (cBAD), reporting state-of-the-art
+performance with reduced storage and computational costs in the JPEG compressed
+domain.
+
+
+ comment: Accepted in 7th Asian Conference on Pattern Recognition (ACPR 2023), + 5-8 November 2023, Kitakyushu, Japan +
+
+
+
+
+ + ☆ Uncertainty Quantification for Image-based Traffic Prediction across + Cities + + +
+ Despite the strong predictive performance of deep learning models for traffic +prediction, their widespread deployment in real-world intelligent +transportation systems has been restrained by a lack of interpretability. +Uncertainty quantification (UQ) methods provide an approach to induce +probabilistic reasoning, improve decision-making and enhance model deployment +potential. To gain a comprehensive picture of the usefulness of existing UQ +methods for traffic prediction and the relation between obtained uncertainties +and city-wide traffic dynamics, we investigate their application to a +large-scale image-based traffic dataset spanning multiple cities and time +periods. We compare two epistemic and two aleatoric UQ methods on both temporal +and spatio-temporal transfer tasks, and find that meaningful uncertainty +estimates can be recovered. We further demonstrate how uncertainty estimates +can be employed for unsupervised outlier detection on changes in city traffic +dynamics. We find that our approach can capture both temporal and spatial +effects on traffic behaviour in a representative case study for the city of +Moscow. Our work presents a further step towards boosting uncertainty awareness +in traffic prediction tasks, and aims to highlight the value contribution of UQ +methods to a better understanding of city traffic dynamics. + +
+
+ comment: 39 pages, 22 figures. Code publicly available at: + https://github.com/alextimans/traffic4cast-uncertainty +
+
+
+
+
+ + ☆ Taming the Power of Diffusion Models for High-Quality Virtual Try-On + with Appearance Flow + + +
+ Virtual try-on is a critical image synthesis task that aims to transfer
+clothes from one image to another while preserving the details of both humans
+and clothes. While many existing methods rely on Generative Adversarial
+Networks (GANs) to achieve this, flaws can still occur, particularly at high
+resolutions. Recently, the diffusion model has emerged as a promising
+alternative for generating high-quality images in various applications.
+However, simply using clothes as a condition for guiding the diffusion model to
+inpaint is insufficient to maintain the details of the clothes. To overcome
+this challenge, we propose an exemplar-based inpainting approach that leverages
+a warping module to guide the diffusion model's generation effectively. The
+warping module performs initial processing on the clothes, which helps to
+preserve their local details. We then combine the warped clothes with the
+clothes-agnostic person image and add noise to form the input of the diffusion
+model. Additionally, the warped clothes are used as a local condition at each
+denoising step to ensure that the resulting output retains as much detail as
+possible. Our approach, namely Diffusion-based Conditional Inpainting for
+Virtual Try-ON (DCI-VTON), effectively utilizes the power of the diffusion
+model, and the incorporation of the warping module helps to produce
+high-quality and realistic virtual try-on results. Experimental results on
+VITON-HD demonstrate the effectiveness and superiority of our method.
+
+
+ comment: Accepted by ACMMM 2023 +
+
+
+
+
+ + ☆ Diffusion-based Visual Counterfactual Explanations -- Towards Systematic + Quantitative Evaluation ECML 2023 + + +
+ The latest methods for visual counterfactual explanations (VCE) harness the
+power of deep generative models to synthesize new examples of high-dimensional
+images of impressive quality. However, it is currently difficult to compare the
+performance of these VCE methods, as the evaluation procedures vary widely and
+often boil down to visual inspection of individual examples and small-scale
+user studies. In this work, we propose a framework for systematic, quantitative
+evaluation of VCE methods and a minimal set of metrics to be used. We use this
+framework to explore the effects of certain crucial design choices in the
+latest diffusion-based generative models for VCEs of natural image
+classification (ImageNet). We conduct a battery of ablation-like experiments,
+generating thousands of VCEs for a suite of classifiers of varying complexity,
+accuracy and robustness. Our findings suggest multiple directions for future
+advancements and improvements of VCE methods. By sharing our methodology and
+our approach to tackling the computational challenges of such a study on a
+limited hardware setup (including the complete code base), we offer valuable
+guidance for researchers in the field, fostering consistency and transparency
+in the assessment of counterfactual explanations.
+
+
+ comment: Accepted at the 5th International Workshop on eXplainable Knowledge + Discovery in Data Mining @ ECML 2023 +
+
+
+
+
+ + ☆ Automated Construction of Time-Space Diagrams for Traffic Analysis Using + Street-View Video Sequence + + +
+ Time-space diagrams are essential tools for analyzing traffic patterns and +optimizing transportation infrastructure and traffic management strategies. +Traditional data collection methods for these diagrams have limitations in +terms of temporal and spatial coverage. Recent advancements in camera +technology have overcome these limitations and provided extensive urban data. +In this study, we propose an innovative approach to constructing time-space +diagrams by utilizing street-view video sequences captured by cameras mounted +on moving vehicles. Using the state-of-the-art YOLOv5, StrongSORT, and +photogrammetry techniques for distance calculation, we can infer vehicle +trajectories from the video data and generate time-space diagrams. To evaluate +the effectiveness of our proposed method, we utilized datasets from the KITTI +computer vision benchmark suite. The evaluation results demonstrate that our +approach can generate trajectories from video data, although there are some +errors that can be mitigated by improving the performance of the detector, +tracker, and distance calculation components. In conclusion, the utilization of +street-view video sequences captured by cameras mounted on moving vehicles, +combined with state-of-the-art computer vision techniques, has immense +potential for constructing comprehensive time-space diagrams. These diagrams +offer valuable insights into traffic patterns and contribute to the design of +transportation infrastructure and traffic management strategies. + +
+
+
+
+
+ + ☆ RIGID: Recurrent GAN Inversion and Editing of Real Face Videos ICCV2023 + + +
+ GAN inversion is indispensable for applying the powerful editability of GAN
+to real images. However, existing methods invert video frames individually,
+often leading to undesired, temporally inconsistent results. In this paper, we
+propose a unified recurrent framework, named \textbf{R}ecurrent v\textbf{I}deo
+\textbf{G}AN \textbf{I}nversion and e\textbf{D}iting (RIGID), to explicitly and
+simultaneously enforce temporally coherent GAN inversion and facial editing of
+real videos. Our approach models the temporal relations between current and
+previous frames from three aspects. First, to enable faithful real video
+reconstruction, we maximize the inversion fidelity and consistency by learning
+a temporally compensated latent code. Second, we observe that incoherent noise
+lies in the high-frequency domain and can be disentangled from the latent
+space. Third, to remove the inconsistency after attribute manipulation, we
+propose an \textit{in-between frame composition constraint} such that an
+arbitrary frame must be a direct composite of its neighboring frames. Our
+unified framework learns the inherent coherence between input frames in an
+end-to-end manner, and therefore it is agnostic to a specific attribute and can
+be applied to arbitrary editing of the same video without re-training.
+Extensive experiments demonstrate that RIGID outperforms state-of-the-art
+methods qualitatively and quantitatively in both inversion and editing tasks.
+The deliverables can be found at \url{https://cnnlstm.github.io/RIGID}.
+
+
+ comment: ICCV2023 +
+
+
+
+
+ + ☆ Experts Weights Averaging: A New General Training Scheme for Vision + Transformers + + +
+ Structural re-parameterization is a general training scheme for Convolutional
+Neural Networks (CNNs) that achieves performance improvement without increasing
+inference cost. As Vision Transformers (ViTs) are gradually surpassing CNNs in
+various visual tasks, one may ask: does a training scheme specifically designed
+for ViTs exist that can also improve performance without increasing inference
+cost? Recently, Mixture-of-Experts (MoE) has attracted increasing attention, as
+it can efficiently scale up the capacity of Transformers at a fixed cost
+through sparsely activated experts. Considering that MoE can also be viewed as
+a multi-branch structure, can we utilize MoE to implement a ViT training scheme
+similar to structural re-parameterization? In this paper, we answer these
+questions affirmatively with a new general training strategy for ViTs.
+Specifically, we decouple the training and inference phases of ViTs. During
+training, we replace some Feed-Forward Networks (FFNs) of the ViT with
+specially designed, more efficient MoEs that assign tokens to experts by random
+uniform partition, and perform Experts Weights Averaging (EWA) on these MoEs at
+the end of each iteration. After training, we convert each MoE into an FFN by
+averaging the experts, transforming the model back into the original ViT for
+inference. We further provide a theoretical analysis to show why and how it
+works. Comprehensive experiments across various 2D and 3D visual tasks, ViT
+architectures, and datasets validate the effectiveness and generalizability of
+the proposed training scheme. Besides, our training scheme can also be applied
+to improve performance when fine-tuning ViTs. Last but equally important, the
+proposed EWA technique can significantly improve the effectiveness of naive MoE
+on various small 2D visual datasets and 3D visual tasks.
+
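+ The two averaging steps above (per-iteration EWA and the post-training
+collapse of experts into one FFN) can be sketched as follows. This is a minimal
+PyTorch sketch with toy FFN experts and an illustrative averaging coefficient,
+not the paper's actual hyper-parameters or code.
+
+```python
+# Sketch of Experts Weights Averaging: pull expert weights toward their mean
+# during training, then average all experts into a single FFN for inference.
+import copy
+import torch
+import torch.nn as nn
+
+def make_ffn(dim: int, hidden: int) -> nn.Sequential:
+    return nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
+
+experts = nn.ModuleList([make_ffn(64, 256) for _ in range(4)])
+
+@torch.no_grad()
+def ewa_step(experts: nn.ModuleList, beta: float = 0.5) -> None:
+    """Move every expert's parameters toward the across-expert mean (beta is assumed)."""
+    for params in zip(*[e.parameters() for e in experts]):
+        mean = torch.stack([p.data for p in params]).mean(dim=0)
+        for p in params:
+            p.data.mul_(1.0 - beta).add_(beta * mean)
+
+@torch.no_grad()
+def collapse_to_ffn(experts: nn.ModuleList) -> nn.Module:
+    """After training, average all experts into one FFN for plain ViT inference."""
+    merged = copy.deepcopy(experts[0])
+    for p_merged, *p_experts in zip(merged.parameters(), *[e.parameters() for e in experts]):
+        p_merged.data.copy_(torch.stack([p.data for p in p_experts]).mean(dim=0))
+    return merged
+
+ewa_step(experts)                       # would run at the end of each training iteration
+ffn = collapse_to_ffn(experts)          # single FFN used at inference time
+print(ffn(torch.randn(2, 64)).shape)    # torch.Size([2, 64])
+```
+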
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Versatile Face Animator: Driving Arbitrary 3D Facial Avatar in RGBD + Space ACM MM2023 + + +
+ Creating realistic 3D facial animation is crucial for various applications in +the movie production and gaming industry, especially with the burgeoning demand +in the metaverse. However, prevalent methods such as blendshape-based +approaches and facial rigging techniques are time-consuming, labor-intensive, +and lack standardized configurations, making facial animation production +challenging and costly. In this paper, we propose a novel self-supervised +framework, Versatile Face Animator, which combines facial motion capture with +motion retargeting in an end-to-end manner, eliminating the need for +blendshapes or rigs. Our method has the following two main characteristics: 1) +we propose an RGBD animation module to learn facial motion from raw RGBD videos +by hierarchical motion dictionaries and animate RGBD images rendered from 3D +facial mesh coarse-to-fine, enabling facial animation on arbitrary 3D +characters regardless of their topology, textures, blendshapes, and rigs; and +2) we introduce a mesh retarget module to utilize RGBD animation to create 3D +facial animation by manipulating facial mesh with controller transformations, +which are estimated from dense optical flow fields and blended together with +geodesic-distance-based weights. Comprehensive experiments demonstrate the +effectiveness of our proposed framework in generating impressive 3D facial +animation results, highlighting its potential as a promising solution for the +cost-effective and efficient production of facial animation in the metaverse. + +
+
+ comment: Accepted by ACM MM2023 +
+
+
+
+
+ + ☆ Out-of-Distribution Detection for Monocular Depth Estimation ICCV 2023 + + +
+ In monocular depth estimation, uncertainty estimation approaches mainly +target the data uncertainty introduced by image noise. In contrast to prior +work, we address the uncertainty due to lack of knowledge, which is relevant +for the detection of data not represented by the training distribution, the +so-called out-of-distribution (OOD) data. Motivated by anomaly detection, we +propose to detect OOD images from an encoder-decoder depth estimation model +based on the reconstruction error. Given the features extracted with the fixed +depth encoder, we train an image decoder for image reconstruction using only +in-distribution data. Consequently, OOD images result in a high reconstruction +error, which we use to distinguish between in- and out-of-distribution samples. +We built our experiments on the standard NYU Depth V2 and KITTI benchmarks as +in-distribution data. Our post hoc method performs astonishingly well on +different models and outperforms existing uncertainty estimation approaches +without modifying the trained encoder-decoder depth estimation model. + +
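+ The reconstruction-error score described above can be sketched compactly: an
+image decoder is trained on in-distribution data only, on top of a frozen depth
+encoder, and test images with high reconstruction error are flagged as OOD. The
+encoder/decoder below are toy stand-ins, not the actual depth network.
+
+```python
+# Sketch: train an image decoder on frozen encoder features (in-distribution
+# data only), then score test images by their per-image reconstruction error.
+import torch
+import torch.nn as nn
+
+encoder = nn.Sequential(nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU())
+decoder = nn.Sequential(nn.ConvTranspose2d(16, 3, 4, stride=2, padding=1))
+
+for p in encoder.parameters():          # the depth encoder stays fixed
+    p.requires_grad_(False)
+
+optim = torch.optim.Adam(decoder.parameters(), lr=1e-3)
+in_dist = torch.rand(8, 3, 64, 64)      # hypothetical in-distribution batch
+
+for _ in range(20):                     # train the image decoder only
+    recon = decoder(encoder(in_dist))
+    loss = nn.functional.mse_loss(recon, in_dist)
+    optim.zero_grad()
+    loss.backward()
+    optim.step()
+
+@torch.no_grad()
+def ood_score(x: torch.Tensor) -> torch.Tensor:
+    """Per-image reconstruction error; larger means more likely out-of-distribution."""
+    recon = decoder(encoder(x))
+    return ((recon - x) ** 2).flatten(1).mean(dim=1)
+
+print(ood_score(in_dist))                          # low scores expected
+print(ood_score(torch.rand(2, 3, 64, 64) * 5.0))   # shifted inputs score higher
+```
+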
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Head Rotation in Denoising Diffusion Models + + +
+ Denoising Diffusion Models (DDM) are emerging as the cutting-edge technology +in the realm of deep generative modeling, challenging the dominance of +Generative Adversarial Networks. However, effectively exploring the latent +space's semantics and identifying compelling trajectories for manipulating and +editing important attributes of the generated samples remains challenging, +primarily due to the high-dimensional nature of the latent space. In this +study, we specifically concentrate on face rotation, which is known to be one +of the most intricate editing operations. By leveraging a recent embedding +technique for Denoising Diffusion Implicit Models (DDIM), we achieve, in many +cases, noteworthy manipulations encompassing a wide rotation angle of $\pm +30^o$, preserving the distinct characteristics of the individual. Our +methodology exploits the computation of trajectories approximating clouds of +latent representations of dataset samples with different yaw rotations through +linear regression. Specific trajectories are obtained by restricting the +analysis to subsets of data sharing significant attributes with the source +image. One of these attributes is the light provenance: a byproduct of our +research is a labeling of CelebA, categorizing images into three major groups +based on the illumination direction: left, center, and right. + +
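+ A rough sketch of the linear-regression trajectory idea described above is
+given below, using synthetic stand-ins for DDIM latents; the latent
+dimensionality, data, and angles are placeholders, not the paper's setup.
+
+```python
+# Sketch: fit a linear map from yaw angle to (flattened) latent codes over a
+# subset of samples, then shift a source latent along the fitted direction.
+import numpy as np
+
+rng = np.random.default_rng(0)
+latent_dim = 128
+yaw = rng.uniform(-30.0, 30.0, size=200)                 # yaw angles in degrees
+direction = rng.normal(size=latent_dim)                  # hidden ground-truth axis (toy data)
+latents = np.outer(yaw, direction) + rng.normal(scale=0.1, size=(200, latent_dim))
+
+# Least-squares fit of latent ~= yaw * w + b, one regression per latent coordinate.
+A = np.stack([yaw, np.ones_like(yaw)], axis=1)           # (200, 2) design matrix
+coef, *_ = np.linalg.lstsq(A, latents, rcond=None)       # (2, latent_dim)
+w, b = coef[0], coef[1]                                  # slope (direction) and intercept
+
+def rotate_latent(z: np.ndarray, current_yaw: float, target_yaw: float) -> np.ndarray:
+    """Shift a latent code along the fitted yaw trajectory."""
+    return z + (target_yaw - current_yaw) * w
+
+z_rot = rotate_latent(latents[0], current_yaw=yaw[0], target_yaw=25.0)
+```
+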
+
+
+
+
+ + ☆ Computer-Aided Cytology Diagnosis in Animals: CNN-Based Image Quality + Assessment for Accurate Disease Classification + + +
+ This paper presents a computer-aided cytology diagnosis system designed for +animals, focusing on image quality assessment (IQA) using Convolutional Neural +Networks (CNNs). The system's building blocks are tailored to seamlessly +integrate IQA, ensuring reliable performance in disease classification. We +extensively investigate the CNN's ability to handle various image variations +and scenarios, analyzing the impact on detecting low-quality input data. +Additionally, the network's capacity to differentiate valid cellular samples +from those with artifacts is evaluated. Our study employs a ResNet18 network +architecture and explores the effects of input sizes and cropping strategies on +model performance. The research sheds light on the significance of CNN-based +IQA in computer-aided cytology diagnosis for animals, enhancing the accuracy of +disease classification. + +
+
+
+
+
+ + ☆ Hardware Accelerators in Autonomous Driving + + +
+ Computing platforms in autonomous vehicles record large amounts of data from +many sensors, process the data through machine learning models, and make +decisions to ensure the vehicle's safe operation. Fast, accurate, and reliable +decision-making is critical. Traditional computer processors lack the power and +flexibility needed for the perception and machine vision demands of advanced +autonomous driving tasks. Hardware accelerators are special-purpose +coprocessors that help autonomous vehicles meet performance requirements for +higher levels of autonomy. This paper provides an overview of ML accelerators +with examples of their use for machine vision in autonomous vehicles. We offer +recommendations for researchers and practitioners and highlight a trajectory +for ongoing and future research in this emerging field. + +
+
+
+
+
+ + ☆ Towards Instance-adaptive Inference for Federated Learning + + +
+ Federated learning (FL) is a distributed learning paradigm that enables +multiple clients to learn a powerful global model by aggregating local +training. However, the performance of the global model is often hampered by +non-i.i.d. distribution among the clients, requiring extensive efforts to +mitigate inter-client data heterogeneity. Going beyond inter-client data +heterogeneity, we note that intra-client heterogeneity can also be observed on +complex real-world data and seriously deteriorate FL performance. In this +paper, we present a novel FL algorithm, i.e., FedIns, to handle intra-client +data heterogeneity by enabling instance-adaptive inference in the FL framework. +Instead of huge instance-adaptive models, we resort to a parameter-efficient +fine-tuning method, i.e., scale and shift deep features (SSF), upon a +pre-trained model. Specifically, we first train an SSF pool for each client, +and aggregate these SSF pools on the server side, thus still maintaining a low +communication cost. To enable instance-adaptive inference, for a given +instance, we dynamically find the best-matched SSF subsets from the pool and +aggregate them to generate an adaptive SSF specified for the instance, thereby +reducing the intra-client as well as the inter-client heterogeneity. Extensive +experiments show that our FedIns outperforms state-of-the-art FL algorithms, +e.g., a 6.64\% improvement against the top-performing method with less than +15\% communication cost on Tiny-ImageNet. Our code and models will be publicly +released. + +
+
+
+
+
+ + ☆ Diverse Data Augmentation with Diffusions for Effective Test-time Prompt + Tuning + + +
+ Benefiting from prompt tuning, recent years have witnessed the promising
+performance of pre-trained vision-language models, e.g., CLIP, on versatile
+downstream tasks. In this paper, we focus on a particular setting of learning
+adaptive prompts on the fly for each test sample from an unseen new domain,
+which is known as test-time prompt tuning (TPT). Existing TPT methods typically
+rely on data augmentation and confidence selection. However, conventional data
+augmentation techniques, e.g., random resized crops, suffer from a lack of data
+diversity, while entropy-based confidence selection alone is not sufficient to
+guarantee prediction fidelity. To address these issues, we propose a novel TPT
+method, named DiffTPT, which leverages pre-trained diffusion models to generate
+diverse and informative new data. Specifically, we incorporate augmented data
+from both the conventional method and pre-trained stable diffusion to exploit
+their respective merits, improving the model's ability to adapt to unknown new
+test data. Moreover, to ensure the prediction fidelity of the generated data,
+we introduce a cosine similarity-based filtration technique to select the
+generated data with higher similarity to the single test sample. Our
+experiments on test datasets with distribution shifts and unseen categories
+demonstrate that DiffTPT improves the zero-shot accuracy by an average of
+5.13\% compared to the state-of-the-art TPT method. Our code and models will be
+publicly released.
+
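+ The cosine similarity-based filtration step can be sketched as below, assuming
+features (e.g., from CLIP) have already been extracted for the test image and
+the diffusion-generated augmentations; the feature size and keep ratio are
+illustrative assumptions, not the paper's settings.
+
+```python
+# Sketch: keep only the generated augmentations whose features are most similar
+# (by cosine similarity) to the single test image's feature.
+import torch
+import torch.nn.functional as F
+
+test_feat = F.normalize(torch.randn(1, 512), dim=-1)    # feature of the test image (mocked)
+gen_feats = F.normalize(torch.randn(64, 512), dim=-1)   # features of 64 generated images (mocked)
+
+cos_sim = (gen_feats @ test_feat.t()).squeeze(1)        # cosine similarity per augmentation
+keep = int(0.25 * len(gen_feats))                        # keep the top 25% (assumed ratio)
+kept_idx = torch.topk(cos_sim, k=keep).indices          # indices of retained augmentations
+
+print(kept_idx.shape)  # torch.Size([16]); these images would feed test-time prompt tuning
+```
+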
+
+ comment: 8 pages,9 figures +
+
+
+
+
+ + ☆ Masked-Attention Diffusion Guidance for Spatially Controlling + Text-to-Image Generation + + +
+ Text-to-image synthesis has achieved high-quality results with recent
+advances in diffusion models. However, text input alone has high spatial
+ambiguity and limited user controllability. Most existing methods allow spatial
+control through additional visual guidance (e.g., sketches and semantic masks)
+but require additional training with annotated images. In this paper, we
+propose a method for spatially controlling text-to-image generation without
+further training of diffusion models. Our method is based on the insight that
+the cross-attention maps reflect the positional relationship between words and
+pixels. Our aim is to control the attention maps according to given semantic
+masks and text prompts. To this end, we first explore a simple approach of
+directly swapping the cross-attention maps with constant maps computed from the
+semantic regions. Moreover, we propose masked-attention guidance, which can
+generate images more faithful to the semantic masks than the first approach.
+Masked-attention guidance indirectly controls the attention to each word and
+pixel according to the semantic regions by manipulating the noise images fed to
+the diffusion models. Experiments show that our method enables more accurate
+spatial control than baselines, both qualitatively and quantitatively.
+
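+ A small sketch of the constant-map swap explored above is shown below, under
+the assumption that a token's cross-attention map can simply be replaced by a
+mask-normalized constant map and the maps re-normalized across tokens; shapes
+and names are illustrative, not the paper's implementation.
+
+```python
+# Sketch: replace the cross-attention map of a masked token with a map that is
+# uniform inside its semantic region and zero outside, then renormalize.
+import torch
+
+def swap_with_constant_maps(attn, masks):
+    """attn: (tokens, H, W) attention maps; masks: {token index: (H, W) binary mask}."""
+    attn = attn.clone()
+    for tok, mask in masks.items():
+        area = mask.sum().clamp(min=1.0)
+        attn[tok] = mask / area                                    # constant value inside the region
+    return attn / attn.sum(dim=0, keepdim=True).clamp(min=1e-8)    # per-pixel sum over tokens = 1
+
+tokens, H, W = 8, 16, 16
+attn = torch.rand(tokens, H, W).softmax(dim=0)    # toy cross-attention maps
+mask = torch.zeros(H, W)
+mask[4:12, 4:12] = 1.0                            # semantic region assigned to token 3
+out = swap_with_constant_maps(attn, {3: mask})
+```
+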
+
+
+
+
+ + ☆ Spatial-information Guided Adaptive Context-aware Network for Efficient + RGB-D Semantic Segmentation + + +
+ Efficient RGB-D semantic segmentation has received considerable attention in
+mobile robotics, where it plays a vital role in analyzing and recognizing
+environmental information. According to previous studies, depth information can
+provide corresponding geometric relationships for objects and scenes, but
+actual depth data are usually noisy. To avoid unfavorable effects on
+segmentation accuracy and computation, it is necessary to design an efficient
+framework that leverages cross-modal correlations and complementary cues. In
+this paper, we propose an efficient lightweight encoder-decoder network that
+reduces the computational parameters and guarantees the robustness of the
+algorithm. Working with channel and spatial fusion attention modules, our
+network effectively captures multi-level RGB-D features. A globally guided
+local affinity context module is proposed to obtain sufficient high-level
+context information. The decoder utilizes a lightweight residual unit that
+combines short- and long-distance information with few redundant computations.
+Experimental results on the NYUv2, SUN RGB-D, and Cityscapes datasets show that
+our method achieves a better trade-off among segmentation accuracy, inference
+time, and parameters than state-of-the-art methods. The source code will be
+available at https://github.com/MVME-HBUT/SGACNet
+
+
+ comment: Accepted by IEEE Sensors Journal +
+
+
+
+
+ + ☆ Scale-Preserving Automatic Concept Extraction (SPACE) + + +
+ Convolutional Neural Networks (CNN) have become a common choice for +industrial quality control, as well as other critical applications in the +Industry 4.0. When these CNNs behave in ways unexpected to human users or +developers, severe consequences can arise, such as economic losses or an +increased risk to human life. Concept extraction techniques can be applied to +increase the reliability and transparency of CNNs through generating global +explanations for trained neural network models. The decisive features of image +datasets in quality control often depend on the feature's scale; for example, +the size of a hole or an edge. However, existing concept extraction methods do +not correctly represent scale, which leads to problems interpreting these +models as we show herein. To address this issue, we introduce the +Scale-Preserving Automatic Concept Extraction (SPACE) algorithm, as a +state-of-the-art alternative concept extraction technique for CNNs, focused on +industrial applications. SPACE is specifically designed to overcome the +aforementioned problems by avoiding scale changes throughout the concept +extraction process. SPACE proposes an approach based on square slices of input +images, which are selected and then tiled before being clustered into concepts. +Our method provides explanations of the models' decision-making process in the +form of human-understandable concepts. We evaluate SPACE on three image +classification datasets in the context of industrial quality control. Through +experimental results, we illustrate how SPACE outperforms other methods and +provides actionable insights on the decision mechanisms of CNNs. Finally, code +for the implementation of SPACE is provided. + +
+
+ comment: 22 pages, 7 figures +
+
+
+
+
+ + ☆ Enhancing Generalization of Universal Adversarial Perturbation through + Gradient Aggregation + + +
+ Deep neural networks are vulnerable to universal adversarial perturbation +(UAP), an instance-agnostic perturbation capable of fooling the target model +for most samples. Compared to instance-specific adversarial examples, UAP is +more challenging as it needs to generalize across various samples and models. +In this paper, we examine the serious dilemma of UAP generation methods from a +generalization perspective -- the gradient vanishing problem using small-batch +stochastic gradient optimization and the local optima problem using large-batch +optimization. To address these problems, we propose a simple and effective +method called Stochastic Gradient Aggregation (SGA), which alleviates the +gradient vanishing and escapes from poor local optima at the same time. +Specifically, SGA employs the small-batch training to perform multiple +iterations of inner pre-search. Then, all the inner gradients are aggregated as +a one-step gradient estimation to enhance the gradient stability and reduce +quantization errors. Extensive experiments on the standard ImageNet dataset +demonstrate that our method significantly enhances the generalization ability +of UAP and outperforms other state-of-the-art methods. The code is available at +https://github.com/liuxuannan/Stochastic-Gradient-Aggregation. + +
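+ As a rough, simplified sketch of the aggregation step (not the full inner
+pre-search procedure of the paper), the snippet below averages gradients from
+several small batches into one sign-step update of the universal perturbation
+under an L-infinity budget; the classifier, data, and budget values are toy
+assumptions.
+
+```python
+# Simplified sketch: aggregate gradients from several small batches into a
+# single update of the universal adversarial perturbation.
+import torch
+import torch.nn as nn
+
+model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10))   # toy classifier
+model.eval()
+criterion = nn.CrossEntropyLoss()
+
+eps, step_size, inner_iters = 10 / 255, 1 / 255, 4
+delta = torch.zeros(1, 3, 32, 32)                    # the universal perturbation
+
+def sga_update(delta, batches):
+    grads = []
+    for x, y in batches:                              # small-batch inner passes
+        d = delta.clone().requires_grad_(True)
+        loss = criterion(model(x + d), y)             # maximize loss on clean labels
+        loss.backward()
+        grads.append(d.grad.detach())
+    agg = torch.stack(grads).mean(dim=0)              # aggregated one-step gradient estimate
+    return (delta + step_size * agg.sign()).clamp(-eps, eps)
+
+batches = [(torch.rand(8, 3, 32, 32), torch.randint(0, 10, (8,))) for _ in range(inner_iters)]
+delta = sga_update(delta, batches)
+```
+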
+
+
+
+
+ + ☆ ViGT: Proposal-free Video Grounding with Learnable Token in Transformer SC + + +
+ The video grounding (VG) task aims to locate the queried action or event in
+an untrimmed video based on rich linguistic descriptions. Existing
+proposal-free methods are trapped in the complex interaction between video and
+query, overemphasizing cross-modal feature fusion and feature correlation for
+VG. In this paper, we propose a novel boundary regression paradigm that
+performs regression token learning in a transformer. In particular, we present
+a simple but effective proposal-free framework, namely Video Grounding
+Transformer (ViGT), which predicts the temporal boundary using a learnable
+regression token rather than multi-modal or cross-modal features. In ViGT, the
+benefits of a learnable token are manifested as follows. (1) The token is
+unrelated to the video or the query and avoids data bias toward the original
+video and query. (2) The token simultaneously performs global context
+aggregation over video and query features. First, we employ a shared feature
+encoder to project both video and query into a joint feature space before
+performing cross-modal co-attention (i.e., video-to-query attention and
+query-to-video attention) to highlight discriminative features in each
+modality. Furthermore, we concatenate a learnable regression token [REG] with
+the video and query features as the input of a vision-language transformer.
+Finally, we utilize the token [REG] to predict the target moment and visual
+features to constrain the foreground and background probabilities at each
+timestamp. The proposed ViGT performs well on three public datasets: ANet
+Captions, TACoS and YouCookII. Extensive ablation studies and qualitative
+analyses further validate the interpretability of ViGT.
+
+
+ comment: This paper has been accepted by SCIENCE CHINA Information Sciences +
+
+
+
+
+ + ☆ Image-based Geolocalization by Ground-to-2.5D Map Matching + + +
+ We study the image-based geolocalization problem, which aims to locate
+ground-view query images on cartographic maps. Previous methods often utilize
+cross-view localization techniques to match ground-view query images with 2D
+maps. However, the performance of these methods is frequently unsatisfactory
+due to significant cross-view appearance differences. In this paper, we extend
+cross-view matching to 2.5D spaces, where the heights of structures - such as
+trees, buildings, and other objects - can provide additional information to
+guide the cross-view matching. We present a new approach to learning
+representative embeddings from multi-modal data. Specifically, we first align
+2D maps to ground-view panoramic images with a polar transform to reduce the
+gap between panoramic images and maps. We then leverage global fusion to fuse
+the multi-modal features from 2D and 2.5D maps to increase the distinctiveness
+of location embeddings. We construct the first large-scale ground-to-2.5D map
+geolocalization dataset to validate our method and facilitate the research. We
+test our learned embeddings on two popular localization approaches, i.e.,
+single-image-based localization and route-based localization. Extensive
+experiments demonstrate that our proposed method achieves significantly higher
+localization accuracy and faster convergence than previous 2D map-based
+approaches.
+
+
+
+
+
+ + ☆ Cyclic-Bootstrap Labeling for Weakly Supervised Object Detection ICCV 2023 + + +
+ Recent progress in weakly supervised object detection is featured by a +combination of multiple instance detection networks (MIDN) and ordinal online +refinement. However, with only image-level annotation, MIDN inevitably assigns +high scores to some unexpected region proposals when generating pseudo labels. +These inaccurate high-scoring region proposals will mislead the training of +subsequent refinement modules and thus hamper the detection performance. In +this work, we explore how to ameliorate the quality of pseudo-labeling in MIDN. +Formally, we devise Cyclic-Bootstrap Labeling (CBL), a novel weakly supervised +object detection pipeline, which optimizes MIDN with rank information from a +reliable teacher network. Specifically, we obtain this teacher network by +introducing a weighted exponential moving average strategy to take advantage of +various refinement modules. A novel class-specific ranking distillation +algorithm is proposed to leverage the output of weighted ensembled teacher +network for distilling MIDN with rank information. As a result, MIDN is guided +to assign higher scores to accurate proposals among their neighboring ones, +thus benefiting the subsequent pseudo labeling. Extensive experiments on the +prevalent PASCAL VOC 2007 \& 2012 and COCO datasets demonstrate the superior +performance of our CBL framework. Code will be available at +https://github.com/Yinyf0804/WSOD-CBL/. + +
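+ The teacher construction above builds on an exponential moving average of
+model weights. A generic EMA teacher update, which the weighted variant used in
+CBL extends, can be sketched as follows; the momentum value and the toy module
+are illustrative assumptions, not the paper's configuration.
+
+```python
+# Sketch: after every student update, move the teacher's weights toward the
+# student's with an exponential moving average.
+import copy
+import torch
+import torch.nn as nn
+
+student = nn.Linear(16, 4)
+teacher = copy.deepcopy(student)
+for p in teacher.parameters():
+    p.requires_grad_(False)
+
+@torch.no_grad()
+def ema_update(teacher: nn.Module, student: nn.Module, momentum: float = 0.999) -> None:
+    for t, s in zip(teacher.parameters(), student.parameters()):
+        t.data.mul_(momentum).add_((1.0 - momentum) * s.data)
+
+# Called once after each optimizer step on the student:
+ema_update(teacher, student)
+```
+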
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ MS3D++: Ensemble of Experts for Multi-Source Unsupervised Domain + Adaption in 3D Object Detection + + +
+ Deploying 3D detectors in unfamiliar domains has been demonstrated to result +in a drastic drop of up to 70-90% in detection rate due to variations in lidar, +geographical region, or weather conditions from their original training +dataset. This domain gap leads to missing detections for densely observed +objects, misaligned confidence scores, and increased high-confidence false +positives, rendering the detector highly unreliable. To address this, we +introduce MS3D++, a self-training framework for multi-source unsupervised +domain adaptation in 3D object detection. MS3D++ provides a straightforward +approach to domain adaptation by generating high-quality pseudo-labels, +enabling the adaptation of 3D detectors to a diverse range of lidar types, +regardless of their density. Our approach effectively fuses predictions of an +ensemble of multi-frame pre-trained detectors from different source domains to +improve domain generalization. We subsequently refine the predictions +temporally to ensure temporal consistency in box localization and object +classification. Furthermore, we present an in-depth study into the performance +and idiosyncrasies of various 3D detector components in a cross-domain context, +providing valuable insights for improved cross-domain detector ensembling. +Experimental results on Waymo, nuScenes and Lyft demonstrate that detectors +trained with MS3D++ pseudo-labels achieve state-of-the-art performance, +comparable to training with human-annotated labels in Bird's Eye View (BEV) +evaluation for both low and high density lidar. + +
+
+ comment: Code is available at https://github.com/darrenjkt/MS3D +
+
+
+
+
+ + ☆ Face Encryption via Frequency-Restricted Identity-Agnostic Attacks + + +
+ Billions of people share images of their daily lives on social media every
+day. However, malicious collectors use deep face recognition systems to easily
+steal their biometric information (e.g., faces) from these images. Some studies
+have been conducted to generate encrypted face photos using adversarial attacks
+by introducing imperceptible perturbations to reduce face information leakage.
+However, existing studies still lack feasibility in strong black-box scenarios
+and natural visual appearance, which limits their practicality for privacy
+protection. To address these problems, we propose a frequency-restricted
+identity-agnostic (FRIA) framework to encrypt face images against unauthorized
+face recognition without access to personal information. Regarding black-box
+feasibility, we observe that the average feature representations of multiple
+face recognition models are similar; thus we propose to use the average
+feature, computed over a dataset crawled from the Internet, as the target to
+guide the generation, which is also agnostic to the identities used by unknown
+face recognition systems. In addition, low-frequency perturbations are more
+visually perceptible to the human visual system. Inspired by this, we restrict
+the perturbation in the low-frequency facial regions via the discrete cosine
+transform to guarantee visual naturalness. Extensive experiments on several
+face recognition models demonstrate that our FRIA outperforms other
+state-of-the-art methods in generating more natural encrypted faces while
+attaining high black-box attack success rates of 96%. In addition, we validate
+the efficacy of FRIA using a real-world black-box commercial API, which reveals
+the potential of FRIA in practice. Our code can be found at
+https://github.com/XinDong10/FRIA.
+
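+ One plausible way to implement a frequency restriction of the kind described
+above is to damp the low-frequency DCT coefficients of the perturbation; the
+sketch below does this for a single channel. The cutoff and attenuation factor
+are assumptions for illustration and are not taken from the paper.
+
+```python
+# Sketch: suppress the low-frequency DCT content of a perturbation so that
+# visible low-frequency changes are attenuated.
+import numpy as np
+from scipy.fft import dctn, idctn
+
+def restrict_low_freq(perturbation: np.ndarray, cutoff: int = 8, keep: float = 0.1) -> np.ndarray:
+    """perturbation: (H, W) array; damp DCT coefficients in the low-frequency corner."""
+    coeffs = dctn(perturbation, norm="ortho")
+    coeffs[:cutoff, :cutoff] *= keep                  # shrink low-frequency energy
+    return idctn(coeffs, norm="ortho")
+
+rng = np.random.default_rng(0)
+delta = rng.normal(scale=0.03, size=(112, 112))       # toy perturbation for one channel
+delta_restricted = restrict_low_freq(delta)
+```
+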
+
+
+
+
+ + ☆ Zero-shot Text-driven Physically Interpretable Face Editing + + +
+ This paper proposes a novel and physically interpretable method for face +editing based on arbitrary text prompts. Different from previous +GAN-inversion-based face editing methods that manipulate the latent space of +GANs, or diffusion-based methods that model image manipulation as a reverse +diffusion process, we regard the face editing process as imposing vector flow +fields on face images, representing the offset of spatial coordinates and color +for each image pixel. Under the above-proposed paradigm, we represent the +vector flow field in two ways: 1) explicitly represent the flow vectors with +rasterized tensors, and 2) implicitly parameterize the flow vectors as +continuous, smooth, and resolution-agnostic neural fields, by leveraging the +recent advances of implicit neural representations. The flow vectors are +iteratively optimized under the guidance of the pre-trained Contrastive +Language-Image Pretraining~(CLIP) model by maximizing the correlation between +the edited image and the text prompt. We also propose a learning-based one-shot +face editing framework, which is fast and adaptable to any text prompt input. +Our method can also be flexibly extended to real-time video face editing. +Compared with state-of-the-art text-driven face editing methods, our method can +generate physically interpretable face editing results with high identity +consistency and image quality. Our code will be made publicly available. + +
+
+
+
+
+ + ☆ Focused Specific Objects NeRF + + +
+ Most NeRF-based models are designed to learn the entire scene, and complex
+scenes can lead to longer training times and poorer rendering quality. This
+paper utilizes scene semantic priors to speed up training, allowing the network
+to focus on the specific targets without being affected by complex backgrounds.
+Training speed can be increased by 7.78 times with better rendering quality,
+and small- to medium-sized targets can be rendered faster. In addition, this
+improvement applies to all NeRF-based models. Considering the inherent
+multi-view consistency and smoothness of NeRF, this paper also studies weak
+supervision by sparsely sampling negative ray samples. With this method,
+training can be further accelerated and rendering quality can be maintained.
+Finally, this paper extends the pixel semantic and color rendering formulas and
+proposes a new scene editing technique that can uniquely display or mask
+specific semantic targets in rendering. To address the problem of incorrect
+inferences in unsupervised regions of the scene, we also design a
+self-supervised loop that combines morphological operations and clustering.
+
+
+ comment: 17 pages,32 figures +
+
+
+
+
+ + ☆ YOLOrtho -- A Unified Framework for Teeth Enumeration and Dental Disease + Detection + + +
+ Detecting dental diseases from panoramic X-ray images is a standard procedure
+for dentists. Normally, a dentist needs to identify the diseases and find the
+infected teeth. While numerous machine learning models adopting this two-step
+procedure have been developed, there has not been an end-to-end model that can
+identify teeth and their associated diseases at the same time. To fill this
+gap, we develop YOLOrtho, a unified framework for teeth enumeration and dental
+disease detection. We develop our model on the Dentex Challenge 2023 data,
+which consists of three distinct types of annotated data: the first part is
+labeled with quadrant, the second part with quadrant and enumeration, and the
+third part with quadrant, enumeration, and disease. To further improve
+detection, we make use of the public Tufts Dental dataset. To fully utilize the
+data and learn both teeth detection and disease identification simultaneously,
+we formulate diseases as attributes attached to their corresponding teeth.
+Because teeth enumeration depends strongly on positional relations, we replace
+convolution layers with CoordConv in our model to provide more positional
+information. We also adjust the model architecture and insert one more
+upsampling layer into the FPN to favor large-object detection. Finally, we
+propose a post-processing strategy for the teeth layout that corrects teeth
+enumeration based on linear sum assignment. Experimental results show that our
+model outperforms a large diffusion-based model.
+
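+ The linear-sum-assignment correction mentioned above can be sketched with
+SciPy's Hungarian solver; the cost definition used here (distance between
+detected box centers and assumed canonical tooth-slot positions) is an
+illustrative choice, not necessarily the paper's exact cost.
+
+```python
+# Sketch: assign detected teeth to canonical enumeration slots by minimizing
+# the total center-to-slot distance with the Hungarian algorithm.
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+det_centers = np.array([[0.20, 0.30], [0.40, 0.31], [0.61, 0.29]])              # detected (x, y)
+slot_centers = np.array([[0.20, 0.30], [0.40, 0.30], [0.60, 0.30], [0.80, 0.30]])  # assumed slots
+
+cost = np.linalg.norm(det_centers[:, None, :] - slot_centers[None, :, :], axis=-1)
+rows, cols = linear_sum_assignment(cost)              # detection rows[i] -> slot cols[i]
+for det_idx, slot_idx in zip(rows, cols):
+    print(f"detection {det_idx} assigned to tooth slot {slot_idx}")
+```
+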
+
+
+
+
+ + ☆ Compositional Learning in Transformer-Based Human-Object Interaction + Detection + + +
+ Human-object interaction (HOI) detection is an important part of +understanding human activities and visual scenes. The long-tailed distribution +of labeled instances is a primary challenge in HOI detection, promoting +research in few-shot and zero-shot learning. Inspired by the combinatorial +nature of HOI triplets, some existing approaches adopt the idea of +compositional learning, in which object and action features are learned +individually and re-composed as new training samples. However, these methods +follow the CNN-based two-stage paradigm with limited feature extraction +ability, and often rely on auxiliary information for better performance. +Without introducing any additional information, we creatively propose a +transformer-based framework for compositional HOI learning. Human-object pair +representations and interaction representations are re-composed across +different HOI instances, which involves richer contextual information and +promotes the generalization of knowledge. Experiments show our simple but +effective method achieves state-of-the-art performance, especially on rare HOI +classes. + +
+
+
+
+
+ + ☆ Learned Point Cloud Compression for Classification SP 2023 + + +
+ Deep learning is increasingly being used to perform machine vision tasks such +as classification, object detection, and segmentation on 3D point cloud data. +However, deep learning inference is computationally expensive. The limited +computational capabilities of end devices thus necessitate a codec for +transmitting point cloud data over the network for server-side processing. Such +a codec must be lightweight and capable of achieving high compression ratios +without sacrificing accuracy. Motivated by this, we present a novel point cloud +codec that is highly specialized for the machine task of classification. Our +codec, based on PointNet, achieves a significantly better rate-accuracy +trade-off in comparison to alternative methods. In particular, it achieves a +94% reduction in BD-bitrate over non-specialized codecs on the ModelNet40 +dataset. For low-resource end devices, we also propose two lightweight +configurations of our encoder that achieve similar BD-bitrate reductions of 93% +and 92% with 3% and 5% drops in top-1 accuracy, while consuming only 0.470 and +0.048 encoder-side kMACs/point, respectively. Our codec demonstrates the +potential of specialized codecs for machine analysis of point clouds, and +provides a basis for extension to more complex tasks and datasets in the +future. + +
+
+ comment: 6 pages, 4 figures, IEEE MMSP 2023 +
+
+
+
+
+ + ☆ Uncertainty-Aware Cross-Modal Transfer Network for Sketch-Based 3D Shape + Retrieval + + +
+ In recent years, sketch-based 3D shape retrieval has attracted growing +attention. While many previous studies have focused on cross-modal matching +between hand-drawn sketches and 3D shapes, the critical issue of how to handle +low-quality and noisy samples in sketch data has been largely neglected. This +paper presents an uncertainty-aware cross-modal transfer network (UACTN) that +addresses this issue. UACTN decouples the representation learning of sketches +and 3D shapes into two separate tasks: classification-based sketch uncertainty +learning and 3D shape feature transfer. We first introduce an end-to-end +classification-based approach that simultaneously learns sketch features and +uncertainty, allowing uncertainty to prevent overfitting noisy sketches by +assigning different levels of importance to clean and noisy sketches. Then, 3D +shape features are mapped into the pre-learned sketch embedding space for +feature alignment. Extensive experiments and ablation studies on two benchmarks +demonstrate the superiority of our proposed method compared to state-of-the-art +methods. + +
+
+ comment: 6 pages, 7 figures; To be published in IEEE International Conference + on Multimedia and Expo 2023 +
+
+
+
+
+ + ☆ FoodSAM: Any Food Segmentation + + +
+ In this paper, we explore the zero-shot capability of the Segment Anything
+Model (SAM) for food image segmentation. To address the lack of class-specific
+information in SAM-generated masks, we propose a novel framework, called
+FoodSAM. This innovative approach integrates the coarse semantic mask with
+SAM-generated masks to enhance semantic segmentation quality. Besides, we
+recognize that the ingredients in food can be treated as independent
+individuals, which motivates us to perform instance segmentation on food
+images. Furthermore, FoodSAM extends its zero-shot capability to encompass
+panoptic segmentation by incorporating an object detector, which enables
+FoodSAM to effectively capture non-food object information. Drawing inspiration
+from the recent success of promptable segmentation, we also extend FoodSAM to
+promptable segmentation, supporting various prompt variants. Consequently,
+FoodSAM emerges as an all-encompassing solution capable of segmenting food
+items at multiple levels of granularity. Remarkably, this pioneering framework
+is the first work to achieve instance, panoptic, and promptable segmentation on
+food images. Extensive experiments demonstrate the feasibility and impressive
+performance of FoodSAM, validating SAM's potential as a prominent and
+influential tool within the domain of food image segmentation. We release our
+code at https://github.com/jamesjg/FoodSAM.
+
+
+ comment: Code is available at https://github.com/jamesjg/FoodSAM +
+
+
+
+
+ + ☆ Generalizing Event-Based Motion Deblurring in Real-World Scenarios ICCV 2023 + + +
+ Event-based motion deblurring has shown promising results by exploiting +low-latency events. However, current approaches are limited in their practical +usage, as they assume the same spatial resolution of inputs and specific +blurriness distributions. This work addresses these limitations and aims to +generalize the performance of event-based deblurring in real-world scenarios. +We propose a scale-aware network that allows flexible input spatial scales and +enables learning from different temporal scales of motion blur. A two-stage +self-supervised learning scheme is then developed to fit real-world data +distribution. By utilizing the relativity of blurriness, our approach +efficiently ensures the restored brightness and structure of latent images and +further generalizes deblurring performance to handle varying spatial and +temporal scales of motion blur in a self-distillation manner. Our method is +extensively evaluated, demonstrating remarkable performance, and we also +introduce a real-world dataset consisting of multi-scale blurry frames and +events to facilitate research in event-based deblurring. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ CaPhy: Capturing Physical Properties for Animatable Human Avatars + + +
+ We present CaPhy, a novel method for reconstructing animatable human avatars +with realistic dynamic properties for clothing. Specifically, we aim for +capturing the geometric and physical properties of the clothing from real +observations. This allows us to apply novel poses to the human avatar with +physically correct deformations and wrinkles of the clothing. To this end, we +combine unsupervised training with physics-based losses and 3D-supervised +training using scanned data to reconstruct a dynamic model of clothing that is +physically realistic and conforms to the human scans. We also optimize the +physical parameters of the underlying physical model from the scans by +introducing gradient constraints of the physics-based losses. In contrast to +previous work on 3D avatar reconstruction, our method is able to generalize to +novel poses with realistic dynamic cloth deformations. Experiments on several +subjects demonstrate that our method can estimate the physical properties of +the garments, resulting in superior quantitative and qualitative results +compared with previous methods. + +
+
+
+
+
+ + ☆ BATINet: Background-Aware Text to Image Synthesis and Manipulation + Network ICIP2023 + + +
+ Background-Induced Text2Image (BIT2I) aims to generate foreground content +according to the text on the given background image. Most studies focus on +generating high-quality foreground content, although they ignore the +relationship between the two contents. In this study, we analyzed a novel +Background-Aware Text2Image (BAT2I) task in which the generated content matches +the input background. We proposed a Background-Aware Text to Image synthesis +and manipulation Network (BATINet), which contains two key components: Position +Detect Network (PDN) and Harmonize Network (HN). The PDN detects the most +plausible position of the text-relevant object in the background image. The HN +harmonizes the generated content referring to background style information. +Finally, we reconstructed the generation network, which consists of the +multi-GAN and attention module to match more user preferences. Moreover, we can +apply BATINet to text-guided image manipulation. It solves the most challenging +task of manipulating the shape of an object. We demonstrated through +qualitative and quantitative evaluations on the CUB dataset that the proposed +model outperforms other state-of-the-art methods. + +
+
+ comment: Accepted to ICIP2023 +
+
+
+
+
+ + ☆ Semantics2Hands: Transferring Hand Motion Semantics between Avatars + + +
+ Human hands, the primary means of non-verbal communication, convey intricate +semantics in various scenarios. Due to the high sensitivity of individuals to +hand motions, even minor errors in hand motions can significantly impact the +user experience. Real applications often involve multiple avatars with varying +hand shapes, highlighting the importance of maintaining the intricate semantics +of hand motions across the avatars. Therefore, this paper aims to transfer the +hand motion semantics between diverse avatars based on their respective hand +models. To address this problem, we introduce a novel anatomy-based semantic +matrix (ASM) that encodes the semantics of hand motions. The ASM quantifies the +positions of the palm and other joints relative to the local frame of the +corresponding joint, enabling precise retargeting of hand motions. +Subsequently, we obtain a mapping function from the source ASM to the target +hand joint rotations by employing an anatomy-based semantics reconstruction +network (ASRN). We train the ASRN using a semi-supervised learning strategy on +the Mixamo and InterHand2.6M datasets. We evaluate our method in intra-domain +and cross-domain hand motion retargeting tasks. The qualitative and +quantitative results demonstrate the significant superiority of our ASRN over +the state-of-the-arts. + +
+
+ comment: Accepted to MM 2023, 9 pages, 10 figures. Project page: + https://abcyzj.github.io/S2H/ +
+
+
+
+
+ + ☆ Collaborative Tracking Learning for Frame-Rate-Insensitive Multi-Object + Tracking ICCV 2023 + + +
+ Multi-object tracking (MOT) at low frame rates can reduce computational, +storage and power overhead to better meet the constraints of edge devices. Many +existing MOT methods suffer from significant performance degradation in +low-frame-rate videos due to significant location and appearance changes +between adjacent frames. To this end, we propose to explore collaborative +tracking learning (ColTrack) for frame-rate-insensitive MOT in a query-based +end-to-end manner. Multiple historical queries of the same target jointly track +it with richer temporal descriptions. Meanwhile, we insert an information +refinement module between every two temporal blocking decoders to better fuse +temporal clues and refine features. Moreover, a tracking object consistency +loss is proposed to guide the interaction between historical queries. Extensive +experimental results demonstrate that in high-frame-rate videos, ColTrack +obtains higher performance than state-of-the-art methods on large-scale +datasets Dancetrack and BDD100K, and outperforms the existing end-to-end +methods on MOT17. More importantly, ColTrack has a significant advantage over +state-of-the-art methods in low-frame-rate videos, which allows it to obtain +faster processing speeds by reducing frame-rate requirements while maintaining +higher performance. Code will be released at +https://github.com/yolomax/ColTrack + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Semantic-embedded Similarity Prototype for Scene Recognition + + +
+ Due to the high inter-class similarity caused by the complex composition +within scenes and the co-existing objects across scenes, various studies have +explored object semantic knowledge within scenes to improve scene recognition. +However, a resulting issue arises as semantic segmentation or object detection +techniques demand heavy computational power, thereby burdening the network +considerably. This limitation often renders object-assisted approaches +incompatible with edge devices. In contrast, this paper proposes a +semantic-based similarity prototype that assists the scene recognition network +to achieve higher accuracy without increasing network parameters. It is simple +and can be plug-and-played into existing pipelines. More specifically, a +statistical strategy is introduced to depict semantic knowledge in scenes as +class-level semantic representations. These representations are utilized to +explore inter-class correlations, ultimately constructing a similarity +prototype. Furthermore, we propose two ways to use the similarity prototype to +support network training from the perspective of gradient label softening and +batch-level contrastive loss, respectively. Comprehensive evaluations on +multiple benchmarks show that our similarity prototype enhances the performance +of existing networks without adding any computational burden. Code and the +statistical similarity prototype will be available soon. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ RT-1: Robotics Transformer for Real-World Control at Scale + + +
+ By transferring knowledge from large, diverse, task-agnostic datasets, modern +machine learning models can solve specific downstream tasks either zero-shot or +with small task-specific datasets to a high level of performance. While this +capability has been demonstrated in other fields such as computer vision, +natural language processing or speech recognition, it remains to be shown in +robotics, where the generalization capabilities of the models are particularly +critical due to the difficulty of collecting real-world robotic data. We argue +that one of the keys to the success of such general robotic models lies with +open-ended task-agnostic training, combined with high-capacity architectures +that can absorb all of the diverse, robotic data. In this paper, we present a +model class, dubbed Robotics Transformer, that exhibits promising scalable +model properties. We verify our conclusions in a study of different model +classes and their ability to generalize as a function of the data size, model +size, and data diversity based on a large-scale data collection on real robots +performing real-world tasks. The project's website and videos can be found at +robotics-transformer1.github.io + +
+
+ comment: See website at robotics-transformer1.github.io +
+
+
+
+
+ + ♻ ☆ F?D: On understanding the role of deep feature spaces on face generation + evaluation + + +
+ Perceptual metrics, like the Fr\'echet Inception Distance (FID), are widely +used to assess the similarity between synthetically generated and ground truth +(real) images. The key idea behind these metrics is to compute errors in a deep +feature space that captures perceptually and semantically rich image features. +Despite their popularity, the effect that different deep features and their +design choices have on a perceptual metric has not been well studied. In this +work, we perform a causal analysis linking differences in semantic attributes +and distortions between face image distributions to Fr\'echet distances (FD) +using several popular deep feature spaces. A key component of our analysis is +the creation of synthetic counterfactual faces using deep face generators. Our +experiments show that the FD is heavily influenced by its feature space's +training dataset and objective function. For example, FD using features +extracted from ImageNet-trained models heavily emphasize hats over regions like +the eyes and mouth. Moreover, FD using features from a face gender classifier +emphasize hair length more than distances in an identity (recognition) feature +space. Finally, we evaluate several popular face generation models across +feature spaces and find that StyleGAN2 consistently ranks higher than other +face generators, except with respect to identity (recognition) features. This +suggests the need for considering multiple feature spaces when evaluating +generative models and using feature spaces that are tuned to nuances of the +domain of interest. + +
+
+ comment: Code and dataset to be released soon +
+
+
+
+
+ + ♻ ☆ Preventing Zero-Shot Transfer Degradation in Continual Learning of + Vision-Language Models ICCV 2023 + + +
+ Continual learning (CL) can help pre-trained vision-language models
+efficiently adapt to new or under-trained data distributions without
+re-training. Nevertheless, during the continual training of the Contrastive
+Language-Image Pre-training (CLIP) model, we observe that the model's zero-shot
+transfer ability significantly degrades due to catastrophic forgetting.
+Existing CL methods can mitigate forgetting by replaying previous data.
+However, since the CLIP dataset is private, replay methods cannot access the
+pre-training dataset. In addition, replaying data of previously learned
+downstream tasks can enhance their performance but comes at the cost of
+sacrificing zero-shot performance. To address this challenge, we propose a
+novel method, ZSCL, to prevent zero-shot transfer degradation in the continual
+learning of vision-language models in both feature and parameter space. In the
+feature space, a reference dataset is introduced for distillation between the
+current and initial models. The reference dataset should have semantic
+diversity, but it does not need to be labeled, seen during pre-training, or
+consist of matched image-text pairs. In parameter space, we prevent a large
+parameter shift by averaging weights during training. We also propose a more
+challenging Multi-domain Task Incremental Learning (MTIL) benchmark to evaluate
+different methods, where tasks are drawn from various domains instead of being
+class-separated within a single dataset. Our method outperforms other methods
+in the traditional class-incremental learning setting and on MTIL by a 9.7%
+average score. Our code is available at https://github.com/Thunderbeee/ZSCL.
+
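+ A minimal sketch of the parameter-space idea (weight averaging during
+training) is shown below; the running-mean schedule and the toy module are
+assumptions for illustration, and the feature-space distillation on the
+reference dataset is omitted.
+
+```python
+# Sketch: keep a running average of the weights visited during continual
+# fine-tuning so the final model stays close to its starting point.
+import copy
+import torch
+import torch.nn as nn
+
+model = nn.Linear(32, 8)                      # stand-in for the fine-tuned towers
+avg_model = copy.deepcopy(model)
+n_averaged = 1                                # avg_model currently holds the initial weights
+
+@torch.no_grad()
+def update_weight_average(avg_model: nn.Module, model: nn.Module, n_averaged: int) -> None:
+    """Running mean over iterations: avg <- (avg * n + current) / (n + 1)."""
+    for p_avg, p in zip(avg_model.parameters(), model.parameters()):
+        p_avg.data.mul_(n_averaged / (n_averaged + 1)).add_(p.data / (n_averaged + 1))
+
+# Inside the training loop, after each optimizer step:
+update_weight_average(avg_model, model, n_averaged)
+n_averaged += 1
+```
+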
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ MAMAF-Net: Motion-Aware and Multi-Attention Fusion Network for Stroke + Diagnosis + + +
+ Stroke is a major cause of mortality and disability worldwide, and one +in four people are at risk of suffering a stroke in their lifetime. The pre-hospital +stroke assessment plays a vital role in identifying stroke patients accurately +to accelerate further examination and treatment in hospitals. Accordingly, the +National Institutes of Health Stroke Scale (NIHSS), Cincinnati Pre-hospital +Stroke Scale (CPSS) and Face Arm Speed Time (F.A.S.T.) are globally known tests +for stroke assessment. However, the validity of these tests is questionable in the +absence of neurologists, and access to healthcare may be limited. Therefore, in +this study, we propose a motion-aware and multi-attention fusion network +(MAMAF-Net) that can detect stroke from multimodal examination videos. Contrary +to other studies on stroke detection from video analysis, our study for the +first time proposes an end-to-end solution from multiple video recordings of +each subject with a dataset encapsulating stroke, transient ischemic attack +(TIA), and healthy controls. The proposed MAMAF-Net consists of motion-aware +modules to sense the mobility of patients, attention modules to fuse the +multi-input video data, and 3D convolutional layers to perform diagnosis from +the attention-based extracted features. Experimental results over the collected +Stroke-data dataset show that the proposed MAMAF-Net achieves a successful +detection of stroke with 93.62% sensitivity and 95.33% AUC score. + +
+
+
+
+
+ + ♻ ☆ Larger is not Better: A Survey on the Robustness of Computer Vision + Models against Common Corruptions + + +
+ The performance of computer vision models is susceptible to unexpected +changes in input images, known as common corruptions (e.g. noise, blur, +illumination changes, etc.), that can hinder their reliability when deployed in +real scenarios. These corruptions are not always considered to test model +generalization and robustness. In this survey, we present a comprehensive +overview of methods that improve the robustness of computer vision models +against common corruptions. We categorize methods into four groups based on the +model part and training method addressed: data augmentation, representation +learning, knowledge distillation, and network components. We also cover +indirect methods for generalization and mitigation of shortcut learning, +potentially useful for corruption robustness. We release a unified benchmark +framework to compare robustness performance on several datasets, and address +the inconsistencies of evaluation in the literature. We provide an experimental +overview of the base corruption robustness of popular vision backbones, and +show that corruption robustness does not necessarily scale with model size. +Very large models (above 100M parameters) gain negligible robustness +considering their increased computational requirements. To achieve generalizable +and robust computer vision models, we foresee the need to develop new +learning strategies that efficiently exploit limited data and mitigate unwanted +or unreliable learning behaviors. + +
+
+
+
+
+ + ♻ ☆ Polarization Multi-Image Synthesis with Birefringent Metasurfaces + + +
+ Optical metasurfaces composed of precisely engineered nanostructures have +gained significant attention for their ability to manipulate light and +implement distinct functionalities based on the properties of the incident +field. Computational imaging systems have started harnessing this capability to +produce sets of coded measurements that benefit certain tasks when paired with +digital post-processing. Inspired by these works, we introduce a new system +that uses a birefringent metasurface with a polarizer-mosaicked photosensor to +capture four optically-coded measurements in a single exposure. We apply this +system to the task of incoherent opto-electronic filtering, where digital +spatial-filtering operations are replaced by simpler, per-pixel sums across the +four polarization channels, independent of the spatial filter size. In contrast +to previous work on incoherent opto-electronic filtering that can realize only +one spatial filter, our approach can realize a continuous family of filters +from a single capture, with filters being selected from the family by adjusting +the post-capture digital summation weights. To find a metasurface that can +realize a set of user-specified spatial filters, we introduce a form of +gradient descent with a novel regularizer that encourages light efficiency and +a high signal-to-noise ratio. We demonstrate several examples in simulation and +with fabricated prototypes, including some with spatial filters that have +prescribed variations with respect to depth and wavelength. + Visit the Project Page at +https://deanhazineh.github.io/publications/Multi_Image_Synthesis/MIS_Home.html + +
+
+ comment: Published in the Proceedings of the 2023 IEEE International + Conference of Computational Photography +
+
+
+
+
+ + ♻ ☆ DiT: Efficient Vision Transformers with Dynamic Token Routing + + +
+ In many dense networks, image tokens share the same static data flow. +However, challenges arise from the variance among the objects in +images, such as large variations in the spatial scale and difficulties of +recognition for visual entities. In this paper, we propose a data-dependent +token routing strategy to elaborate the routing paths of image tokens for +Dynamic Vision Transformer, dubbed DiT. The proposed framework generates a +data-dependent path per token, adapting to the object scales and visual +discrimination of tokens. In feed-forward, the differentiable routing gates are +designed to select the scaling paths and feature transformation paths for image +tokens, leading to multi-path feature propagation. In this way, the impact of +object scales and visual discrimination of image representation can be +carefully tuned. Moreover, the computational cost can be further reduced by +giving budget constraints to the routing gate and early-stopping of feature +extraction. In experiments, our DiT achieves superior performance and more favorable +complexity/accuracy trade-offs than many SoTA methods on ImageNet +classification, object detection, instance segmentation, and semantic +segmentation. Particularly, the DiT-B5 obtains 84.8\% top-1 Acc on ImageNet +with 10.3 GFLOPs, which is 1.0\% higher than that of the SoTA method with +similar computational complexity. These extensive results demonstrate that DiT +can serve as a versatile backbone for various vision tasks. + +
+
+
+
+
+ + ♻ ☆ Heatmap-based Out-of-Distribution Detection WACV 2023 + + +
+ Our work investigates out-of-distribution (OOD) detection as a neural network +output explanation problem. We learn a heatmap representation for detecting OOD +images while visualizing in- and out-of-distribution image regions at the same +time. Given a trained and fixed classifier, we train a decoder neural network +to produce heatmaps with zero response for in-distribution samples and high +response heatmaps for OOD samples, based on the classifier features and the +class prediction. Our main innovation lies in the heatmap definition for an OOD +sample, as the normalized difference from the closest in-distribution sample. +The heatmap serves as a margin to distinguish between in- and +out-of-distribution samples. Our approach generates the heatmaps not only for +OOD detection, but also to indicate in- and out-of-distribution regions of the +input image. In our evaluations, our approach mostly outperforms the prior work +on fixed classifiers, trained on CIFAR-10, CIFAR-100 and Tiny ImageNet. The +code is publicly available at: https://github.com/jhornauer/heatmap_ood. + +
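+ A rough sketch of the heatmap target described above, i.e. the (normalized) difference between an OOD sample and its closest in-distribution sample in feature space; tensor shapes, the distance used to pick the closest sample, and the per-pixel normalization below are assumptions for illustration rather than the paper's exact definition:
+
+     import torch
+
+     def ood_target_heatmap(feat_ood, id_feature_bank):
+         # feat_ood:        (C, H, W) feature map of one OOD image
+         # id_feature_bank: (N, C, H, W) feature maps of in-distribution images
+         flat_ood = feat_ood.flatten()
+         flat_bank = id_feature_bank.flatten(start_dim=1)
+         nearest = flat_bank[torch.cdist(flat_ood[None], flat_bank).argmin()]
+         diff = (feat_ood - nearest.view_as(feat_ood)).abs().mean(dim=0)   # (H, W)
+         # decoder target in [0, 1]; zero-valued targets are used for in-distribution samples
+         return diff / (diff.max() + 1e-8)
+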
+
+ comment: Accepted to WACV 2023 +
+
+
+
+
+ + ♻ ☆ Towards Defending Multiple $\ell_p$-norm Bounded Adversarial + Perturbations via Gated Batch Normalization + + +
+ There has been extensive evidence demonstrating that deep neural networks are +vulnerable to adversarial examples, which motivates the development of defenses +against adversarial attacks. Existing adversarial defenses typically improve +model robustness against individual specific perturbation types (\eg, +$\ell_{\infty}$-norm bounded adversarial examples). However, adversaries are +likely to generate multiple types of perturbations in practice (\eg, $\ell_1$, +$\ell_2$, and $\ell_{\infty}$ perturbations). Some recent methods improve model +robustness against adversarial attacks in multiple $\ell_p$ balls, but their +performance against each perturbation type is still far from satisfactory. In +this paper, we observe that different $\ell_p$ bounded adversarial +perturbations induce different statistical properties that can be separated and +characterized by the statistics of Batch Normalization (BN). We thus propose +Gated Batch Normalization (GBN) to adversarially train a perturbation-invariant +predictor for defending against multiple $\ell_p$ bounded adversarial perturbations. +GBN consists of a multi-branch BN layer and a gated sub-network. Each BN branch +in GBN is in charge of one perturbation type to ensure that the normalized +output is aligned towards learning a perturbation-invariant representation. +Meanwhile, the gated sub-network is designed to separate inputs added with +different perturbation types. We perform an extensive evaluation of our +approach on commonly used datasets including MNIST, CIFAR-10, and Tiny-ImageNet, +and demonstrate that GBN outperforms previous defense proposals against +multiple perturbation types (\ie, $\ell_1$, $\ell_2$, and $\ell_{\infty}$ +perturbations) by large margins. + +
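+ A simplified sketch of the multi-branch idea (one BN branch per perturbation type plus a small gate that routes inputs); the real GBN layer, its gating sub-network, and the adversarial training procedure are specified in the paper, so the module below is only an illustrative approximation:
+
+     import torch
+     import torch.nn as nn
+
+     class GatedBatchNorm2d(nn.Module):
+         def __init__(self, num_features, num_branches=3):
+             super().__init__()
+             # one BN branch per perturbation type (e.g. l1, l2, l_inf)
+             self.branches = nn.ModuleList(
+                 [nn.BatchNorm2d(num_features) for _ in range(num_branches)]
+             )
+             # tiny gate that predicts which branch an input belongs to
+             self.gate = nn.Sequential(
+                 nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+                 nn.Linear(num_features, num_branches),
+             )
+
+         def forward(self, x):                                        # x: (B, C, H, W)
+             weights = torch.softmax(self.gate(x), dim=1)             # (B, K)
+             outs = torch.stack([bn(x) for bn in self.branches], dim=1)   # (B, K, C, H, W)
+             return (weights[:, :, None, None, None] * outs).sum(dim=1)
+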
+
+ comment: Accepted on IJCV +
+
+
+
+
+ + ♻ ☆ Can Self-Supervised Representation Learning Methods Withstand + Distribution Shifts and Corruptions? ICCV + + +
+ Self-supervised learning in computer vision aims to leverage the inherent +structure and relationships within data to learn meaningful representations +without explicit human annotation, enabling a holistic understanding of visual +scenes. Robustness in vision machine learning ensures reliable and consistent +performance, enhancing generalization, adaptability, and resistance to noise, +variations, and adversarial attacks. Self-supervised paradigms, namely +contrastive learning, knowledge distillation, mutual information maximization, +and clustering, have been considered to have shown advances in invariant +learning representations. This work investigates the robustness of learned +representations of self-supervised learning approaches focusing on distribution +shifts and image corruptions in computer vision. Detailed experiments have been +conducted to study the robustness of self-supervised learning methods on +distribution shifts and image corruptions. The empirical analysis demonstrates +a clear relationship between the performance of learned representations within +self-supervised paradigms and the severity of distribution shifts and +corruptions. Notably, higher levels of shifts and corruptions are found to +significantly diminish the robustness of the learned representations. These +findings highlight the critical impact of distribution shifts and image +corruptions on the performance and resilience of self-supervised learning +methods, emphasizing the need for effective strategies to mitigate their +adverse effects. The study strongly advocates for future research in the field +of self-supervised representation learning to prioritize the key aspects of +safety and robustness in order to ensure practical applicability. The source +code and results are available on GitHub. + +
+
+ comment: Accepted at 2023 IEEE/CVF International Conference on Computer Vision + Workshops (ICCVW). Corresponding author - prakash.chandra.chhipa@ltu.se +
+
+
+
+
+ + ♻ ☆ Improving the Transferability of Adversarial Examples via Direction + Tuning + + +
+ In transfer-based adversarial attacks, adversarial examples are generated +only with surrogate models and are expected to produce effective perturbations against the +victim models. Although considerable effort has been devoted to improving +the transferability of adversarial examples generated by transfer-based +adversarial attacks, our investigation found that the large deviation between +the actual and steepest update directions of current transfer-based +adversarial attacks is caused by the large update step length, so the +generated adversarial examples cannot converge well. However, directly +reducing the update step length leads to serious update oscillation, so the +generated adversarial examples likewise cannot achieve great transferability +to the victim models. To address these issues, a novel transfer-based attack, +namely the direction tuning attack, is proposed to not only decrease the update +deviation at large step lengths, but also mitigate the update oscillation at +small sampling step lengths, thereby making the generated adversarial +examples converge well and achieve great transferability to victim models. In +addition, a network pruning method is proposed to smooth the decision boundary, +thereby further decreasing the update oscillation and enhancing the +transferability of the generated adversarial examples. The experimental results +on ImageNet demonstrate that the average attack success rate (ASR) of the +adversarial examples generated by our method can be improved from 87.9\% to +94.5\% on five victim models without defenses, and from 69.1\% to 76.2\% on +eight advanced defense methods, in comparison with the latest +gradient-based attacks. + +
+
+ comment: Accepted by INS 2023 +
+
+
+
+
+ + ♻ ☆ On the Design Fundamentals of Diffusion Models: A Survey + + +
+ Diffusion models are generative models, which gradually add and remove noise +to learn the underlying distribution of training data for data generation. The +components of diffusion models have gained significant attention with many +design choices proposed. Existing reviews have primarily focused on +higher-level solutions, thereby covering less on the design fundamentals of +components. This study seeks to address this gap by providing a comprehensive +and coherent review on component-wise design choices in diffusion models. +Specifically, we organize this review according to their three key components, +namely the forward process, the reverse process, and the sampling procedure. +This allows us to provide a fine-grained perspective of diffusion models, +benefiting future studies in the analysis of individual components, the +applicability of design choices, and the implementation of diffusion models. + +
+
+
+
+
+ + ♻ ☆ Structured 2D Representation of 3D Data for Shape Processing + + +
+ We represent 3D shape by structured 2D representations of fixed length making +it feasible to apply well investigated 2D convolutional neural networks (CNN) +for both discriminative and geometric tasks on 3D shapes. We first provide a +general introduction to such structured descriptors, analyze their different +forms and show how a simple 2D CNN can be used to achieve good classification +result. With a specialized classification network for images and our structured +representation, we achieve the classification accuracy of 99.7\% in the +ModelNet40 test set - improving the previous state-of-the-art by a large +margin. We finally provide a novel framework for performing the geometric task +of 3D segmentation using 2D CNNs and the structured representation - concluding +the utility of such descriptors for both discriminative and geometric tasks. + +
+
+ comment: Results of some of the experiments were incorrect +
+
+
+
+
+ + ♻ ☆ UNAEN: Unsupervised Abnormality Extraction Network for MRI Motion + Artifact Reduction + + +
+ Motion artifacts compromise the quality of magnetic resonance imaging (MRI) +and pose challenges to achieving diagnostic outcomes and image-guided +therapies. In recent years, supervised deep learning approaches have emerged as +successful solutions for motion artifact reduction (MAR). One disadvantage of +these methods is their dependency on acquiring paired sets of motion +artifact-corrupted (MA-corrupted) and motion artifact-free (MA-free) MR images +for training purposes. Obtaining such image pairs is difficult and therefore +limits the application of supervised training. In this paper, we propose a +novel UNsupervised Abnormality Extraction Network (UNAEN) to alleviate this +problem. Our network is capable of working with unpaired MA-corrupted and +MA-free images. It converts the MA-corrupted images to MA-reduced images by +extracting abnormalities from the MA-corrupted images using a proposed artifact +extractor, which intercepts the residual artifact maps from the MA-corrupted MR +images explicitly, and a reconstructor to restore the original input from the +MA-reduced images. The performance of UNAEN was assessed by experimenting on +various publicly available MRI datasets and comparing them with +state-of-the-art methods. The quantitative evaluation demonstrates the +superiority of UNAEN over alternative MAR methods and visually exhibits fewer +residual artifacts. Our results substantiate the potential of UNAEN as a +promising solution applicable in real-world clinical environments, with the +capability to enhance diagnostic accuracy and facilitate image-guided +therapies. + +
+
+
+
+
+ + ♻ ☆ Towards Unified Text-based Person Retrieval: A Large-scale + Multi-Attribute and Language Search Benchmark + + +
+ In this paper, we introduce a large Multi-Attribute and Language Search +dataset for text-based person retrieval, called MALS, and explore the +feasibility of performing pre-training on both the attribute recognition and +image-text matching tasks at once. In particular, MALS contains 1,510,330 +image-text pairs, which is about 37.5 times larger than the prevailing CUHK-PEDES, +and all images are annotated with 27 attributes. Considering the privacy +concerns and annotation costs, we leverage off-the-shelf diffusion models +to generate the dataset. To verify the feasibility of learning from the +generated data, we develop a new joint Attribute Prompt Learning and Text +Matching Learning (APTM) framework, considering the shared knowledge between +attributes and text. As the name implies, APTM contains an attribute prompt +learning stream and a text matching learning stream. (1) The attribute prompt +learning leverages the attribute prompts for image-attribute alignment, which +enhances the text matching learning. (2) The text matching learning facilitates +the representation learning on fine-grained details, and in turn, boosts the +attribute prompt learning. Extensive experiments validate the effectiveness of +the pre-training on MALS, achieving state-of-the-art retrieval performance via +APTM on three challenging real-world benchmarks. In particular, APTM achieves a +consistent improvement of +6.96%, +7.68%, and +16.95% Recall@1 accuracy on the +CUHK-PEDES, ICFG-PEDES, and RSTPReid datasets, respectively, by a clear margin. + +
+
+
+
+
+ + ♻ ☆ HRFuser: A Multi-resolution Sensor Fusion Architecture for 2D Object + Detection SC + + +
+ Besides standard cameras, autonomous vehicles typically include multiple +additional sensors, such as lidars and radars, which help acquire richer +information for perceiving the content of the driving scene. While several +recent works focus on fusing certain pairs of sensors - such as camera with +lidar or radar - by using architectural components specific to the examined +setting, a generic and modular sensor fusion architecture is missing from the +literature. In this work, we propose HRFuser, a modular architecture for +multi-modal 2D object detection. It fuses multiple sensors in a +multi-resolution fashion and scales to an arbitrary number of input modalities. +The design of HRFuser is based on state-of-the-art high-resolution networks for +image-only dense prediction and incorporates a novel multi-window +cross-attention block as the means to perform fusion of multiple modalities at +multiple resolutions. We demonstrate via extensive experiments on nuScenes and +the adverse conditions DENSE datasets that our model effectively leverages +complementary features from additional modalities, substantially improving upon +camera-only performance and consistently outperforming state-of-the-art 3D and +2D fusion methods evaluated on 2D object detection metrics. The source code is +publicly available. + +
+
+ comment: IEEE International Conference on Intelligent Transportation Systems + (ITSC) 2023 +
+
+
+
+
+ + ♻ ☆ An Integral Projection-based Semantic Autoencoder for Zero-Shot Learning + + +
+ Zero-shot Learning (ZSL) classification categorizes or predicts classes +(labels) that are not included in the training set (unseen classes). Recent +works proposed different semantic autoencoder (SAE) models where the encoder +embeds a visual feature vector space into the semantic space and the decoder +reconstructs the original visual feature space. The objective is to learn the +embedding by leveraging a source data distribution, which can be applied +effectively to a different but related target data distribution. Such +embedding-based methods are prone to domain shift problems and are vulnerable +to biases. We propose an integral projection-based semantic autoencoder +(IP-SAE) where an encoder projects a visual feature space concatenated with the +semantic space into a latent representation space. We force the decoder to +reconstruct the visual-semantic data space. Due to this constraint, the +visual-semantic projection function preserves the discriminatory data included +inside the original visual feature space. The enriched projection forces a more +precise reconstitution of the visual feature space invariant to the domain +manifold. Consequently, the learned projection function is less domain-specific +and alleviates the domain shift problem. Our proposed IP-SAE model consolidates +a symmetric transformation function for embedding and projection, and thus, it +provides transparency for interpreting generative applications in ZSL. +Therefore, in addition to outperforming state-of-the-art methods considering +four benchmark datasets, our analytical approach allows us to investigate +distinct characteristics of generative-based methods in the unique context of +zero-shot inference. + +
+
+
+
+
+ + ♻ ☆ Human-to-Human Interaction Detection + + +
+ A comprehensive understanding of human-to-human interactions of interest in +video streams, such as queuing, handshaking, fighting and chasing, is of +immense importance to the surveillance of public security in regions like +campuses, squares and parks. Different from conventional human interaction +recognition, which uses choreographed videos as inputs, neglects concurrent +interactive groups, and performs detection and recognition in separate stages, +we introduce a new task named human-to-human interaction detection (HID). HID +is devoted to detecting subjects, recognizing person-wise actions, and grouping +people according to their interactive relations, in one model. First, based on +the popular AVA dataset created for action detection, we establish a new HID +benchmark, termed AVA-Interaction (AVA-I), by adding annotations on interactive +relations in a frame-by-frame manner. AVA-I consists of 85,254 frames and +86,338 interactive groups, and each image includes up to 4 concurrent +interactive groups. Second, we present a novel baseline approach SaMFormer for +HID, containing a visual feature extractor, a split stage which leverages a +Transformer-based model to decode action instances and interactive groups, and +a merging stage which reconstructs the relationship between instances and +groups. All SaMFormer components are jointly trained in an end-to-end manner. +Extensive experiments on AVA-I validate the superiority of SaMFormer over +representative methods. The dataset and code will be made public to encourage +more follow-up studies. + +
+
+
+
+
+ + ♻ ☆ DQS3D: Densely-matched Quantization-aware Semi-supervised 3D Detection ICCV 2023 + + +
+ In this paper, we study the problem of semi-supervised 3D object detection, +which is of great importance considering the high annotation cost for cluttered +3D indoor scenes. We resort to the robust and principled framework of +self-teaching, which has triggered notable progress for semi-supervised learning +recently. While this paradigm is natural for image-level or pixel-level +prediction, adapting it to the detection problem is challenged by the issue of +proposal matching. Prior methods are based upon two-stage pipelines, matching +heuristically selected proposals generated in the first stage and resulting in +spatially sparse training signals. In contrast, we propose the first +semi-supervised 3D detection algorithm that works in the single-stage manner and +allows spatially dense training signals. A fundamental issue of this new design +is the quantization error caused by point-to-voxel discretization, which +inevitably leads to misalignment between two transformed views in the voxel +domain. To this end, we derive and implement closed-form rules that compensate +this misalignment on-the-fly. Our results are significant, e.g., promoting +ScanNet mAP@0.5 from 35.2% to 48.5% using 20% annotation. Codes and data will +be publicly available. + +
+
+ comment: Accepted to ICCV 2023. Code: https://github.com/AIR-DISCOVER/DQS3D +
+
+
+
+
+ + ♻ ☆ DiffuMask: Synthesizing Images with Pixel-level Annotations for Semantic + Segmentation Using Diffusion Models + + +
+ Collecting and annotating images with pixel-wise labels is time-consuming and +laborious. In contrast, synthetic data can be freely obtained using a +generative model (e.g., DALL-E, Stable Diffusion). In this paper, we show that +it is possible to automatically obtain accurate semantic masks of synthetic +images generated by the off-the-shelf Stable Diffusion model, which uses only +text-image pairs during training. Our approach, called DiffuMask, exploits the +potential of the cross-attention map between text and image, which makes it natural +and seamless to extend text-driven image synthesis to semantic mask +generation. DiffuMask uses text-guided cross-attention information to localize +class/word-specific regions, which are combined with practical techniques to +create a novel high-resolution and class-discriminative pixel-wise mask. The +method substantially reduces data collection and annotation costs. +Experiments demonstrate that existing segmentation methods trained on the +synthetic data of DiffuMask can achieve performance competitive with their +counterparts trained on real data (VOC 2012, Cityscapes). For some classes (e.g., bird), +DiffuMask presents promising performance, close to the state-of-the-art result +obtained with real data (within a 3% mIoU gap). Moreover, in the open-vocabulary +segmentation (zero-shot) setting, DiffuMask achieves a new SOTA result on the +Unseen class of VOC 2012. The project website can be found at +https://weijiawu.github.io/DiffusionMask/. + +
+
+
+
+
+ + ♻ ☆ Joint Multi-view Unsupervised Feature Selection and Graph Learning + + +
+ Despite significant progress, previous multi-view unsupervised feature +selection methods mostly suffer from two limitations. First, they generally +utilize either cluster structure or similarity structure to guide the feature +selection, which neglect the possibility of a joint formulation with mutual +benefits. Second, they often learn the similarity structure by either global +structure learning or local structure learning, which lack the capability of +graph learning with both global and local structural awareness. In light of +this, this paper presents a joint multi-view unsupervised feature selection and +graph learning (JMVFG) approach. Particularly, we formulate the multi-view +feature selection with orthogonal decomposition, where each target matrix is +decomposed into a view-specific basis matrix and a view-consistent cluster +indicator. The cross-space locality preservation is incorporated to bridge the +cluster structure learning in the projected space and the similarity learning +(i.e., graph learning) in the original space. Further, a unified objective +function is presented to enable the simultaneous learning of the cluster +structure, the global and local similarity structures, and the multi-view +consistency and inconsistency, upon which an alternating optimization algorithm +is developed with theoretically proved convergence. Extensive experiments on a +variety of real-world multi-view datasets demonstrate the superiority of our +approach for both the multi-view feature selection and graph learning tasks. +The code is available at https://github.com/huangdonghere/JMVFG. + +
+
+ comment: To appear in IEEE Transactions on Emerging Topics in Computational + Intelligence +
+
+
+
+
+ + ♻ ☆ Con$^{2}$DA: Simplifying Semi-supervised Domain Adaptation by Learning + Consistent and Contrastive Feature Representations NeurIPS 2021 + + +
+ In this work, we present Con$^{2}$DA, a simple framework that extends recent +advances in semi-supervised learning to the semi-supervised domain adaptation +(SSDA) problem. Our framework generates pairs of associated samples by +performing stochastic data transformations to a given input. Associated data +pairs are mapped to a feature representation space using a feature extractor. +We use different loss functions to enforce consistency between the feature +representations of associated data pairs of samples. We show that these learned +representations are useful to deal with differences in data distributions in +the domain adaptation problem. We performed experiments to study the main +components of our model and we show that (i) learning of the consistent and +contrastive feature representations is crucial to extract good discriminative +features across different domains, and ii) our model benefits from the use of +strong augmentation policies. With these findings, our method achieves +state-of-the-art performances in three benchmark datasets for SSDA. + +
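+ A minimal sketch of the consistency idea above, two stochastic views of the same input mapped to similar features; the concrete augmentation policies and the exact consistency/contrastive losses used by Con$^{2}$DA are detailed in the paper, and encoder, weak_aug, and strong_aug below are placeholders:
+
+     import torch.nn.functional as F
+
+     def consistency_loss(encoder, x, weak_aug, strong_aug):
+         z1 = encoder(weak_aug(x))       # features of one stochastic view of x
+         z2 = encoder(strong_aug(x))     # features of another view of the same x
+         # penalize disagreement between the two views (cosine distance here)
+         return 1.0 - F.cosine_similarity(z1, z2, dim=-1).mean()
+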
+
+ comment: Accepted to NeurIPS 2021 Workshop on Distribution Shifts: Connecting + Methods and Applications +
+
+
+
+
+ + ♻ ☆ Adversarial Self-Attack Defense and Spatial-Temporal Relation Mining for + Visible-Infrared Video Person Re-Identification + + +
+ In visible-infrared video person re-identification (re-ID), extracting +features that are not affected by changes in complex scenes (such as modality, camera view, +pedestrian pose, background, etc.), and mining and utilizing motion +information, are the keys to solving cross-modal pedestrian identity matching. +To this end, the paper proposes a new visible-infrared video person re-ID +method from a novel perspective, i.e., adversarial self-attack defense and +spatial-temporal relation mining. In this work, the changes in view, posture, and +background, together with the modal discrepancy, are considered the main factors that cause +the perturbations of person identity features. Such interference information +contained in the training samples is used as an adversarial perturbation. It +performs adversarial attacks on the re-ID model during training to make the +model more robust to these unfavorable factors. The attack from the adversarial +perturbation is introduced by activating the interference information contained +in the input samples without generating adversarial samples, and it can thus +be called an adversarial self-attack. This design allows adversarial attack and +defense to be integrated into one framework. This paper further proposes a +spatial-temporal information-guided feature representation network to use the +information in video sequences. The network can not only extract the information +contained in the video-frame sequences but also use the relations among local +information in space to guide the network to extract more robust features. The +proposed method exhibits compelling performance on large-scale cross-modality +video datasets. The source code of the proposed method will be released at +https://github.com/lhf12278/xxx. + +
+
+ comment: 11 pages,8 figures +
+
+
+
+
+ + ♻ ☆ ECLAD: Extracting Concepts with Local Aggregated Descriptors + + +
+ Convolutional neural networks (CNNs) are increasingly being used in critical +systems, where robustness and alignment are crucial. In this context, the field +of explainable artificial intelligence has proposed the generation of +high-level explanations of the prediction process of CNNs through concept +extraction. While these methods can detect whether or not a concept is present +in an image, they are unable to determine its location. What is more, a fair +comparison of such approaches is difficult due to a lack of proper validation +procedures. To address these issues, we propose a novel method for automatic +concept extraction and localization based on representations obtained through +pixel-wise aggregations of CNN activation maps. Further, we introduce a process +for the validation of concept-extraction techniques based on synthetic datasets +with pixel-wise annotations of their main components, reducing the need for +human intervention. Extensive experimentation on both synthetic and real-world +datasets demonstrates that our method outperforms state-of-the-art +alternatives. + +
+
+ comment: 34 pages, under review +
+
+
+
+
+ + ♻ ☆ Robust Lane Detection through Self Pre-training with Masked Sequential + Autoencoders and Fine-tuning with Customized PolyLoss + + +
+ Lane detection is crucial for vehicle localization, which makes it the +foundation for automated driving and many intelligent and advanced driving +assistant systems. Available vision-based lane detection methods do not make +full use of the valuable features and aggregate contextual information, +especially the interrelationships between lane lines and other regions of the +images in continuous frames. To fill this research gap and upgrade lane +detection performance, this paper proposes a pipeline consisting of self +pre-training with masked sequential autoencoders and fine-tuning with +customized PolyLoss for end-to-end neural network models using +multi-continuous image frames. The masked sequential autoencoders are adopted +to pre-train the neural network models with the objective of reconstructing the +missing pixels from a randomly masked image. Then, in the fine-tuning +segmentation phase where lane detection segmentation is performed, the +continuous image frames serve as the inputs, and the pre-trained model +weights are transferred and further updated using the backpropagation mechanism +with customized PolyLoss calculating the weighted errors between the output +lane detection results and the labeled ground truth. Extensive experimental +results demonstrate that, with the proposed pipeline, the lane detection model +performance on both normal and challenging scenes can be advanced beyond the +state-of-the-art, delivering the best testing accuracy (98.38%), precision +(0.937), and F1-measure (0.924) on the normal scene testing set, together with +the best overall accuracy (98.36%) and precision (0.844) on the challenging +scene test set, while the training time can be substantially shortened. + +
+
+ comment: 12 pages, 8 figures, accepted by journal of IEEE Transactions on + Intelligent Transportation Systems +
+
+
+
+
+ + ♻ ☆ VAST: Vivify Your Talking Avatar via Zero-Shot Expressive Facial Style + Transfer ICCV2023 + + +
+ Current talking face generation methods mainly focus on speech-lip +synchronization. However, insufficient investigation on the facial talking +style leads to a lifeless and monotonous avatar. Most previous works fail to +imitate expressive styles from arbitrary video prompts and ensure the +authenticity of the generated video. This paper proposes an unsupervised +variational style transfer model (VAST) to vivify the neutral photo-realistic +avatars. Our model consists of three key components: a style encoder that +extracts facial style representations from the given video prompts; a hybrid +facial expression decoder to model accurate speech-related movements; a +variational style enhancer that enhances the style space to be highly +expressive and meaningful. With our essential designs on facial style learning, +our model is able to flexibly capture the expressive facial style from +arbitrary video prompts and transfer it onto a personalized image renderer in a +zero-shot manner. Experimental results demonstrate the proposed approach +contributes to a more vivid talking avatar with higher authenticity and richer +expressiveness. + +
+
+ comment: Accepted by ICCV2023 Workshop +
+
+
+
+
+ + ♻ ☆ Deformable Mixer Transformer with Gating for Multi-Task Learning of + Dense Prediction AAAI 2023 + + +
+ CNNs and Transformers have their own advantages and both have been widely +used for dense prediction in multi-task learning (MTL). Most of the current +studies on MTL solely rely on CNN or Transformer. In this work, we present a +novel MTL model by combining both merits of deformable CNN and query-based +Transformer with shared gating for multi-task learning of dense prediction. +This combination may offer a simple and efficient solution owing to its +powerful and flexible task-specific learning and advantages of lower cost, less +complexity and smaller parameters than the traditional MTL methods. We +introduce deformable mixer Transformer with gating (DeMTG), a simple and +effective encoder-decoder architecture up-to-date that incorporates the +convolution and attention mechanism in a unified network for MTL. It is +exquisitely designed to use advantages of each block, and provide deformable +and comprehensive features for all tasks from local and global perspective. +First, the deformable mixer encoder contains two types of operators: the +channel-aware mixing operator leveraged to allow communication among different +channels, and the spatial-aware deformable operator with deformable convolution +applied to efficiently sample more informative spatial locations. Second, the +task-aware gating transformer decoder is used to perform the task-specific +predictions, in which task interaction block integrated with self-attention is +applied to capture task interaction features, and the task query block +integrated with gating attention is leveraged to select corresponding +task-specific features. Further, the experiment results demonstrate that the +proposed DeMTG uses fewer GFLOPs and significantly outperforms current +Transformer-based and CNN-based competitive models on a variety of metrics on +three dense prediction datasets. Our code and models are available at +https://github.com/yangyangxu0/DeMTG. + +
+
+ comment: submitted to IJCV; an extension to our previous AAAI 2023 paper + arXiv:2301.03461 +
+
+
+
+
+ + ♻ ☆ O2CTA: Introducing Annotations from OCT to CCTA in Coronary Plaque + Analysis MICCAI + + +
+ Targeted diagnosis and treatment plans for patients with coronary artery +disease vary according to atherosclerotic plaque composition. Coronary CT +angiography (CCTA) is widely used for artery imaging and determining the +stenosis degree. However, the limited spatial resolution and susceptibility to +artifacts prevent CCTA from capturing lumen morphological characteristics and plaque +composition. This can be addressed by invasive optical coherence tomography (OCT) +without much trouble for physicians, but it brings higher costs and potential +risks to patients. Therefore, it is clinically critical to introduce +annotations of plaque tissue and lumen characteristics from OCT to paired CCTA +scans, denoted as \textbf{the O2CTA problem} in this paper. We propose a method +to handle the O2CTA problem. CCTA scans are first reconstructed into +multi-planar reformatted (MPR) images, which agree with OCT images in terms of +semantic content. The artery segment in OCT, which is manually labelled, is +then spatially aligned with the entire artery in MPR images via the proposed +alignment strategy. Finally, a classification model involving a 3D CNN and a +Transformer is learned to extract local features and capture dependencies along +arteries. Experiments on 55 paired OCT and CCTA scans that we curated demonstrate that it +is feasible to classify the CCTA based on the OCT labels, with an accuracy of +86.2%, while the manual readings of OCT and CCTA vary significantly, with a +Kappa coefficient of 0.113. We will make our source codes, models, data, and +results publicly available to benefit the research community. + +
+
+ comment: Accepted for oral presentation in MICCAI-BTSD 2023 workshop +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Branching for Motion Prediction using Motion Increments + + +
+ Human motion prediction (HMP) has emerged as a popular research topic due to +its diverse applications, but it remains a challenging task due to the +stochastic and aperiodic nature of future poses. Traditional methods rely on +hand-crafted features and machine learning techniques, which often struggle to +model the complex dynamics of human motion. Recent deep learning-based methods +have achieved success by learning spatio-temporal representations of motion, +but these models often overlook the reliability of motion data. Additionally, +the temporal and spatial dependencies of skeleton nodes are distinct. The +temporal relationship captures motion information over time, while the spatial +relationship describes body structure and the relationships between different +nodes. In this paper, we propose a novel spatio-temporal branching network +using incremental information for HMP, which decouples the learning of +temporal-domain and spatial-domain features, extracts more motion information, +and achieves complementary cross-domain knowledge learning through knowledge +distillation. Our approach effectively reduces noise interference and provides +more expressive information for characterizing motion by separately extracting +temporal and spatial features. We evaluate our approach on standard HMP +benchmarks and outperform state-of-the-art methods in terms of prediction +accuracy. + +
+
+
+
+
+ + ♻ ☆ Dual Aggregation Transformer for Image Super-Resolution ICCV 2023 + + +
+ Transformer has recently gained considerable popularity in low-level vision +tasks, including image super-resolution (SR). These networks utilize +self-attention along different dimensions, spatial or channel, and achieve +impressive performance. This inspires us to combine the two dimensions in +Transformer for a more powerful representation capability. Based on the above +idea, we propose a novel Transformer model, Dual Aggregation Transformer (DAT), +for image SR. Our DAT aggregates features across spatial and channel +dimensions, in the inter-block and intra-block dual manner. Specifically, we +alternately apply spatial and channel self-attention in consecutive Transformer +blocks. The alternate strategy enables DAT to capture the global context and +realize inter-block feature aggregation. Furthermore, we propose the adaptive +interaction module (AIM) and the spatial-gate feed-forward network (SGFN) to +achieve intra-block feature aggregation. AIM complements two self-attention +mechanisms from corresponding dimensions. Meanwhile, SGFN introduces additional +non-linear spatial information in the feed-forward network. Extensive +experiments show that our DAT surpasses current methods. Code and models are +obtainable at https://github.com/zhengchen1999/DAT. + +
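+ For intuition, the channel self-attention mentioned above treats channels (rather than spatial positions) as the tokens that attend to each other; the block below is a generic sketch of that idea, not the actual DAT block (whose design, together with AIM and SGFN, is given in the paper and repository):
+
+     import torch
+     import torch.nn as nn
+
+     class ChannelSelfAttention(nn.Module):
+         def __init__(self, dim):
+             super().__init__()
+             self.qkv = nn.Linear(dim, dim * 3, bias=False)
+             self.proj = nn.Linear(dim, dim)
+
+         def forward(self, x):                        # x: (B, N, C) spatial tokens
+             q, k, v = self.qkv(x).chunk(3, dim=-1)
+             q, k, v = (t.transpose(1, 2) for t in (q, k, v))   # (B, C, N): channels become tokens
+             scale = q.shape[-1] ** 0.5               # scaling choices vary between papers
+             attn = torch.softmax(q @ k.transpose(-2, -1) / scale, dim=-1)   # (B, C, C)
+             out = (attn @ v).transpose(1, 2)         # back to (B, N, C)
+             return self.proj(out)
+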
+
+ comment: Accepted to ICCV 2023. Code is available at + https://github.com/zhengchen1999/DAT +
+
+
+
+
+ + ♻ ☆ Training Multimedia Event Extraction With Generated Images and Captions + + +
+ Contemporary news reporting increasingly features multimedia content, +motivating research on multimedia event extraction. However, the task lacks +annotated multimodal training data and artificially generated training data +suffer from distribution shift from real-world data. In this paper, we propose +Cross-modality Augmented Multimedia Event Learning (CAMEL), which successfully +utilizes artificially generated multimodal training data and achieves +state-of-the-art performance. We start with two labeled unimodal datasets in +text and image respectively, and generate the missing modality using +off-the-shelf image generators like Stable Diffusion and image captioners like +BLIP. After that, we train the network on the resultant multimodal datasets. In +order to learn robust features that are effective across domains, we devise an +iterative and gradual training strategy. Substantial experiments show that +CAMEL surpasses state-of-the-art (SOTA) baselines on the M2E2 benchmark. On +multimedia events in particular, we outperform the prior SOTA by 4.2% F1 on +event mention identification and by 9.8% F1 on argument identification, which +indicates that CAMEL learns synergistic representations from the two +modalities. Our work demonstrates a recipe to unleash the power of synthetic +training data in structured prediction. + +
+
+
+
+
+ + ♻ ☆ Generalised Co-Salient Object Detection + + +
+ We propose a new setting that relaxes an assumption in the conventional +Co-Salient Object Detection (CoSOD) setting by allowing the presence of "noisy +images" which do not show the shared co-salient object. We call this new +setting Generalised Co-Salient Object Detection (GCoSOD). We propose a novel +random sampling based Generalised CoSOD Training (GCT) strategy to distill the +awareness of inter-image absence of co-salient objects into CoSOD models. It +employs a Diverse Sampling Self-Supervised Learning (DS3L) that, in addition to +the provided supervised co-salient label, introduces additional self-supervised +labels for noisy images (being null, that no co-salient object is present). +Further, the random sampling process inherent in GCT enables the generation of +a high-quality uncertainty map highlighting potential false-positive +predictions at instance level. To evaluate the performance of CoSOD models +under the GCoSOD setting, we propose two new testing datasets, namely +CoCA-Common and CoCA-Zero, where a common salient object is partially present +in the former and completely absent in the latter. Extensive experiments +demonstrate that our proposed method significantly improves the performance of +CoSOD models in terms of the performance under the GCoSOD setting as well as +the model calibration degrees. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Coordinate Projection Network for Sparse-View Computed + Tomography + + +
+ In the present work, we propose a Self-supervised COordinate Projection +nEtwork (SCOPE) to reconstruct an artifact-free CT image from a single sparse-view (SV) +sinogram by solving the inverse tomography imaging problem. Compared with +recent related works that solve similar problems using implicit neural +representation (INR) networks, our essential contribution is an effective and +simple re-projection strategy that pushes the tomography image reconstruction +quality beyond that of supervised deep learning CT reconstruction works. The proposed +strategy is inspired by the simple relationship between linear algebra and +inverse problems. To solve the under-determined linear equation system, we +first introduce an INR to constrain the solution space via an image continuity prior +and obtain a rough solution. Second, we propose to generate a dense-view +sinogram that improves the rank of the linear equation system and produces a +more stable CT image solution space. Our experimental results demonstrate that +the re-projection strategy significantly improves the image reconstruction +quality (by at least +3 dB PSNR). Besides, we integrate the recent hash +encoding into our SCOPE model, which greatly accelerates the model training. +Finally, we evaluate SCOPE on parallel- and fan-beam X-ray SVCT reconstruction +tasks. Experimental results indicate that the proposed SCOPE model outperforms +two recent INR-based methods and two widely used supervised DL methods +quantitatively and qualitatively. + +
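+ The re-projection step can be pictured with a standard Radon transform: render the rough INR reconstruction as an image, then forward-project it onto a dense set of view angles to obtain the dense-view sinogram used for the second round of fitting. The scikit-image sketch below is only illustrative; the number of views, the geometry, and rough_recon are assumptions, and the paper's own forward projector may differ:
+
+     import numpy as np
+     from skimage.transform import radon
+
+     def densify_sinogram(rough_recon, num_views=720):
+         # rough_recon: 2D image rendered from the roughly fitted INR
+         dense_angles = np.linspace(0.0, 180.0, num_views, endpoint=False)
+         # forward-project (re-project) the rough reconstruction onto dense view angles
+         return radon(rough_recon, theta=dense_angles, circle=False)
+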
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ Towards Segment Anything Model (SAM) for Medical Image Segmentation: A + Survey + + +
+ Due to the flexibility of prompting, foundation models have become the +dominant force in the domains of natural language processing and image +generation. With the recent introduction of the Segment Anything Model (SAM), +the prompt-driven paradigm has entered the realm of image segmentation, +bringing with it a range of previously unexplored capabilities. However, it +remains unclear whether SAM is applicable to medical image segmentation due +to the significant differences between natural images and medical images. In +this work, we summarize recent efforts to extend the success of SAM to medical +image segmentation tasks, including both empirical benchmarking and +methodological adaptations, and discuss potential future directions for SAM in +medical image segmentation. Although directly applying SAM to medical image +segmentation does not yield satisfactory performance on multi-modal and +multi-target medical datasets, many insights are drawn to guide future research +to develop foundation models for medical image analysis. To facilitate future +research, we maintain an active repository that contains an up-to-date paper list +and a summary of open-source projects at https://github.com/YichiZhang98/SAM4MIS. + +
+
+
+
+
+ + ♻ ☆ Undercover Deepfakes: Detecting Fake Segments in Videos ICCV 2023 + + +
+ The recent renaissance in generative models, driven primarily by the advent +of diffusion models and iterative improvements in GAN methods, has enabled many +creative applications. However, each advancement is also accompanied by a rise +in the potential for misuse. In the arena of deepfake generation, this is a +key societal issue. In particular, the ability to modify segments of videos +using such generative techniques creates a new paradigm of deepfakes which are +mostly real videos altered slightly to distort the truth. This paradigm has been +under-explored by the current deepfake detection methods in the academic +literature. In this paper, we present a deepfake detection method that can +address this issue by performing deepfake prediction at the frame and video +levels. To facilitate testing our method, we prepared a new benchmark dataset +where videos have both real and fake frame sequences with very subtle +transitions. We provide a benchmark on the proposed dataset with our detection +method, which utilizes the Vision Transformer based on Scaling and Shifting to +learn spatial features, and a Timeseries Transformer to learn temporal features +of the videos to help facilitate the interpretation of possible deepfakes. +Extensive experiments on a variety of deepfake generation methods show that the +proposed method achieves excellent results on temporal segmentation as well as classical +video-level prediction. In particular, the paradigm we address will +form a powerful tool for the moderation of deepfakes, where human oversight can +be better targeted to the parts of videos suspected of being deepfakes. All +experiments can be reproduced at: https://t.ly/\_bOh9. + +
+
+ comment: ICCV 2023 Workshop and Challenge on DeepFake Analysis and Detection +
+
+
+
+
+ + ♻ ☆ NIPD: A Federated Learning Person Detection Benchmark Based on + Real-World Non-IID Data IJCAI 23 + + +
+ Federated learning (FL), a privacy-preserving form of distributed machine learning, +has been rapidly applied in wireless communication networks. FL enables +Internet of Things (IoT) clients to obtain well-trained models while preventing +privacy leakage. Person detection can be deployed on edge devices with limited +computing power if combined with FL to process the video data directly at the +edge. However, due to the different hardware and deployment scenarios of +different cameras, the data collected by the cameras are non-independent and +identically distributed (non-IID), and the global model derived from FL +aggregation is less effective. Meanwhile, existing research lacks a public data +set for real-world FL object detection, which is not conducive to studying the +non-IID problem on IoT cameras. Therefore, we open source a non-IID IoT person +detection (NIPD) data set, which is collected from five different cameras. To +our knowledge, this is the first truly device-based non-IID person detection +data set. Based on this data set, we explain how to establish an FL experimental +platform and provide a benchmark for non-IID person detection. NIPD is expected +to promote the application of FL and the security of smart cities. + +
+
+ comment: 8 pages, 5 figures, 3 tables, FL-IJCAI 23 conference +
+
+
+
+
+ + ♻ ☆ CDistNet: Perceiving Multi-Domain Character Distance for Robust Text + Recognition + + +
+ The Transformer-based encoder-decoder framework is becoming popular in scene +text recognition, largely because it naturally integrates recognition clues +from both visual and semantic domains. However, recent studies show that the +two kinds of clues are not always well registered and therefore, feature and +character might be misaligned in difficult text (e.g., with a rare shape). As a +result, constraints such as character position are introduced to alleviate this +problem. Despite certain success, visual and semantic are still separately +modeled and they are merely loosely associated. In this paper, we propose a +novel module called Multi-Domain Character Distance Perception (MDCDP) to +establish a visually and semantically related position embedding. MDCDP uses +the position embedding to query both visual and semantic features following the +cross-attention mechanism. The two kinds of clues are fused into the position +branch, generating a content-aware embedding that well perceives character +spacing and orientation variants, character semantic affinities, and clues +tying the two kinds of information. They are summarized as the multi-domain +character distance. We develop CDistNet that stacks multiple MDCDPs to guide a +gradually precise distance modeling. Thus, the feature-character alignment is +well built even various recognition difficulties are presented. We verify +CDistNet on ten challenging public datasets and two series of augmented +datasets created by ourselves. The experiments demonstrate that CDistNet +performs highly competitively. It not only ranks top-tier in standard +benchmarks, but also outperforms recent popular methods by obvious margins on +real and augmented datasets presenting severe text deformation, poor linguistic +support, and rare character layouts. Code is available at +https://github.com/simplify23/CDistNet. + +
+
+ comment: Paper accepted for publication at IJCV 2023 +
+
+
+
+
+ + ♻ ☆ Optimal Linear Subspace Search: Learning to Construct Fast and + High-Quality Schedulers for Diffusion Models + + +
+ In recent years, diffusion models have become the most popular and powerful +methods in the field of image synthesis, even rivaling human artists in +artistic creativity. However, the key issue currently limiting the application +of diffusion models is its extremely slow generation process. Although several +methods were proposed to speed up the generation process, there still exists a +trade-off between efficiency and quality. In this paper, we first provide a +detailed theoretical and empirical analysis of the generation process of the +diffusion models based on schedulers. We transform the designing problem of +schedulers into the determination of several parameters, and further transform +the accelerated generation process into an expansion process of the linear +subspace. Based on these analyses, we consequently propose a novel method +called Optimal Linear Subspace Search (OLSS), which accelerates the generation +process by searching for the optimal approximation process of the complete +generation process in the linear subspaces spanned by latent variables. OLSS is +able to generate high-quality images with a very small number of steps. To +demonstrate the effectiveness of our method, we conduct extensive comparative +experiments on open-source diffusion models. Experimental results show that +with a given number of steps, OLSS can significantly improve the quality of +generated images. Using an NVIDIA A100 GPU, we make it possible to generate a +high-quality image by Stable Diffusion within only one second without other +optimization techniques. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ NeTO:Neural Reconstruction of Transparent Objects with Self-Occlusion + Aware Refraction-Tracing + + +
+ We present a novel method, called NeTO, for capturing 3D geometry of solid +transparent objects from 2D images via volume rendering. Reconstructing +transparent objects is a very challenging task, which is ill-suited for +general-purpose reconstruction techniques due to the specular light transport +phenomena. Although existing refraction-tracing based methods, designed +specially for this task, achieve impressive results, they still suffer from +unstable optimization and loss of fine details, since the explicit surface +representation they adopted is difficult to be optimized, and the +self-occlusion problem is ignored for refraction-tracing. In this paper, we +propose to leverage implicit Signed Distance Function (SDF) as surface +representation, and optimize the SDF field via volume rendering with a +self-occlusion aware refractive ray tracing. The implicit representation +enables our method to be capable of reconstructing high-quality reconstruction +even with a limited set of images, and the self-occlusion aware strategy makes +it possible for our method to accurately reconstruct the self-occluded regions. +Experiments show that our method achieves faithful reconstruction results and +outperforms prior works by a large margin. Visit our project page at +\url{https://www.xxlong.site/NeTO/} + +
+
+ comment: Experiments involving sparse views have some flaws, mainly including + Figure 1 in the introduction, Figure 7 and Table 1 in the experiments. In + order to maintain correctness and fairness, we would like to retract the + paper first +
+
+
+
+
+ + ♻ ☆ Learning to Relight Portrait Images via a Virtual Light Stage and + Synthetic-to-Real Adaptation SIGGRAPH + + +
+ Given a portrait image of a person and an environment map of the target +lighting, portrait relighting aims to re-illuminate the person in the image as +if the person appeared in an environment with the target lighting. To achieve +high-quality results, recent methods rely on deep learning. An effective +approach is to supervise the training of deep neural networks with a +high-fidelity dataset of desired input-output pairs, captured with a light +stage. However, acquiring such data requires an expensive special capture rig +and time-consuming efforts, limiting access to only a few resourceful +laboratories. To address the limitation, we propose a new approach that can +perform on par with the state-of-the-art (SOTA) relighting methods without +requiring a light stage. Our approach is based on the realization that a +successful relighting of a portrait image depends on two conditions. First, the +method needs to mimic the behaviors of physically-based relighting. Second, the +output has to be photorealistic. To meet the first condition, we propose to +train the relighting network with training data generated by a virtual light +stage that performs physically-based rendering on various 3D synthetic humans +under different environment maps. To meet the second condition, we develop a +novel synthetic-to-real approach to bring photorealism to the relighting +network output. In addition to achieving SOTA results, our approach offers +several advantages over the prior methods, including controllable glares on +glasses and more temporally-consistent results for relighting videos. + +
+
+ comment: To appear in ACM Transactions on Graphics (SIGGRAPH Asia 2022). 21 + pages, 25 figures, 7 tables. Project page: + https://research.nvidia.com/labs/dir/lumos/ +
+
+
+
+
+ + ♻ ☆ Improving Statistical Fidelity for Neural Image Compression with + Implicit Local Likelihood Models + + +
+ Lossy image compression aims to represent images in as few bits as possible +while maintaining fidelity to the original. Theoretical results indicate that +optimizing distortion metrics such as PSNR or MS-SSIM necessarily leads to a +discrepancy in the statistics of original images from those of reconstructions, +in particular at low bitrates, often manifested by the blurring of the +compressed images. Previous work has leveraged adversarial discriminators to +improve statistical fidelity. Yet these binary discriminators adopted from +generative modeling tasks may not be ideal for image compression. In this +paper, we introduce a non-binary discriminator that is conditioned on quantized +local image representations obtained via VQ-VAE autoencoders. Our evaluations +on the CLIC2020, DIV2K and Kodak datasets show that our discriminator is more +effective for jointly optimizing distortion (e.g., PSNR) and statistical +fidelity (e.g., FID) than the PatchGAN of the state-of-the-art HiFiC model. On +CLIC2020, we obtain the same FID as HiFiC with 30-40\% fewer bits. + +
+
+ comment: Upload camera-ready to arXiv. Official version available at + https://proceedings.mlr.press/v202/muckley23a.html +
+
+
+
+
+ + ♻ ☆ Towards Generalist Foundation Model for Radiology + + +
+ In this study, we aim to initiate the development of a Radiology Foundation
+Model, termed RadFM. We thoroughly consider the construction of foundation
+models from the perspectives of data, model design, and evaluation. Our
+contributions can be summarized as follows: (i) we construct a large-scale
+Medical Multi-modal Dataset, MedMD, consisting of 16M 2D and 3D medical scans.
+To the best of our knowledge, this is the first multi-modal dataset containing
+3D medical scans. (ii) We propose an architecture that enables visually
+conditioned generative pre-training, allowing for the integration of text input
+interleaved with 2D or 3D medical scans to generate responses for diverse
+radiologic tasks. The model is initially pre-trained on MedMD and subsequently
+fine-tuned on the domain-specific RadMD, a cleaned radiologic version of MedMD
+containing 3M radiologic visual-language pairs. (iii) We propose a new
+evaluation benchmark that comprises five tasks, aiming to comprehensively
+assess the capability of foundation models in handling practical clinical
+problems. Our experimental results confirm that RadFM significantly outperforms
+existing multi-modal foundation models. The codes, data, and model checkpoint
+will all be made publicly available to promote further research and development
+in the field.
+
+
+
+
+
+
+ + ♻ ☆ Learning Non-Local Spatial-Angular Correlation for Light Field Image + Super-Resolution ICCV 2023 + + +
+ Exploiting spatial-angular correlation is crucial to light field (LF) image
+super-resolution (SR), but is highly challenging due to its non-local property
+caused by the disparities among LF images. Although many deep neural networks
+(DNNs) have been developed for LF image SR and achieved continuously improved
+performance, existing methods cannot fully leverage the long-range
+spatial-angular correlation and thus suffer a significant performance drop when
+handling scenes with large disparity variations. In this paper, we propose a
+simple yet effective method to learn the non-local spatial-angular correlation
+for LF image SR. In our method, we adopt the epipolar plane image (EPI)
+representation to project the 4D spatial-angular correlation onto multiple 2D
+EPI planes, and then develop a Transformer network with repetitive
+self-attention operations to learn the spatial-angular correlation by modeling
+the dependencies between each pair of EPI pixels. Our method can fully
+incorporate the information from all angular views while achieving a global
+receptive field along the epipolar line. We conduct extensive experiments with
+insightful visualizations to validate the effectiveness of our method.
+Comparative results on five public datasets show that our method not only
+achieves state-of-the-art SR performance, but is also robust to disparity
+variations. Code is publicly available at
+https://github.com/ZhengyuLiang24/EPIT.
+
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Person Re-Identification without Identification via Event Anonymization ICCV + + +
+ Wide-scale use of visual surveillance in public spaces puts individual +privacy at stake while increasing resource consumption (energy, bandwidth, and +computation). Neuromorphic vision sensors (event-cameras) have been recently +considered a valid solution to the privacy issue because they do not capture +detailed RGB visual information of the subjects in the scene. However, recent +deep learning architectures have been able to reconstruct images from event +cameras with high fidelity, reintroducing a potential threat to privacy for +event-based vision applications. In this paper, we aim to anonymize +event-streams to protect the identity of human subjects against such image +reconstruction attacks. To achieve this, we propose an end-to-end network +architecture jointly optimized for the twofold objective of preserving privacy +and performing a downstream task such as person ReId. Our network learns to +scramble events, enforcing the degradation of images recovered from the privacy +attacker. In this work, we also bring to the community the first ever +event-based person ReId dataset gathered to evaluate the performance of our +approach. We validate our approach with extensive experiments and report +results on the synthetic event data simulated from the publicly available +SoftBio dataset and our proposed Event-ReId dataset. + +
+
+ comment: Accepted at International Conference on Computer Vision (ICCV), 2023 +
+
+
+
+
+ + ♻ ☆ A Law of Data Separation in Deep Learning + + +
+ While deep learning has enabled significant advances in many areas of science,
+its black-box nature hinders architecture design for future artificial
+intelligence applications and interpretation for high-stakes decision making.
+We address this issue by studying the fundamental question of how deep neural
+networks process data in the intermediate layers. Our finding is a simple and
+quantitative law that governs how deep neural networks separate data according
+to class membership throughout all layers for classification. This law shows
+that each layer improves data separation at a constant geometric rate, and its
+emergence is observed in a collection of network architectures and datasets
+during training. This law offers practical guidelines for designing
+architectures, improving model robustness and out-of-sample performance, as
+well as interpreting the predictions.
+
+
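One way to write the "constant geometric rate" claim as a formula; the symbols below are assumed notation for this sketch, not necessarily the paper's:

```latex
% D_l: a measure of how mixed the classes remain after layer l (smaller = better separated)
\[
    D_l \approx \rho^{\,l} D_0 , \qquad 0 < \rho < 1 ,
\]
% equivalently, \log D_l decreases roughly linearly with depth l, so every layer
% contributes about the same multiplicative improvement to class separation.
```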
+
+ comment: Accepted at PNAS +
+
+
+
+
+ + ♻ ☆ A Survey on Training Challenges in Generative Adversarial Networks for + Biomedical Image Analysis + + +
+ In biomedical image analysis, the applicability of deep learning methods is
+directly impacted by the quantity of image data available. This is due to deep
+learning models requiring large image datasets to provide high-level
+performance. Generative Adversarial Networks (GANs) have been widely utilized
+to address data limitations through the generation of synthetic biomedical
+images. GANs consist of two models: the generator, which learns how to produce
+synthetic images based on the feedback it receives, and the discriminator,
+which classifies an image as synthetic or real and provides feedback to the
+generator. Throughout the training process, a GAN can experience several
+technical challenges that impede the generation of suitable synthetic imagery.
+First, the mode collapse problem, whereby the generator produces an identical
+image or a uniform image from distinct input features. Second, the
+non-convergence problem, whereby the gradient descent optimizer fails to reach
+a Nash equilibrium. Third, the vanishing gradient problem, whereby unstable
+training behavior occurs because the discriminator achieves optimal
+classification performance, resulting in no meaningful feedback being provided
+to the generator. These problems result in the production of synthetic imagery
+that is blurry, unrealistic, and less diverse. To date, there has been no
+survey article outlining the impact of these technical challenges in the
+context of the biomedical imagery domain. This work presents a review and
+taxonomy based on solutions to the training problems of GANs in the biomedical
+imaging domain. This survey highlights important challenges and outlines future
+research directions for the training of GANs in the domain of biomedical
+imagery.
+
+
+
+ comment: Submitted to the AI Review Journal +
+
+
+
+
+
+
+
+ + Information Retrieval 14 + +
+
+
+ + ☆ A Large Language Model Enhanced Conversational Recommender System + + +
+ Conversational recommender systems (CRSs) aim to recommend high-quality items
+to users through a dialogue interface. A CRS usually involves multiple
+sub-tasks, such as user preference elicitation, recommendation, explanation,
+and item information search. Developing effective CRSs poses several
+challenges: 1) how to properly manage sub-tasks; 2) how to effectively solve
+different sub-tasks; and 3) how to correctly generate responses that interact
+with users. Recently, Large Language Models (LLMs) have exhibited an
+unprecedented ability to reason and generate, presenting a new opportunity to
+develop more powerful CRSs. In this work, we propose a new LLM-based CRS,
+referred to as LLMCRS, to address the above challenges. For sub-task
+management, we leverage the reasoning ability of the LLM to effectively manage
+sub-tasks. For sub-task solving, we pair the LLM with expert models for the
+different sub-tasks to achieve enhanced performance. For response generation,
+we utilize the generation ability of the LLM as a language interface to better
+interact with users. Specifically, LLMCRS divides the workflow into four
+stages: sub-task detection, model matching, sub-task execution, and response
+generation. LLMCRS also designs schema-based instruction, demonstration-based
+instruction, dynamic sub-task and model matching, and summary-based generation
+to instruct the LLM to generate the desired results in the workflow. Finally,
+to adapt the LLM to conversational recommendation, we also propose to fine-tune
+the LLM with reinforcement learning from CRS performance feedback, referred to
+as RLPF. Experimental results on benchmark datasets show that LLMCRS with RLPF
+outperforms the existing methods.
+
+
+
+
+
+
+ + ☆ Identification of the Relevance of Comments in Codes Using Bag of Words + and Transformer Based Models + + +
+ The Forum for Information Retrieval (FIRE) started a shared task this year for
+the classification of comments on different code segments. This is a binary
+text classification task whose objective is to identify whether comments given
+for certain code segments are relevant or not. The BioNLP-IISERB group at the
+Indian Institute of Science Education and Research Bhopal (IISERB) participated
+in this task and submitted five runs for five different models. The paper
+presents an overview of the models and other significant findings on the
+training corpus. The methods involve different feature engineering schemes and
+text classification techniques. The performance of the classical bag-of-words
+model and transformer-based models was explored to identify significant
+features from the given training corpus. We explored different classifiers,
+viz. random forest, support vector machine, and logistic regression, using the
+bag-of-words model. Furthermore, pre-trained transformer-based models like
+BERT, RoBERT and ALBERT were also used by fine-tuning them on the given
+training corpus. The performance of these models on the training corpus is
+reported, and the best five models were applied to the given test corpus. The
+empirical results show that the bag-of-words model outperforms the
+transformer-based models; however, the performance of our runs is not
+particularly strong on either the training or the test corpus. This paper also
+addresses the limitations of the models and the scope for further improvement.
+
+
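A minimal sketch of the bag-of-words baseline this abstract describes, using scikit-learn; the toy comments and labels are placeholders, not the FIRE shared-task data:

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

comments = ["returns the cached value", "TODO remove this later",
            "increments the retry counter", "asdf",
            "computes the md5 of the input file", "lol fix me"]
labels = [1, 0, 1, 0, 1, 0]  # 1 = relevant to the code segment, 0 = not relevant

clf = make_pipeline(CountVectorizer(ngram_range=(1, 2)),   # bag of words with bigrams
                    LogisticRegression(max_iter=1000))
print(cross_val_score(clf, comments, labels, cv=3).mean())  # rough relevance accuracy
```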
+
+
+
+
+ + ☆ Toward a Better Understanding of Loss Functions for Collaborative + Filtering CIKM 2023 + + +
+ Collaborative filtering (CF) is a pivotal technique in modern recommender
+systems. The learning process of CF models typically consists of three
+components: interaction encoder, loss function, and negative sampling. Although
+many existing studies have proposed various CF models to design sophisticated
+interaction encoders, recent work shows that simply reformulating the loss
+functions can achieve significant performance gains. This paper delves into
+analyzing the relationship among existing loss functions. Our mathematical
+analysis reveals that the previous loss functions can be interpreted as
+alignment and uniformity functions: (i) the alignment matches user and item
+representations, and (ii) the uniformity disperses user and item distributions.
+Inspired by this analysis, we propose a novel loss function, Margin-aware
+Alignment and Weighted Uniformity (MAWU), which improves the design of
+alignment and uniformity by considering the unique patterns of each dataset.
+The key novelty of MAWU is two-fold: (i) margin-aware alignment (MA) mitigates
+user/item-specific popularity biases, and (ii) weighted uniformity (WU) adjusts
+the significance between user and item uniformities to reflect the inherent
+characteristics of datasets. Extensive experimental results show that MF and
+LightGCN equipped with MAWU are comparable or superior to state-of-the-art CF
+models with various loss functions on three public datasets.
+
+
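A hedged PyTorch sketch of alignment- and uniformity-style losses for user/item embeddings, with a margin on the alignment term and a weight between user- and item-side uniformity in the spirit of the abstract; the exact forms are assumptions, not the paper's definitions:

```python
import torch
import torch.nn.functional as F

def alignment(user, item, margin=0.0):
    user, item = F.normalize(user, dim=-1), F.normalize(item, dim=-1)
    dist2 = (user - item).pow(2).sum(dim=-1)
    return torch.clamp(dist2 - margin, min=0.0).mean()           # margin-aware alignment

def uniformity(x, t=2.0):
    x = F.normalize(x, dim=-1)
    return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log()  # spreads embeddings on the sphere

def mawu_like_loss(user_emb, item_emb, margin=0.1, gamma=0.5):
    # gamma weights user-side vs item-side uniformity (weighted uniformity)
    return (alignment(user_emb, item_emb, margin)
            + gamma * uniformity(user_emb) + (1.0 - gamma) * uniformity(item_emb))

u, i = torch.randn(64, 32, requires_grad=True), torch.randn(64, 32, requires_grad=True)
print(mawu_like_loss(u, i))
```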
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ☆ Deep Context Interest Network for Click-Through Rate Prediction CIKM 2023 + + +
+ Click-Through Rate (CTR) prediction, estimating the probability of a user
+clicking on an item, is essential in industrial applications, such as online
+advertising. Many works focus on user behavior modeling to improve CTR
+prediction performance. However, most of those methods only model users'
+positive interests from their clicked items while ignoring the context
+information, i.e., the items displayed around the clicks, resulting in inferior
+performance. In this paper, we highlight the importance of context information
+for user behavior modeling and propose a novel model named Deep Context
+Interest Network (DCIN), which integrally models the click and its display
+context to learn users' context-aware interests. DCIN consists of three key
+modules: 1) Position-aware Context Aggregation Module (PCAM), which performs
+aggregation of display items with an attention mechanism; 2) Feedback-Context
+Fusion Module (FCFM), which fuses the representations of clicks and display
+contexts through non-linear feature interaction; 3) Interest Matching Module
+(IMM), which activates interests related to the target item. Moreover, we
+provide our hands-on solution to implement our DCIN model on large-scale
+industrial systems. The significant improvements in both offline and online
+evaluations demonstrate the superiority of our proposed DCIN method. Notably,
+DCIN has been deployed on our online advertising system serving the main
+traffic, bringing a 1.5% CTR and 1.5% RPM lift.
+
+
+
+ comment: accepted by CIKM 2023 +
+
+
+
+
+ + ☆ Designing a User Contextual Profile Ontology: A Focus on the Vehicle + Sales Domain + + +
+ In the digital age, it is crucial to understand and tailor experiences for +users interacting with systems and applications. This requires the creation of +user contextual profiles that combine user profiles with contextual +information. However, there is a lack of research on the integration of +contextual information with different user profiles. This study aims to address +this gap by designing a user contextual profile ontology that considers both +user profiles and contextual information on each profile. Specifically, we +present a design and development of the user contextual profile ontology with a +focus on the vehicle sales domain. Our designed ontology serves as a structural +foundation for standardizing the representation of user profiles and contextual +information, enhancing the system's ability to capture user preferences and +contextual information of the user accurately. Moreover, we illustrate a case +study using the User Contextual Profile Ontology in generating personalized +recommendations for vehicle sales domain. + +
+
+
+
+
+ + ☆ Augmented Negative Sampling for Collaborative Filtering + + +
+ Negative sampling is essential for implicit-feedback-based collaborative +filtering, which is used to constitute negative signals from massive unlabeled +data to guide supervised learning. The state-of-the-art idea is to utilize hard +negative samples that carry more useful information to form a better decision +boundary. To balance efficiency and effectiveness, the vast majority of +existing methods follow the two-pass approach, in which the first pass samples +a fixed number of unobserved items by a simple static distribution and then the +second pass selects the final negative items using a more sophisticated +negative sampling strategy. However, selecting negative samples from the +original items is inherently restricted, and thus may not be able to contrast +positive samples well. In this paper, we confirm this observation via +experiments and introduce two limitations of existing solutions: ambiguous trap +and information discrimination. Our response to such limitations is to +introduce augmented negative samples. This direction renders a substantial +technical challenge because constructing unconstrained negative samples may +introduce excessive noise that distorts the decision boundary. To this end, we +introduce a novel generic augmented negative sampling paradigm and provide a +concrete instantiation. First, we disentangle hard and easy factors of negative +items. Next, we generate new candidate negative samples by augmenting only the +easy factors in a regulated manner: the direction and magnitude of the +augmentation are carefully calibrated. Finally, we design an advanced negative +sampling strategy to identify the final augmented negative samples, which +considers not only the score function used in existing methods but also a new +metric called augmentation gain. Extensive experiments on real-world datasets +demonstrate that our method significantly outperforms state-of-the-art +baselines. + +
+
+ comment: 11 pages, 16 figures, +
+
+
+
+
+ + ☆ LittleMu: Deploying an Online Virtual Teaching Assistant via + Heterogeneous Sources Integration and Chain of Teach Prompts CIKM 23 + + +
+ Teaching assistants have played essential roles in the long history of
+education. However, few MOOC platforms provide human or virtual teaching
+assistants to support learning for massive online students, due to the
+complexity of real-world online education scenarios and the lack of training
+data. In this paper, we present LittleMu, a virtual MOOC teaching assistant
+that requires minimal labeled training data and provides question answering and
+chit-chat services. Consisting of two interactive modules, heterogeneous
+retrieval and language model prompting, LittleMu first integrates structured,
+semi-structured, and unstructured knowledge sources to support accurate answers
+for a wide range of questions. Then, we design carefully crafted demonstrations
+named "Chain of Teach" prompts to exploit the large-scale pre-trained model to
+handle complex questions not covered by the collected knowledge. Beyond
+question answering, we develop other educational services such as
+knowledge-grounded chit-chat. We test the system's performance via both offline
+evaluation and online deployment. Since May 2020, our LittleMu system has
+served over 80,000 users with over 300,000 queries from over 500 courses on the
+XuetangX MOOC platform, continuously contributing to more convenient and fair
+education. Our code, services, and dataset will be available at
+https://github.com/THU-KEG/VTA.
+
+
+
+ comment: 7 pages, 3 figures, Accepted by CIKM 23 +
+
+
+
+
+ + ☆ LTP-MMF: Towards Long-term Provider Max-min Fairness Under + Recommendation Feedback Loops + + +
+ Multi-stakeholder recommender systems involve various roles, such as users and
+providers. Previous work pointed out that max-min fairness (MMF) is a better
+metric to support weak providers. However, the features and parameters of these
+roles vary over time, so ensuring long-term provider MMF becomes a significant
+challenge. We observe that recommendation feedback loops (RFL) strongly
+influence provider MMF in the long term. RFL means that the recommender system
+can only receive feedback on exposed items from users and updates the
+recommender model incrementally based on this feedback. When utilizing this
+feedback, the recommender model regards unexposed items as negative. In this
+way, tail providers do not get the opportunity to be exposed, and their items
+are always treated as negative samples. Such phenomena become increasingly
+serious under RFL. To alleviate the problem, this paper proposes an online
+ranking model named Long-Term Provider Max-min Fairness (LTP-MMF). Theoretical
+analysis shows that the long-term regret of LTP-MMF enjoys a sub-linear bound.
+Experimental results on three public recommendation benchmarks demonstrate that
+LTP-MMF can outperform the baselines in the long term.
+
+
+
+ comment: arXiv admin note: text overlap with arXiv:2303.06660 +
+
+
+
+
+ + ☆ Topic-Level Bayesian Surprise and Serendipity for Recommender Systems + + +
+ A recommender system that optimizes its recommendations solely to fit a +user's history of ratings for consumed items can create a filter bubble, +wherein the user does not get to experience items from novel, unseen +categories. One approach to mitigate this undesired behavior is to recommend +items with high potential for serendipity, namely surprising items that are +likely to be highly rated. In this paper, we propose a content-based +formulation of serendipity that is rooted in Bayesian surprise and use it to +measure the serendipity of items after they are consumed and rated by the user. +When coupled with a collaborative-filtering component that identifies similar +users, this enables recommending items with high potential for serendipity. To +facilitate the evaluation of topic-level models for surprise and serendipity, +we introduce a dataset of book reading histories extracted from Goodreads, +containing over 26 thousand users and close to 1.3 million books, where we +manually annotate 449 books read by 4 users in terms of their time-dependent, +topic-level surprise. Experimental evaluations show that models that use +Bayesian surprise correlate much better with the manual annotations of +topic-level surprise than distance-based heuristics, and also obtain better +serendipitous item recommendation performance. + +
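A minimal illustration of topic-level Bayesian surprise as the KL divergence between a reader's topic beliefs after and before a new book; the Dirichlet-style pseudo-count update and the toy numbers are assumptions for this sketch, not the paper's model:

```python
import numpy as np
from scipy.stats import entropy   # entropy(p, q) computes KL(p || q)

prior_counts = np.array([8.0, 3.0, 1.0])      # pseudo-counts over 3 topics from past reads
book_topics = np.array([0.05, 0.15, 0.80])    # topic mixture of the newly read book

posterior_counts = prior_counts + book_topics
prior = prior_counts / prior_counts.sum()
posterior = posterior_counts / posterior_counts.sum()

surprise = entropy(posterior, prior)          # large when the book shifts beliefs toward rare topics
print(surprise)
```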
+
+
+
+
+
+ ♻ ☆ Lib-SibGMU -- A University Library Circulation Dataset for Recommender
+ Systems Development
+
+
+
+ We open-source Lib-SibGMU - a university library circulation dataset - under
+the CC BY 4.0 license for the wider research community, and benchmark major
+algorithms for recommender systems on this dataset. For a recommender
+architecture consisting of a vectorizer that turns the history of borrowed
+books into a vector and a separately trained neighborhood-based recommender, we
+show that using the fastText model as the vectorizer delivers competitive
+results.
+
+
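A hedged sketch of the described architecture, a fastText vectorizer over borrowing histories feeding a neighborhood-based recommender; the toy histories and all parameters are illustrative, not the Lib-SibGMU benchmark setup:

```python
import numpy as np
from gensim.models import FastText
from sklearn.neighbors import NearestNeighbors

histories = [["calculus_I", "linear_algebra"], ["organic_chem", "biochem"],
             ["calculus_I", "real_analysis"]]                 # one borrowing history per reader
ft = FastText(sentences=histories, vector_size=16, min_count=1, epochs=20)

def history_vector(books):                                    # average the book vectors
    return np.mean([ft.wv[b] for b in books], axis=0)

X = np.stack([history_vector(h) for h in histories])
knn = NearestNeighbors(n_neighbors=2).fit(X)
_, idx = knn.kneighbors(history_vector(["linear_algebra"]).reshape(1, -1))
print([histories[i] for i in idx[0]])                         # similar readers -> candidate books
```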
+
+ comment: Dataset copyright discussion +
+
+
+
+
+ + ♻ ☆ Framework to Automatically Determine the Quality of Open Data Catalogs + + +
+ Data catalogs play a crucial role in modern data-driven organizations by +facilitating the discovery, understanding, and utilization of diverse data +assets. However, ensuring their quality and reliability is complex, especially +in open and large-scale data environments. This paper proposes a framework to +automatically determine the quality of open data catalogs, addressing the need +for efficient and reliable quality assessment mechanisms. Our framework can +analyze various core quality dimensions, such as accuracy, completeness, +consistency, scalability, and timeliness, offer several alternatives for the +assessment of compatibility and similarity across such catalogs as well as the +implementation of a set of non-core quality dimensions such as provenance, +readability, and licensing. The goal is to empower data-driven organizations to +make informed decisions based on trustworthy and well-curated data assets. The +source code that illustrates our approach can be downloaded from +https://www.github.com/jorge-martinez-gil/dataq/. + +
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ AdaMCT: Adaptive Mixture of CNN-Transformer for Sequential + Recommendation CIKM 2023 + + +
+ Sequential recommendation (SR) aims to model users' dynamic preferences from a
+series of interactions. A pivotal challenge in user modeling for SR lies in the
+inherent variability of user preferences. An effective SR model is expected to
+capture both the long-term and short-term preferences exhibited by users,
+wherein the former can offer a comprehensive understanding of stable interests
+that impact the latter. To more effectively capture such information, we
+incorporate locality inductive bias into the Transformer by amalgamating its
+global attention mechanism with a local convolutional filter, and adaptively
+ascertain the mixing importance on a personalized basis through layer-aware
+adaptive mixture units, termed AdaMCT. Moreover, as users may repeatedly browse
+potential purchases, multiple relevant items should be considered concurrently
+when modeling long- and short-term preferences. Given that softmax-based
+attention may promote unimodal activation, we introduce Squeeze-Excitation
+Attention (with sigmoid activation) into SR models to capture multiple
+pertinent items (keys) simultaneously. Extensive experiments on three widely
+employed benchmarks substantiate the effectiveness and efficiency of our
+proposed approach. Source code is available at
+https://github.com/juyongjiang/AdaMCT.
+
+
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ♻ ☆ A Survey on Popularity Bias in Recommender Systems + + +
+ Recommender systems help people find relevant content in a personalized way. +One main promise of such systems is that they are able to increase the +visibility of items in the long tail, i.e., the lesser-known items in a +catalogue. Existing research, however, suggests that in many situations today's +recommendation algorithms instead exhibit a popularity bias, meaning that they +often focus on rather popular items in their recommendations. Such a bias may +not only lead to limited value of the recommendations for consumers and +providers in the short run, but it may also cause undesired reinforcement +effects over time. In this paper, we discuss the potential reasons for +popularity bias and we review existing approaches to detect, quantify and +mitigate popularity bias in recommender systems. Our survey therefore includes +both an overview of the computational metrics used in the literature as well as +a review of the main technical approaches to reduce the bias. We furthermore +critically discuss today's literature, where we observe that the research is +almost entirely based on computational experiments and on certain assumptions +regarding the practical effects of including long-tail items in the +recommendations. + +
+
+
+
+
+ + ♻ ☆ Kuaipedia: a Large-scale Multi-modal Short-video Encyclopedia + + +
+ Online encyclopedias, such as Wikipedia, have been well-developed and
+researched in the last two decades. One can find any attributes or other
+information of a wiki item on a wiki page edited by a community of volunteers.
+However, traditional text, images and tables can hardly express some aspects of
+a wiki item. For example, when we talk about ``Shiba Inu'', one may care more
+about ``How to feed it'' or ``How to train it not to protect its food''.
+Currently, short-video platforms have become a hallmark in the online world.
+Whether you're on TikTok, Instagram, Kuaishou, or YouTube Shorts, short-video
+apps have changed how we consume and create content today. Beyond short videos
+produced for entertainment, we can find more and more authors sharing
+insightful knowledge widely across all walks of life. These short videos, which
+we call knowledge videos, can easily express any aspects (e.g. hair or
+how-to-feed) consumers want to know about an item (e.g. Shiba Inu), and they
+can be systematically analyzed and organized like an online encyclopedia. In
+this paper, we propose Kuaipedia, a large-scale multi-modal encyclopedia
+consisting of items, aspects, and short videos linked to them, which was
+extracted from billions of videos of Kuaishou (Kwai), a well-known short-video
+platform in China. We first collected items from multiple sources and mined
+user-centered aspects from millions of users' queries to build an item-aspect
+tree. Then we propose a new task called ``multi-modal item-aspect linking'' as
+an expansion of ``entity linking'' to link short videos to item-aspect pairs
+and build the whole short-video encyclopedia. Intrinsic evaluations show that
+our encyclopedia is of large scale and highly accurate. We also conduct
+extensive extrinsic experiments to show how Kuaipedia can help fundamental
+applications such as entity typing and entity linking.
+
+
+
+
+
+
+
+
+
+ + Machine Learning 103 + +
+
+
+ + ☆ Foundation Model is Efficient Multimodal Multitask Model Selector + + +
+ This paper investigates an under-explored but important problem: given a
+collection of pre-trained neural networks, predicting their performance on each
+multi-modal task without fine-tuning them, such as image recognition,
+referring, captioning, visual question answering, and text question answering.
+A brute-force approach is to fine-tune all models on all target datasets, which
+brings high computational costs. Although recent advanced approaches employ
+lightweight metrics to measure models' transferability, they often depend
+heavily on the prior knowledge of a single task, making them inapplicable in a
+multi-modal multi-task scenario. To tackle this issue, we propose an efficient
+multi-task model selector (EMMS), which employs large-scale foundation models
+to transform diverse label formats such as categories, texts, and bounding
+boxes of different downstream tasks into a unified noisy label embedding. EMMS
+can estimate a model's transferability through a simple weighted linear
+regression, which can be efficiently solved by an alternating minimization
+algorithm with a convergence guarantee. Extensive experiments on 5 downstream
+tasks with 24 datasets show that EMMS is fast, effective, and generic enough to
+assess the transferability of pre-trained models, making it the first model
+selection method in the multi-task scenario. For instance, compared with the
+state-of-the-art method LogME enhanced by our label embeddings, EMMS achieves
+9.0\%, 26.3\%, 20.1\%, 54.8\%, 12.2\% performance gain on image recognition,
+referring, captioning, visual question answering, and text question answering,
+while bringing 5.13x, 6.29x, 3.59x, 6.19x, and 5.66x speedup in wall-clock
+time, respectively. The code is available at
+https://github.com/OpenGVLab/Multitask-Model-Selector.
+
+
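A toy sketch of scoring one candidate model by how well a linear map from its frozen features explains (noisy) label embeddings, in the spirit of the abstract; the shapes, the unweighted least-squares fit, and the scoring convention are assumptions for illustration, not the EMMS algorithm:

```python
import numpy as np

rng = np.random.default_rng(0)
features = rng.standard_normal((200, 64))   # frozen features of one pre-trained model on the target data
label_emb = rng.standard_normal((200, 8))   # unified (noisy) label embeddings from a foundation model

w, *_ = np.linalg.lstsq(features, label_emb, rcond=None)   # linear regression: features -> label embedding
residual = np.linalg.norm(features @ w - label_emb) ** 2
score = -residual / len(features)           # higher score = the model transfers better (assumed convention)
print(score)
```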
+
+
+
+
+ + ☆ FunnyBirds: A Synthetic Vision Dataset for a Part-Based Analysis of + Explainable AI Methods ICCV 2023 + + +
+ The field of explainable artificial intelligence (XAI) aims to uncover the +inner workings of complex deep neural models. While being crucial for +safety-critical domains, XAI inherently lacks ground-truth explanations, making +its automatic evaluation an unsolved problem. We address this challenge by +proposing a novel synthetic vision dataset, named FunnyBirds, and accompanying +automatic evaluation protocols. Our dataset allows performing semantically +meaningful image interventions, e.g., removing individual object parts, which +has three important implications. First, it enables analyzing explanations on a +part level, which is closer to human comprehension than existing methods that +evaluate on a pixel level. Second, by comparing the model output for inputs +with removed parts, we can estimate ground-truth part importances that should +be reflected in the explanations. Third, by mapping individual explanations +into a common space of part importances, we can analyze a variety of different +explanation types in a single common framework. Using our tools, we report +results for 24 different combinations of neural models and XAI methods, +demonstrating the strengths and weaknesses of the assessed methods in a fully +automatic and systematic manner. + +
+
+ comment: Accepted at ICCV 2023. Code: https://github.com/visinf/funnybirds +
+
+
+
+
+ + ☆ Private Distribution Learning with Public Data: The View from Sample + Compression + + +
+ We study the problem of private distribution learning with access to public +data. In this setup, which we refer to as public-private learning, the learner +is given public and private samples drawn from an unknown distribution $p$ +belonging to a class $\mathcal Q$, with the goal of outputting an estimate of +$p$ while adhering to privacy constraints (here, pure differential privacy) +only with respect to the private samples. + We show that the public-private learnability of a class $\mathcal Q$ is +connected to the existence of a sample compression scheme for $\mathcal Q$, as +well as to an intermediate notion we refer to as list learning. Leveraging this +connection: (1) approximately recovers previous results on Gaussians over +$\mathbb R^d$; and (2) leads to new ones, including sample complexity upper +bounds for arbitrary $k$-mixtures of Gaussians over $\mathbb R^d$, results for +agnostic and distribution-shift resistant learners, as well as closure +properties for public-private learnability under taking mixtures and products +of distributions. Finally, via the connection to list learning, we show that +for Gaussians in $\mathbb R^d$, at least $d$ public samples are necessary for +private learnability, which is close to the known upper bound of $d+1$ public +samples. + +
+
+ comment: 30 pages +
+
+
+
+
+ + ☆ MaxFloodCast: Ensemble Machine Learning Model for Predicting Peak + Inundation Depth And Decoding Influencing Features + + +
+ Timely, accurate, and reliable information is essential for decision-makers,
+emergency managers, and infrastructure operators during flood events. This
+study demonstrates that the proposed machine learning model, MaxFloodCast,
+trained on physics-based hydrodynamic simulations in Harris County, offers
+efficient and interpretable flood inundation depth predictions. Achieving an
+average R-squared of 0.949 and a Root Mean Square Error of 0.61 ft on unseen
+data, it proves reliable in forecasting peak flood inundation depths. Validated
+against Hurricane Harvey and Storm Imelda, MaxFloodCast shows potential for
+supporting near-time floodplain management and emergency operations. The
+model's interpretability offers decision-makers critical information to inform
+flood mitigation strategies, to prioritize areas with critical facilities, and
+to examine how rainfall in other watersheds influences flood exposure in one
+area. The MaxFloodCast model enables accurate and interpretable inundation
+depth predictions while significantly reducing computational time, thereby
+supporting emergency response efforts and flood risk management more
+effectively.
+
+
+
+
+
+
+ + ☆ Automated Sizing and Training of Efficient Deep Autoencoders using + Second Order Algorithms + + +
+ We propose a multi-step training method for designing generalized linear
+classifiers. First, an initial multi-class linear classifier is found through
+regression. Then the validation error is minimized by pruning unnecessary
+inputs. Simultaneously, desired outputs are improved via a method similar to
+the Ho-Kashyap rule. Next, the output discriminants are scaled to be net
+functions of sigmoidal output units in a generalized linear classifier. We then
+develop a family of batch training algorithms for the multilayer perceptron
+that optimize its hidden layer size and number of training epochs. Next, we
+combine pruning with a growing approach. Later, the input units are scaled to
+be the net functions of the sigmoidal output units, which are then fed as input
+to the MLP. We then propose corresponding improvements in each of the deep
+learning blocks, thereby improving the overall performance of the deep
+architecture. We discuss the principles and formulation regarding learning
+algorithms for deep autoencoders. We investigate several problems in deep
+autoencoder networks, including training issues, the theoretical, mathematical
+and experimental justification that the networks are linear, optimizing the
+number of hidden units in each layer, and determining the depth of the deep
+learning model. A direct implication of the current work is the ability to
+construct fast deep learning models using desktop-level computational
+resources. This, in our opinion, promotes our design philosophy of building
+small but powerful algorithms. Performance gains are demonstrated at each step.
+Using widely available datasets, the final network's ten-fold testing error is
+shown to be less than that of several other linear classifiers, generalized
+linear classifiers, multilayer perceptrons, and deep learners reported in the
+literature.
+
+
+
+
+
+
+ + ☆ Change Point Detection With Conceptors + + +
+ Offline change point detection seeks to identify points in a time series +where the data generating process changes. This problem is well studied for +univariate i.i.d. data, but becomes challenging with increasing dimension and +temporal dependence. For the at most one change point problem, we propose the +use of a conceptor matrix to learn the characteristic dynamics of a specified +training window in a time series. The associated random recurrent neural +network acts as a featurizer of the data, and change points are identified from +a univariate quantification of the distance between the featurization and the +space spanned by a representative conceptor matrix. This model agnostic method +can suggest potential locations of interest that warrant further study. We +prove that, under mild assumptions, the method provides a consistent estimate +of the true change point, and quantile estimates for statistics are produced +via a moving block bootstrap of the original data. The method is tested on +simulations from several classes of processes, and we evaluate performance with +clustering metrics, graphical methods, and observed Type 1 error control. We +apply our method to publicly available neural data from rats experiencing bouts +of non-REM sleep prior to exploration of a radial maze. + +
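A minimal numpy sketch of a conceptor matrix and a distance-style score of the kind used for change detection; the recurrent reservoir is omitted and the aperture value is an arbitrary choice, so this is purely illustrative:

```python
import numpy as np

def conceptor(states, aperture=10.0):
    # states: (n_samples, n_units) featurized (e.g. reservoir) states from the training window
    R = states.T @ states / len(states)                        # state correlation matrix
    return R @ np.linalg.inv(R + aperture ** -2 * np.eye(R.shape[0]))

rng = np.random.default_rng(0)
train_states = rng.standard_normal((500, 20))
C = conceptor(train_states)

def distance(x):
    # how far a new featurized point lies from the subspace the conceptor represents
    return np.linalg.norm(x - C @ x) / np.linalg.norm(x)

print(distance(rng.standard_normal(20)))                       # spikes in this score suggest a change point
```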
+
+ comment: Main Text 30 pages, 9 figures; Supplementary Material 29 pages, 2 + figures +
+
+
+
+
+ + ☆ Safety in Traffic Management Systems: A Comprehensive Survey + + +
+ Traffic management systems play a vital role in ensuring safe and efficient +transportation on roads. However, the use of advanced technologies in traffic +management systems has introduced new safety challenges. Therefore, it is +important to ensure the safety of these systems to prevent accidents and +minimize their impact on road users. In this survey, we provide a comprehensive +review of the literature on safety in traffic management systems. Specifically, +we discuss the different safety issues that arise in traffic management +systems, the current state of research on safety in these systems, and the +techniques and methods proposed to ensure the safety of these systems. We also +identify the limitations of the existing research and suggest future research +directions. + +
+
+ comment: Accepted by MDPI Designs journal, the Special Issue Design and + Application of Intelligent Transportation Systems. 30 pages, 6 figures, + published on 10 August 2023 +
+
+
+
+
+ + ☆ Towards a Causal Probabilistic Framework for Prediction, + Action-Selection & Explanations for Robot Block-Stacking Tasks IROS + + +
+ Uncertainties in the real world mean that it is impossible for system
+designers to anticipate and explicitly design for all scenarios that a robot
+might encounter. Thus, robots designed this way are fragile and fail outside of
+highly controlled environments. Causal models provide a principled framework to
+encode formal knowledge of the causal relationships that govern the robot's
+interaction with its environment, in addition to probabilistic representations
+of noise and uncertainty typically encountered by real-world robots. Combined
+with causal inference, these models permit an autonomous agent to understand,
+reason about, and explain its environment. In this work, we focus on a robot
+block-stacking task because of the fundamental perception and manipulation
+capabilities it demonstrates, which are required by many applications including
+warehouse logistics and domestic human support robotics. We propose a novel
+causal probabilistic framework to embed a physics simulation capability into a
+structural causal model to permit robots to perceive and assess the current
+state of a block-stacking task, reason about the next-best action from
+placement candidates, and generate post-hoc counterfactual explanations. We
+provide exemplar next-best action selection results and outline planned
+experimentation in simulated and real-world robot block-stacking tasks.
+
+
+
+ comment: 3 pages, 3 figures, accepted to the "Causality for Robotics: + Answering the Question of Why" workshop at the 2023 IEEE/RSJ International + Conference on Intelligent Robots and Systems (IROS) +
+
+
+
+
+ + ☆ Exploring Predicate Visual Context in Detecting of Human-Object + Interactions ICCV2023 + + +
+ Recently, the DETR framework has emerged as the dominant approach for +human--object interaction (HOI) research. In particular, two-stage +transformer-based HOI detectors are amongst the most performant and +training-efficient approaches. However, these often condition HOI +classification on object features that lack fine-grained contextual +information, eschewing pose and orientation information in favour of visual +cues about object identity and box extremities. This naturally hinders the +recognition of complex or ambiguous interactions. In this work, we study these +issues through visualisations and carefully designed experiments. Accordingly, +we investigate how best to re-introduce image features via cross-attention. +With an improved query design, extensive exploration of keys and values, and +box pair positional embeddings as spatial guidance, our model with enhanced +predicate visual context (PViC) outperforms state-of-the-art methods on the +HICO-DET and V-COCO benchmarks, while maintaining low training cost. + +
+
+ comment: To appear in ICCV2023 +
+
+
+
+
+ + ☆ Complex Facial Expression Recognition Using Deep Knowledge Distillation + of Basic Features + + +
+ Complex emotion recognition is a cognitive task that has so far eluded the +same excellent performance of other tasks that are at or above the level of +human cognition. Emotion recognition through facial expressions is particularly +difficult due to the complexity of emotions expressed by the human face. For a +machine to approach the same level of performance in this domain as a human, it +may need to synthesise knowledge and understand new concepts in real-time as +humans do. Humans are able to learn new concepts using only few examples, by +distilling the important information from memories and discarding the rest. +Similarly, continual learning methods learn new classes whilst retaining the +knowledge of known classes, whilst few-shot learning methods are able to learn +new classes using very few training examples. We propose a novel continual +learning method inspired by human cognition and learning that can accurately +recognise new compound expression classes using few training samples, by +building on and retaining its knowledge of basic expression classes. Using +GradCAM visualisations, we demonstrate the relationship between basic and +compound facial expressions, which our method leverages through knowledge +distillation and a novel Predictive Sorting Memory Replay. Our method achieves +the current state-of-the-art in continual learning for complex facial +expression recognition with 74.28% Overall Accuracy on new classes. We also +demonstrate that using continual learning for complex facial expression +recognition achieves far better performance than non-continual learning +methods, improving on state-of-the-art non-continual learning methods by +13.95%. To the best of our knowledge, our work is also the first to apply +few-shot learning to complex facial expression recognition, achieving the +state-of-the-art with 100% accuracy using a single training sample for each +expression class. + +
+
+ comment: 17 pages, 9 figures, 6 tables. Code available at + https://github.com/AngusMaiden/complex-FER +
+
+
+
+
+ + ☆ Assessing Guest Nationality Composition from Hotel Reviews + + +
+ Many hotels target guest acquisition efforts to specific markets in order to +best anticipate individual preferences and needs of their guests. Likewise, +such strategic positioning is a prerequisite for efficient marketing budget +allocation. Official statistics report on the number of visitors from different +countries, but no fine-grained information on the guest composition of +individual businesses exists. There is, however, growing interest in such data +from competitors, suppliers, researchers and the general public. We demonstrate +how machine learning can be leveraged to extract references to guest +nationalities from unstructured text reviews in order to dynamically assess and +monitor the dynamics of guest composition of individual businesses. In +particular, we show that a rather simple architecture of pre-trained embeddings +and stacked LSTM layers provides a better performance-runtime tradeoff than +more complex state-of-the-art language models. + +
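A hedged Keras sketch of the "pre-trained embeddings plus stacked LSTM layers" architecture mentioned above; the vocabulary size, dimensions, and token-level tagging scheme for nationality mentions are assumptions for illustration, not the paper's configuration:

```python
import tensorflow as tf

vocab_size, emb_dim, max_len, n_tags = 20000, 100, 200, 3     # e.g. O / B-NAT / I-NAT token tags

model = tf.keras.Sequential([
    tf.keras.Input(shape=(max_len,), dtype="int32"),
    tf.keras.layers.Embedding(vocab_size, emb_dim),           # pre-trained vectors would be loaded here
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.LSTM(64, return_sequences=True),          # stacked LSTM layers
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_tags, activation="softmax")),
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
print(model.output_shape)                                     # (None, 200, 3): one tag per review token
```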
+
+
+
+
+ + ☆ Physical Adversarial Attacks For Camera-based Smart Systems: Current + Trends, Categorization, Applications, Research Challenges, and Future Outlook + + +
+ In this paper, we present a comprehensive survey of the current trends +focusing specifically on physical adversarial attacks. We aim to provide a +thorough understanding of the concept of physical adversarial attacks, +analyzing their key characteristics and distinguishing features. Furthermore, +we explore the specific requirements and challenges associated with executing +attacks in the physical world. Our article delves into various physical +adversarial attack methods, categorized according to their target tasks in +different applications, including classification, detection, face recognition, +semantic segmentation and depth estimation. We assess the performance of these +attack methods in terms of their effectiveness, stealthiness, and robustness. +We examine how each technique strives to ensure the successful manipulation of +DNNs while mitigating the risk of detection and withstanding real-world +distortions. Lastly, we discuss the current challenges and outline potential +future research directions in the field of physical adversarial attacks. We +highlight the need for enhanced defense mechanisms, the exploration of novel +attack strategies, the evaluation of attacks in different application domains, +and the establishment of standardized benchmarks and evaluation criteria for +physical adversarial attacks. Through this comprehensive survey, we aim to +provide a valuable resource for researchers, practitioners, and policymakers to +gain a holistic understanding of physical adversarial attacks in computer +vision and facilitate the development of robust and secure DNN-based systems. + +
+
+
+
+
+ + ☆ Phased Deep Spatio-temporal Learning for Highway Traffic Volume + Prediction + + +
+ Inter-city highway transportation is significant for citizens' modern urban
+life and generates heterogeneous sensory data with spatio-temporal
+characteristics. As a routine analysis in the transportation domain, daily
+traffic volume estimation at highway toll stations faces challenges, including
+the lack of exploration of correlative spatio-temporal features from a
+long-term perspective and of effective means to deal with the data imbalance
+that always deteriorates predictive performance. In this paper, a deep
+spatio-temporal learning method is proposed to predict daily traffic volume in
+three phases. In the feature pre-processing phase, data is carefully normalized
+according to its latent long-tail distribution. In the spatio-temporal learning
+phase, a hybrid model combining a fully convolutional network (FCN) and long
+short-term memory (LSTM) is employed, which considers time, space, meteorology,
+and calendar information from heterogeneous data. In the decision phase,
+traffic volumes for the coming day at network-wide toll stations are obtained
+effectively, with special calibration for the vital few highway stations. Using
+real-world data from one Chinese provincial highway, extensive experiments show
+that our method clearly improves predictive accuracy over various traditional
+models, reaching 5.269 and 0.997 on the MPAE and R-square metrics,
+respectively.
+
+
+
+
+
+
+ + ☆ Gaussian Process Regression for Maximum Entropy Distribution + + +
+ Maximum-Entropy Distributions offer an attractive family of probability
+densities suitable for moment closure problems. Yet finding the Lagrange
+multipliers that parametrize these distributions turns out to be a
+computational bottleneck in practical closure settings. Motivated by the recent
+success of Gaussian processes, we investigate the suitability of Gaussian
+priors to approximate the Lagrange multipliers as a map of a given set of
+moments. Examining various kernel functions, the hyperparameters are optimized
+by maximizing the log-likelihood. The performance of the devised data-driven
+Maximum-Entropy closure is studied for a couple of test cases, including the
+relaxation of non-equilibrium distributions governed by the
+Bhatnagar-Gross-Krook and Boltzmann kinetic equations.
+
+
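For reference, the exponential-family form of the maximum-entropy ansatz being parametrized, written in assumed notation (the specific moment functions and symbols are not taken from the paper):

```latex
% Given target moments m_i = \int \phi_i(v)\, f(v)\, dv, the maximum-entropy density is
\[
    f_{\boldsymbol{\lambda}}(v) \;=\; \exp\!\Big( \sum_{i} \lambda_i \, \phi_i(v) \Big),
\]
% and the Gaussian-process surrogate learns the map \mathbf{m} \mapsto \boldsymbol{\lambda},
% avoiding an iterative optimization for the multipliers at every closure evaluation.
```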
+
+
+
+
+ + ☆ Identification of the Relevance of Comments in Codes Using Bag of Words + and Transformer Based Models + + +
+ The Forum for Information Retrieval (FIRE) started a shared task this year for
+the classification of comments on different code segments. This is a binary
+text classification task whose objective is to identify whether comments given
+for certain code segments are relevant or not. The BioNLP-IISERB group at the
+Indian Institute of Science Education and Research Bhopal (IISERB) participated
+in this task and submitted five runs for five different models. The paper
+presents an overview of the models and other significant findings on the
+training corpus. The methods involve different feature engineering schemes and
+text classification techniques. The performance of the classical bag-of-words
+model and transformer-based models was explored to identify significant
+features from the given training corpus. We explored different classifiers,
+viz. random forest, support vector machine, and logistic regression, using the
+bag-of-words model. Furthermore, pre-trained transformer-based models like
+BERT, RoBERT and ALBERT were also used by fine-tuning them on the given
+training corpus. The performance of these models on the training corpus is
+reported, and the best five models were applied to the given test corpus. The
+empirical results show that the bag-of-words model outperforms the
+transformer-based models; however, the performance of our runs is not
+particularly strong on either the training or the test corpus. This paper also
+addresses the limitations of the models and the scope for further improvement.
+
+
+
+
+
+
+ + ☆ CompTLL-UNet: Compressed Domain Text-Line Localization in Challenging + Handwritten Documents using Deep Feature Learning from JPEG Coefficients + + +
+ Automatic localization of text-lines in handwritten documents is still an open
+and challenging research problem. Various writing issues, such as uneven
+spacing between lines, oscillating and touching text, and the presence of skew,
+become much more challenging when complex handwritten document images are
+segmented directly in their compressed representation. The conventional way of
+processing compressed documents is through decompression; in this paper,
+however, we propose an approach that employs deep feature learning directly on
+the JPEG compressed coefficients, without full decompression, to accomplish
+text-line localization in the JPEG compressed domain. A modified U-Net
+architecture known as Compressed Text-Line Localization Network (CompTLL-UNet)
+is designed to accomplish this. The model is trained and tested on JPEG
+compressed versions of benchmark datasets including ICDAR2017 (cBAD) and
+ICDAR2019 (cBAD), achieving state-of-the-art performance with reduced storage
+and computational costs in the JPEG compressed domain.
+
+
+
+ comment: Accepted in 7th Asian Conference on Pattern Recognition (ACPR 2023), + 5-8 November 2023, Kitakyushu, Japan +
+
+
+
+
+ + ☆ Application of Artificial Neural Networks for Investigation of Pressure + Filtration Performance, a Zinc Leaching Filter Cake Moisture Modeling + + +
+ Machine Learning (ML) is a powerful tool for material science applications. +Artificial Neural Network (ANN) is a machine learning technique that can +provide high prediction accuracy. This study aimed to develop an ANN model to +predict the cake moisture of the pressure filtration process of zinc +production. The cake moisture was influenced by seven parameters: temperature +(35 and 65 Celsius), solid concentration (0.2 and 0.38 g/L), pH (2, 3.5, and +5), air-blow time (2, 10, and 15 min), cake thickness (14, 20, 26, and 34 mm), +pressure, and filtration time. The study conducted 288 tests using two types of +fabrics: polypropylene (S1) and polyester (S2). The ANN model was evaluated by +the Coefficient of determination (R2), the Mean Square Error (MSE), and the +Mean Absolute Error (MAE) metrics for both datasets. The results showed R2 +values of 0.88 and 0.83, MSE values of 6.243x10-07 and 1.086x10-06, and MAE +values of 0.00056 and 0.00088 for S1 and S2, respectively. These results +indicated that the ANN model could predict the cake moisture of pressure +filtration in the zinc leaching process with high accuracy. + +
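A hedged scikit-learn sketch of the kind of ANN regression and evaluation metrics reported above; the synthetic data merely stands in for the 288 filtration tests and the network size is an arbitrary choice:

```python
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

rng = np.random.default_rng(0)
X = rng.uniform(size=(288, 7))   # temperature, solids, pH, air-blow time, thickness, pressure, filtration time
y = 0.02 + 0.01 * X @ rng.uniform(size=7) + rng.normal(scale=0.001, size=288)   # toy cake moisture

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
ann = MLPRegressor(hidden_layer_sizes=(16, 8), max_iter=5000, random_state=0).fit(X_tr, y_tr)
pred = ann.predict(X_te)
print(r2_score(y_te, pred), mean_squared_error(y_te, pred), mean_absolute_error(y_te, pred))
```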
+
+
+
+
+ + ☆ PDE Discovery for Soft Sensors Using Coupled Physics-Informed Neural + Network with Akaike's Information Criterion + + +
+ Soft sensors have been extensively used to monitor key variables using easy-to-measure variables and mathematical models. Partial differential equations (PDEs) are model candidates for soft sensors in industrial processes with spatiotemporal dependence. However, gaps often exist between idealized PDEs and practical situations. Discovering proper structures of PDEs, including the differential operators and source terms, can remedy these gaps. To this end, a coupled physics-informed neural network with Akaike's information criterion (CPINN-AIC) is proposed for PDE discovery for soft sensors. First, CPINN is adopted for obtaining solutions and source terms satisfying PDEs. Then, we propose a data-physics-hybrid loss function for training CPINN, in which undetermined combinations of differential operators are involved. Consequently, AIC is used to discover the proper combination of differential operators. Finally, artificial and practical datasets are used to verify the feasibility and effectiveness of CPINN-AIC for soft sensors. The proposed CPINN-AIC is a data-driven method to discover proper PDE structures and neural network-based solutions for soft sensors. + +
+
+
+
+
+ + ☆ Uncertainty Quantification for Image-based Traffic Prediction across + Cities + + +
+ Despite the strong predictive performance of deep learning models for traffic +prediction, their widespread deployment in real-world intelligent +transportation systems has been restrained by a lack of interpretability. +Uncertainty quantification (UQ) methods provide an approach to induce +probabilistic reasoning, improve decision-making and enhance model deployment +potential. To gain a comprehensive picture of the usefulness of existing UQ +methods for traffic prediction and the relation between obtained uncertainties +and city-wide traffic dynamics, we investigate their application to a +large-scale image-based traffic dataset spanning multiple cities and time +periods. We compare two epistemic and two aleatoric UQ methods on both temporal +and spatio-temporal transfer tasks, and find that meaningful uncertainty +estimates can be recovered. We further demonstrate how uncertainty estimates +can be employed for unsupervised outlier detection on changes in city traffic +dynamics. We find that our approach can capture both temporal and spatial +effects on traffic behaviour in a representative case study for the city of +Moscow. Our work presents a further step towards boosting uncertainty awareness +in traffic prediction tasks, and aims to highlight the value contribution of UQ +methods to a better understanding of city traffic dynamics. + +
+
+ comment: 39 pages, 22 figures. Code publicly available at: + https://github.com/alextimans/traffic4cast-uncertainty +
+
+
+
+
+ + ☆ Learning Control Policies for Variable Objectives from Offline Data + + +
+ Offline reinforcement learning provides a viable approach to obtain advanced +control strategies for dynamical systems, in particular when direct interaction +with the environment is not available. In this paper, we introduce a conceptual +extension for model-based policy search methods, called variable objective +policy (VOP). With this approach, policies are trained to generalize +efficiently over a variety of objectives, which parameterize the reward +function. We demonstrate that by altering the objectives passed as input to the +policy, users gain the freedom to adjust its behavior or re-balance +optimization targets at runtime, without need for collecting additional +observation batches or re-training. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Hawkes Processes with Delayed Granger Causality + + +
+ We aim to explicitly model the delayed Granger causal effects based on +multivariate Hawkes processes. The idea is inspired by the fact that a causal +event usually takes some time to exert an effect. Studying this time lag itself +is of interest. Given the proposed model, we first prove the identifiability of +the delay parameter under mild conditions. We further investigate a model +estimation method under a complex setting, where we want to infer the posterior +distribution of the time lags and understand how this distribution varies +across different scenarios. We treat the time lags as latent variables and +formulate a Variational Auto-Encoder (VAE) algorithm to approximate the +posterior distribution of the time lags. By explicitly modeling the time lags +in Hawkes processes, we add flexibility to the model. The inferred time-lag +posterior distributions are of scientific meaning and help trace the original +causal time that supports the root cause analysis. We empirically evaluate our +model's event prediction and time-lag inference accuracy on synthetic and real +data, achieving promising results. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Composable Function-preserving Expansions for Transformer Architectures + + +
+ Training state-of-the-art neural networks requires a high cost in terms of +compute and time. Model scale is recognized to be a critical factor to achieve +and improve the state-of-the-art. Increasing the scale of a neural network +normally requires restarting from scratch by randomly initializing all the +parameters of the model, as this implies a change of architecture's parameters +that does not allow for a straightforward transfer of knowledge from smaller +size models. In this work, we propose six composable transformations to +incrementally increase the size of transformer-based neural networks while +preserving functionality, allowing to expand the capacity of the model as +needed. We provide proof of exact function preservation under minimal +initialization constraints for each transformation. The proposed methods may +enable efficient training pipelines for larger and more powerful models by +progressively expanding the architecture throughout training. + +
+
+
+
+
+ + ☆ Diffusion-based Visual Counterfactual Explanations -- Towards Systematic + Quantitative Evaluation ECML 2023 + + +
+ Latest methods for visual counterfactual explanations (VCE) harness the power +of deep generative models to synthesize new examples of high-dimensional images +of impressive quality. However, it is currently difficult to compare the +performance of these VCE methods as the evaluation procedures largely vary and +often boil down to visual inspection of individual examples and small scale +user studies. In this work, we propose a framework for systematic, quantitative +evaluation of the VCE methods and a minimal set of metrics to be used. We use +this framework to explore the effects of certain crucial design choices in the +latest diffusion-based generative models for VCEs of natural image +classification (ImageNet). We conduct a battery of ablation-like experiments, +generating thousands of VCEs for a suite of classifiers of various complexity, +accuracy and robustness. Our findings suggest multiple directions for future +advancements and improvements of VCE methods. By sharing our methodology and +our approach to tackle the computational challenges of such a study on a +limited hardware setup (including the complete code base), we offer a valuable +guidance for researchers in the field fostering consistency and transparency in +the assessment of counterfactual explanations. + +
+
+ comment: Accepted at the 5th International Workshop on eXplainable Knowledge + Discovery in Data Mining @ ECML 2023 +
+
+
+
+
+ + ☆ Neural Conversation Models and How to Rein Them in: A Survey of Failures + and Fixes + + +
+ Recent conditional language models are able to continue any kind of text +source in an often seemingly fluent way. This fact encouraged research in the +area of open-domain conversational systems that are based on powerful language +models and aim to imitate an interlocutor by generating appropriate +contributions to a written dialogue. From a linguistic perspective, however, +the complexity of contributing to a conversation is high. In this survey, we +interpret Grice's maxims of cooperative conversation from the perspective of +this specific research area and systematize the literature under the aspect of +what makes a contribution appropriate: A neural conversation model has to be +fluent, informative, consistent, coherent, and follow social norms. In order to +ensure these qualities, recent approaches try to tame the underlying language +models at various intervention points, such as data, training regime or +decoding. Sorted by these categories and intervention points, we discuss +promising attempts and suggest novel ways for future research. + +
+
+ comment: Represents the state of the field in 2022; partially based on the + first author's 2022 PhD thesis
+
+
+
+
+ + ☆ Reinforcement Logic Rule Learning for Temporal Point Processes + + +
+ We propose a framework that can incrementally expand the explanatory temporal +logic rule set to explain the occurrence of temporal events. Leveraging the +temporal point process modeling and learning framework, the rule content and +weights will be gradually optimized until the likelihood of the observational +event sequences is optimal. The proposed algorithm alternates between a master +problem, where the current rule set weights are updated, and a subproblem, +where a new rule is searched and included to best increase the likelihood. The +formulated master problem is convex and relatively easy to solve using +continuous optimization, whereas the subproblem requires searching the huge +combinatorial rule predicate and relationship space. To tackle this challenge, +we propose a neural search policy to learn to generate the new rule content as +a sequence of actions. The policy parameters will be trained end-to-end using +the reinforcement learning framework, where the reward signals can be +efficiently queried by evaluating the subproblem objective. The trained policy +can be used to generate new rules in a controllable way. We evaluate our +methods on both synthetic and real healthcare datasets, obtaining promising +results. + +
+
+ comment: 27 pages +
+
+
+
+
+ + ☆ Experts Weights Averaging: A New General Training Scheme for Vision + Transformers + + +
+ Structural re-parameterization is a general training scheme for Convolutional Neural Networks (CNNs) that achieves performance improvement without increasing inference cost. As Vision Transformers (ViTs) are gradually surpassing CNNs in various visual tasks, one may ask: does a training scheme exist specifically for ViTs that can also achieve performance improvement without increasing inference cost? Recently, Mixture-of-Experts (MoE) has attracted increasing attention, as it can efficiently scale up the capacity of Transformers at a fixed cost through sparsely activated experts. Considering that MoE can also be viewed as a multi-branch structure, can we utilize MoE to implement a ViT training scheme similar to structural re-parameterization? In this paper, we affirmatively answer these questions with a new general training strategy for ViTs. Specifically, we decouple the training and inference phases of ViTs. During training, we replace some Feed-Forward Networks (FFNs) of the ViT with specially designed, more efficient MoEs that assign tokens to experts by random uniform partition, and perform Experts Weights Averaging (EWA) on these MoEs at the end of each iteration. After training, we convert each MoE into an FFN by averaging the experts, transforming the model back into the original ViT for inference. We further provide a theoretical analysis to show why and how it works. Comprehensive experiments across various 2D and 3D visual tasks, ViT architectures, and datasets validate the effectiveness and generalizability of the proposed training scheme. Besides, our training scheme can also be applied to improve performance when fine-tuning ViTs. Lastly, and equally important, the proposed EWA technique can significantly improve the effectiveness of naive MoE on various small 2D visual datasets and 3D visual tasks. + +
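+ A hedged sketch of the two operations described above, averaging expert weights during training and collapsing an MoE back into a single FFN for inference; the MoE layout below (a list of identical MLP experts with random uniform token assignment) is an assumption made only for illustration.
```python
# Sketch of Experts Weights Averaging (EWA) and MoE-to-FFN conversion.
import copy
import torch
import torch.nn as nn

class SimpleMoE(nn.Module):
    def __init__(self, dim, hidden, num_experts):
        super().__init__()
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
            for _ in range(num_experts)
        )

    def forward(self, x):  # x: (tokens, dim); random uniform token-to-expert partition
        out = torch.empty_like(x)
        assign = torch.randint(len(self.experts), (x.shape[0],))
        for i, expert in enumerate(self.experts):
            mask = assign == i
            if mask.any():
                out[mask] = expert(x[mask])
        return out

@torch.no_grad()
def experts_weights_averaging(moe: SimpleMoE, alpha: float = 0.5):
    """Pull every expert's parameters toward the mean over all experts."""
    for params in zip(*(e.parameters() for e in moe.experts)):
        mean = torch.stack([p.data for p in params]).mean(dim=0)
        for p in params:
            p.data.mul_(1 - alpha).add_(alpha * mean)

@torch.no_grad()
def moe_to_ffn(moe: SimpleMoE) -> nn.Module:
    """Average the experts into a single FFN for inference."""
    ffn = copy.deepcopy(moe.experts[0])
    for p_ffn, params in zip(ffn.parameters(), zip(*(e.parameters() for e in moe.experts))):
        p_ffn.data.copy_(torch.stack([p.data for p in params]).mean(dim=0))
    return ffn

moe = SimpleMoE(dim=32, hidden=64, num_experts=4)
x = torch.randn(10, 32)
_ = moe(x)                       # training-time forward with random partition
experts_weights_averaging(moe)   # EWA step at the end of an iteration
ffn = moe_to_ffn(moe)            # inference-time conversion back to one FFN
print(ffn(x).shape)
```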
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Toward a Better Understanding of Loss Functions for Collaborative + Filtering CIKM 2023 + + +
+ Collaborative filtering (CF) is a pivotal technique in modern recommender +systems. The learning process of CF models typically consists of three +components: interaction encoder, loss function, and negative sampling. Although +many existing studies have proposed various CF models to design sophisticated +interaction encoders, recent work shows that simply reformulating the loss +functions can achieve significant performance gains. This paper delves into +analyzing the relationship among existing loss functions. Our mathematical +analysis reveals that the previous loss functions can be interpreted as +alignment and uniformity functions: (i) the alignment matches user and item +representations, and (ii) the uniformity disperses user and item distributions. +Inspired by this analysis, we propose a novel loss function that improves the +design of alignment and uniformity considering the unique patterns of datasets +called Margin-aware Alignment and Weighted Uniformity (MAWU). The key novelty +of MAWU is two-fold: (i) margin-aware alignment (MA) mitigates +user/item-specific popularity biases, and (ii) weighted uniformity (WU) adjusts +the significance between user and item uniformities to reflect the inherent +characteristics of datasets. Extensive experimental results show that MF and +LightGCN equipped with MAWU are comparable or superior to state-of-the-art CF +models with various loss functions on three public datasets. + +
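+ The sketch below illustrates an alignment-and-uniformity style loss in the spirit of the analysis above; the margin and weighting terms are simplified stand-ins for the MA and WU components, and the exact formulations follow the paper.
```python
# Simplified alignment + weighted-uniformity loss for user/item embeddings.
import torch
import torch.nn.functional as F

def alignment(user_emb, item_emb, margin=0.0):
    # Pull matched user/item pairs together, minus a (simplified) margin.
    u = F.normalize(user_emb, dim=-1)
    i = F.normalize(item_emb, dim=-1)
    return ((u - i).norm(dim=-1).pow(2) - margin).clamp(min=0).mean()

def uniformity(emb, t=2.0):
    # Encourage embeddings to spread out on the hypersphere.
    e = F.normalize(emb, dim=-1)
    return torch.pdist(e, p=2).pow(2).mul(-t).exp().mean().log()

def mawu_like_loss(user_emb, item_emb, margin=0.1, gamma_user=1.0, gamma_item=1.0):
    # gamma_user / gamma_item play the role of the dataset-dependent weighting.
    return (alignment(user_emb, item_emb, margin)
            + gamma_user * uniformity(user_emb)
            + gamma_item * uniformity(item_emb))

users = torch.randn(128, 64, requires_grad=True)
items = torch.randn(128, 64, requires_grad=True)
loss = mawu_like_loss(users, items)
loss.backward()
print(float(loss))
```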
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ☆ Safeguarding Learning-based Control for Smart Energy Systems with + Sampling Specifications + + +
+ We study challenges using reinforcement learning in controlling energy +systems, where apart from performance requirements, one has additional safety +requirements such as avoiding blackouts. We detail how these safety +requirements in real-time temporal logic can be strengthened via discretization +into linear temporal logic (LTL), such that the satisfaction of the LTL +formulae implies the satisfaction of the original safety requirements. The +discretization enables advanced engineering methods such as synthesizing +shields for safe reinforcement learning as well as formal verification, where +for statistical model checking, the probabilistic guarantee acquired by LTL +model checking forms a lower bound for the satisfaction of the original +real-time safety requirements. + +
+
+
+
+
+ + ☆ Adaptive SGD with Polyak stepsize and Line-search: Robust Convergence + and Variance Reduction + + +
+ The recently proposed stochastic Polyak stepsize (SPS) and stochastic +line-search (SLS) for SGD have shown remarkable effectiveness when training +over-parameterized models. However, in non-interpolation settings, both +algorithms only guarantee convergence to a neighborhood of a solution which may +result in a worse output than the initial guess. While artificially decreasing +the adaptive stepsize has been proposed to address this issue (Orvieto et al. +[2022]), this approach results in slower convergence rates for convex and +over-parameterized models. In this work, we make two contributions: Firstly, we +propose two new variants of SPS and SLS, called AdaSPS and AdaSLS, which +guarantee convergence in non-interpolation settings and maintain sub-linear and +linear convergence rates for convex and strongly convex functions when training +over-parameterized models. AdaSLS requires no knowledge of problem-dependent +parameters, and AdaSPS requires only a lower bound of the optimal function +value as input. Secondly, we equip AdaSPS and AdaSLS with a novel variance +reduction technique and obtain algorithms that require +$\smash{\widetilde{\mathcal{O}}}(n+1/\epsilon)$ gradient evaluations to achieve +an $\mathcal{O}(\epsilon)$-suboptimality for convex functions, which improves +upon the slower $\mathcal{O}(1/\epsilon^2)$ rates of AdaSPS and AdaSLS without +variance reduction in the non-interpolation regimes. Moreover, our result +matches the fast rates of AdaSVRG but removes the inner-outer-loop structure, +which is easier to implement and analyze. Finally, numerical experiments on +synthetic and real datasets validate our theory and demonstrate the +effectiveness and robustness of our algorithms. + +
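+ For context, a minimal sketch of an SGD step with the basic stochastic Polyak stepsize that AdaSPS builds on; the loss lower bound and scaling constant are assumptions, and the AdaSPS/AdaSLS refinements and variance reduction are not reproduced here.
```python
# SGD with a (basic) stochastic Polyak stepsize on a toy least-squares problem.
import torch

def sps_step(params, loss, l_star=0.0, c=0.5, gamma_max=1.0):
    grads = torch.autograd.grad(loss, params)
    grad_norm_sq = sum(g.pow(2).sum() for g in grads)
    # Polyak-style stepsize: (f_i(x) - lower bound) / (c * ||grad f_i(x)||^2)
    gamma = (loss.detach() - l_star) / (c * grad_norm_sq + 1e-12)
    gamma = torch.clamp(gamma, max=gamma_max)
    with torch.no_grad():
        for p, g in zip(params, grads):
            p.sub_(gamma * g)
    return float(gamma)

w = torch.zeros(3, requires_grad=True)
A, b = torch.randn(32, 3), torch.randn(32)
for _ in range(50):
    idx = torch.randint(0, 32, (8,))               # mini-batch
    loss = ((A[idx] @ w - b[idx]) ** 2).mean()
    sps_step([w], loss)
print("final loss:", float(((A @ w - b) ** 2).mean()))
```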
+
+
+
+
+ + ☆ Cost-effective On-device Continual Learning over Memory Hierarchy with + Miro + + +
+ Continual learning (CL) trains NN models incrementally from a continuous +stream of tasks. To remember previously learned knowledge, prior studies store +old samples over a memory hierarchy and replay them when new tasks arrive. Edge +devices that adopt CL to preserve data privacy are typically energy-sensitive +and thus require high model accuracy while not compromising energy efficiency, +i.e., cost-effectiveness. Our work is the first to explore the design space of +hierarchical memory replay-based CL to gain insights into achieving +cost-effectiveness on edge devices. We present Miro, a novel system runtime +that carefully integrates our insights into the CL framework by enabling it to +dynamically configure the CL system based on resource states for the best +cost-effectiveness. To reach this goal, Miro also performs online profiling on +parameters with clear accuracy-energy trade-offs and adapts to optimal values +with low overhead. Extensive evaluations show that Miro significantly +outperforms baseline systems we build for comparison, consistently achieving +higher cost-effectiveness. + +
+
+ comment: This paper is submitted for publication to MobiCom 2023 +
+
+
+
+
+ + ☆ Towards Instance-adaptive Inference for Federated Learning + + +
+ Federated learning (FL) is a distributed learning paradigm that enables +multiple clients to learn a powerful global model by aggregating local +training. However, the performance of the global model is often hampered by +non-i.i.d. distribution among the clients, requiring extensive efforts to +mitigate inter-client data heterogeneity. Going beyond inter-client data +heterogeneity, we note that intra-client heterogeneity can also be observed on +complex real-world data and seriously deteriorate FL performance. In this +paper, we present a novel FL algorithm, i.e., FedIns, to handle intra-client +data heterogeneity by enabling instance-adaptive inference in the FL framework. +Instead of huge instance-adaptive models, we resort to a parameter-efficient +fine-tuning method, i.e., scale and shift deep features (SSF), upon a +pre-trained model. Specifically, we first train an SSF pool for each client, +and aggregate these SSF pools on the server side, thus still maintaining a low +communication cost. To enable instance-adaptive inference, for a given +instance, we dynamically find the best-matched SSF subsets from the pool and +aggregate them to generate an adaptive SSF specified for the instance, thereby +reducing the intra-client as well as the inter-client heterogeneity. Extensive +experiments show that our FedIns outperforms state-of-the-art FL algorithms, +e.g., a 6.64\% improvement against the top-performing method with less than +15\% communication cost on Tiny-ImageNet. Our code and models will be publicly +released. + +
+
+
+
+
+ + ☆ Controlling Character Motions without Observable Driving Source + + +
+ How to generate diverse, life-like, and unlimited long head/body sequences +without any driving source? We argue that this under-investigated research +problem is non-trivial at all, and has unique technical challenges behind it. +Without semantic constraints from the driving sources, using the standard +autoregressive model to generate infinitely long sequences would easily result +in 1) out-of-distribution (OOD) issue due to the accumulated error, 2) +insufficient diversity to produce natural and life-like motion sequences and 3) +undesired periodic patterns along the time. To tackle the above challenges, we +propose a systematic framework that marries the benefits of VQ-VAE and a novel +token-level control policy trained with reinforcement learning using carefully +designed reward functions. A high-level prior model can be easily injected on +top to generate unlimited long and diverse sequences. Although we focus on no +driving sources now, our framework can be generalized for controlled synthesis +with explicit driving sources. Through comprehensive evaluations, we conclude +that our proposed framework can address all the above-mentioned challenges and +outperform other strong baselines very significantly. + +
+
+
+
+
+ + ☆ Large Language Models for Telecom: Forthcoming Impact on the Industry + + +
+ Large Language Models (LLMs) have emerged as a transformative force, +revolutionizing numerous fields well beyond the conventional domain of Natural +Language Processing (NLP) and garnering unprecedented attention. As LLM +technology continues to progress, the telecom industry is facing the prospect +of its potential impact on its landscape. To elucidate these implications, we +delve into the inner workings of LLMs, providing insights into their current +capabilities and limitations. We also examine the use cases that can be readily +implemented in the telecom industry, streamlining numerous tasks that currently +hinder operational efficiency and demand significant manpower and engineering +expertise. Furthermore, we uncover essential research directions that deal with +the distinctive challenges of utilizing the LLMs within the telecom domain. +Addressing these challenges represents a significant stride towards fully +harnessing the potential of LLMs and unlocking their capabilities to the +fullest extent within the telecom domain. + +
+
+
+
+
+ + ☆ Does AI for science need another ImageNet Or totally different + benchmarks? A case study of machine learning force fields + + +
+ AI for science (AI4S) is an emerging research field that aims to enhance the +accuracy and speed of scientific computing tasks using machine learning +methods. Traditional AI benchmarking methods struggle to adapt to the unique +challenges posed by AI4S because they assume data in training, testing, and +future real-world queries are independent and identically distributed, while +AI4S workloads anticipate out-of-distribution problem instances. This paper +investigates the need for a novel approach to effectively benchmark AI for +science, using the machine learning force field (MLFF) as a case study. MLFF is +a method to accelerate molecular dynamics (MD) simulation with low +computational cost and high accuracy. We identify various missed opportunities +in scientifically meaningful benchmarking and propose solutions to evaluate +MLFF models, specifically in the aspects of sample efficiency, time domain +sensitivity, and cross-dataset generalization capabilities. By setting up the +problem instantiation similar to the actual scientific applications, more +meaningful performance metrics from the benchmark can be achieved. This suite +of metrics has demonstrated a better ability to assess a model's performance in +real-world scientific applications, in contrast to traditional AI benchmarking +methodologies. This work is a component of the SAIBench project, an AI4S +benchmarking suite. The project homepage is +https://www.computercouncil.org/SAIBench. + +
+
+
+
+
+ + ☆ Fast and Accurate Transferability Measurement by Evaluating Intra-class + Feature Variance + + +
+ Given a set of pre-trained models, how can we quickly and accurately find the most useful pre-trained model for a downstream task? Transferability measurement quantifies how transferable a pre-trained model learned on a source task is to a target task. It is used to quickly rank pre-trained models for a given task and is thus a crucial step in transfer learning. Existing methods measure transferability as the discrimination ability of a source model on target data before transfer learning, which cannot accurately estimate fine-tuning performance. Some of them restrict the application of transferability measurement to selecting the best supervised pre-trained models that have classifiers. It is important to have a general method for measuring transferability that can be applied in a variety of situations, such as selecting the best self-supervised pre-trained models that do not have classifiers, or selecting the best transferring layer for a target task. In this work, we propose TMI (Transferability Measurement with Intra-class Feature Variance), a fast and accurate algorithm to measure transferability. We view transferability as the generalization of a pre-trained model on a target task, measured by intra-class feature variance. Intra-class variance evaluates the adaptability of the model to a new task, which reflects how transferable the model is. Compared to previous studies that estimate how discriminative the models are, intra-class variance is more accurate because it does not require an optimal feature extractor and classifier. Extensive experiments on real-world datasets show that TMI outperforms competitors for selecting the top-5 best models, and exhibits consistently better correlation in 13 out of 17 cases. + +
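+ A rough sketch of scoring a pre-trained model by the intra-class variance of its features on target data, in the spirit of the method above; the exact normalization used in the paper may differ.
```python
# Intra-class feature variance as a simple transferability-style score.
import numpy as np

def intra_class_variance_score(features: np.ndarray, labels: np.ndarray) -> float:
    """Average per-class total variance of pre-trained features on target data.

    features: (n_samples, dim) activations extracted by the pre-trained model.
    labels:   (n_samples,) target-task labels.
    """
    classes = np.unique(labels)
    score = 0.0
    for c in classes:
        cls = features[labels == c]
        score += cls.var(axis=0).sum()   # total variance within class c
    return score / len(classes)

# Toy target data with two classes.
rng = np.random.default_rng(0)
feats = np.concatenate([rng.normal(0, 1.0, (50, 16)), rng.normal(3, 0.5, (50, 16))])
labs = np.array([0] * 50 + [1] * 50)
print("intra-class variance:", intra_class_variance_score(feats, labs))
```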
+
+
+
+
+ + ☆ Learning nonparametric DAGs with incremental information via high-order + HSIC + + +
+ Score-based methods for learning Bayesian networks (BNs) aim to maximize global score functions. However, if local variables have both direct and indirect dependence simultaneously, global optimization of the score functions misses edges between variables with an indirect dependence relationship, whose scores are smaller than those with a direct dependence relationship. In this paper, we present an identifiability condition based on a determined subset of parents to identify the underlying DAG. Using this identifiability condition, we develop a two-phase algorithm, namely the optimal-tuning (OT) algorithm, to locally amend the global optimization. In the optimal phase, an optimization problem based on the first-order Hilbert-Schmidt independence criterion (HSIC) gives an estimated skeleton as the initial determined parents subset. In the tuning phase, the skeleton is locally tuned by deletion, addition, and DAG-formalization strategies using the theoretically proved incremental properties of high-order HSIC. Numerical experiments on different synthetic and real-world datasets show that the OT algorithm outperforms existing methods. In particular, for the Sigmoid Mix model with a graph size of ${\rm\bf d=40}$, the structure intervention distance (SID) of the OT algorithm is 329.7 smaller than the one obtained by CAM, which indicates that the graph estimated by the OT algorithm misses fewer edges compared with CAM. + +
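+ For reference, a sketch of the (biased) HSIC estimator with Gaussian kernels, the dependence measure that the OT algorithm builds on; the kernel bandwidth is fixed for brevity.
```python
# Biased HSIC estimator with Gaussian kernels: trace(K H L H) / (n - 1)^2.
import numpy as np

def gaussian_kernel(x, sigma=1.0):
    d2 = ((x[:, None, :] - x[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2 * sigma ** 2))

def hsic(x, y, sigma=1.0):
    n = x.shape[0]
    K, L = gaussian_kernel(x, sigma), gaussian_kernel(y, sigma)
    H = np.eye(n) - np.ones((n, n)) / n          # centering matrix
    return np.trace(K @ H @ L @ H) / (n - 1) ** 2

rng = np.random.default_rng(0)
x = rng.normal(size=(200, 1))
y_dep = np.sin(3 * x) + 0.1 * rng.normal(size=(200, 1))   # dependent on x
y_ind = rng.normal(size=(200, 1))                          # independent of x
print("HSIC(x, dependent y):  ", hsic(x, y_dep))
print("HSIC(x, independent y):", hsic(x, y_ind))
```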
+
+
+
+
+ + ☆ Learned Point Cloud Compression for Classification SP 2023 + + +
+ Deep learning is increasingly being used to perform machine vision tasks such +as classification, object detection, and segmentation on 3D point cloud data. +However, deep learning inference is computationally expensive. The limited +computational capabilities of end devices thus necessitate a codec for +transmitting point cloud data over the network for server-side processing. Such +a codec must be lightweight and capable of achieving high compression ratios +without sacrificing accuracy. Motivated by this, we present a novel point cloud +codec that is highly specialized for the machine task of classification. Our +codec, based on PointNet, achieves a significantly better rate-accuracy +trade-off in comparison to alternative methods. In particular, it achieves a +94% reduction in BD-bitrate over non-specialized codecs on the ModelNet40 +dataset. For low-resource end devices, we also propose two lightweight +configurations of our encoder that achieve similar BD-bitrate reductions of 93% +and 92% with 3% and 5% drops in top-1 accuracy, while consuming only 0.470 and +0.048 encoder-side kMACs/point, respectively. Our codec demonstrates the +potential of specialized codecs for machine analysis of point clouds, and +provides a basis for extension to more complex tasks and datasets in the +future. + +
+
+ comment: 6 pages, 4 figures, IEEE MMSP 2023 +
+
+
+
+
+ + ☆ Node Embedding for Homophilous Graphs with ARGEW: Augmentation of Random + walks by Graph Edge Weights + + +
+ Representing nodes in a network as dense vectors (node embeddings) is important for understanding a given network and solving many downstream tasks. In particular, for weighted homophilous graphs where similar nodes are connected with larger edge weights, we desire node embeddings in which node pairs with strong weights have closer embeddings. Although random walk based node embedding methods like node2vec and node2vec+ do work for weighted networks by including edge weights in the walk transition probabilities, our experiments show that the embedding result does not adequately reflect edge weights. In this paper, we propose ARGEW (Augmentation of Random walks by Graph Edge Weights), a novel augmentation method for random walks that expands the corpus in such a way that nodes with larger edge weights end up with closer embeddings. ARGEW can work with any random walk based node embedding method, because it is independent of the random sampling strategy itself and works on top of the already-performed walks. With several real-world networks, we demonstrate that with ARGEW, compared to not using it, the desired pattern that node pairs with larger edge weights have closer embeddings is much clearer. We also examine ARGEW's performance in node classification: node2vec with ARGEW outperforms pure node2vec and is not sensitive to hyperparameters (i.e., it is consistently good). In fact, it achieves similarly good results as a supervised GCN, even without any node feature or label information during training. Finally, we explain why ARGEW works consistently well by exploring the coappearance distributions using a synthetic graph with clear structural roles. + +
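+ A hedged sketch of the general idea: after random walks are generated, the corpus is expanded so that node pairs joined by heavier edges co-occur more often; the expansion rule below (replication proportional to weight) is a simplification for illustration, not the paper's exact method.
```python
# Expand a walk corpus so that heavier edges co-occur more often.
def augment_walks_by_edge_weight(walks, edge_weight, max_copies=5):
    """walks: list of node sequences; edge_weight: dict {(u, v): weight}."""
    max_w = max(edge_weight.values())
    augmented = [list(w) for w in walks]
    for walk in walks:
        for u, v in zip(walk, walk[1:]):
            w = edge_weight.get((u, v), edge_weight.get((v, u), 0.0))
            copies = round(max_copies * w / max_w)
            augmented.extend([[u, v]] * copies)   # extra short "walks" for heavy edges
    return augmented

edge_weight = {("a", "b"): 5.0, ("b", "c"): 1.0, ("c", "d"): 0.5}
walks = [["a", "b", "c", "d"], ["d", "c", "b", "a"]]
corpus = augment_walks_by_edge_weight(walks, edge_weight)
print(len(corpus), "walks after augmentation")  # heavier edges appear more often
```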
+
+
+
+
+ + ☆ INR-Arch: A Dataflow Architecture and Compiler for Arbitrary-Order + Gradient Computations in Implicit Neural Representation Processing + + +
+ An increasing number of researchers are finding use for nth-order gradient +computations for a wide variety of applications, including graphics, +meta-learning (MAML), scientific computing, and most recently, implicit neural +representations (INRs). Recent work shows that the gradient of an INR can be +used to edit the data it represents directly without needing to convert it back +to a discrete representation. However, given a function represented as a +computation graph, traditional architectures face challenges in efficiently +computing its nth-order gradient due to the higher demand for computing power +and higher complexity in data movement. This makes it a promising target for +FPGA acceleration. In this work, we introduce INR-Arch, a framework that +transforms the computation graph of an nth-order gradient into a +hardware-optimized dataflow architecture. We address this problem in two +phases. First, we design a dataflow architecture that uses FIFO streams and an +optimized computation kernel library, ensuring high memory efficiency and +parallel computation. Second, we propose a compiler that extracts and optimizes +computation graphs, automatically configures hardware parameters such as +latency and stream depths to optimize throughput, while ensuring deadlock-free +operation, and outputs High-Level Synthesis (HLS) code for FPGA implementation. +We utilize INR editing as our benchmark, presenting results that demonstrate +1.8-4.8x and 1.5-3.6x speedup compared to CPU and GPU baselines respectively. +Furthermore, we obtain 3.1-8.9x and 1.7-4.3x lower memory usage, and 1.7-11.3x +and 5.5-32.8x lower energy-delay product. Our framework will be made +open-source and available on GitHub. + +
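+ For reference, a plain autograd computation of an nth-order gradient of a small INR-style network; this is the kind of computation graph that the framework above compiles into a dataflow architecture, shown here only to make the workload concrete.
```python
# nth-order input gradient of a small MLP via nested autograd calls.
import torch

net = torch.nn.Sequential(
    torch.nn.Linear(1, 32), torch.nn.Tanh(),
    torch.nn.Linear(32, 32), torch.nn.Tanh(),
    torch.nn.Linear(32, 1),
)

def nth_order_gradient(f, x, n):
    y = f(x)
    for _ in range(n):
        # create_graph=True keeps the graph so the next derivative can be taken
        (y,) = torch.autograd.grad(y.sum(), x, create_graph=True)
    return y

x = torch.linspace(-1, 1, 8).unsqueeze(1).requires_grad_(True)
print(nth_order_gradient(net, x, n=3).shape)   # third derivative w.r.t. the input
```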
+
+ comment: 9 pages, 8 figures, 4 tables +
+
+
+
+
+ + ☆ On the equivalence of Occam algorithms + + +
+ Blumer et al. (1987, 1989) showed that any concept class that is learnable by +Occam algorithms is PAC learnable. Board and Pitt (1990) showed a partial +converse of this theorem: for concept classes that are closed under exception +lists, any class that is PAC learnable is learnable by an Occam algorithm. +However, their Occam algorithm outputs a hypothesis whose complexity is +$\delta$-dependent, which is an important limitation. In this paper, we show +that their partial converse applies to Occam algorithms with +$\delta$-independent complexities as well. Thus, we provide a posteriori +justification of various theoretical results and algorithm design methods which +use the partial converse as a basis for their work. + +
+
+ comment: 13 pages, submitted to Information and Computation +
+
+
+
+
+ + ☆ Comparing the quality of neural network uncertainty estimates for + classification problems + + +
+ Traditional deep learning (DL) models are powerful classifiers, but many +approaches do not provide uncertainties for their estimates. Uncertainty +quantification (UQ) methods for DL models have received increased attention in +the literature due to their usefulness in decision making, particularly for +high-consequence decisions. However, there has been little research done on how +to evaluate the quality of such methods. We use statistical methods of +frequentist interval coverage and interval width to evaluate the quality of +credible intervals, and expected calibration error to evaluate classification +predicted confidence. These metrics are evaluated on Bayesian neural networks +(BNN) fit using Markov Chain Monte Carlo (MCMC) and variational inference (VI), +bootstrapped neural networks (NN), Deep Ensembles (DE), and Monte Carlo (MC) +dropout. We apply these different UQ for DL methods to a hyperspectral image +target detection problem and show the inconsistency of the different methods' +results and the necessity of a UQ quality metric. To reconcile these +differences and choose a UQ method that appropriately quantifies the +uncertainty, we create a simulated data set with fully parameterized +probability distribution for a two-class classification problem. The gold +standard MCMC performs the best overall, and the bootstrapped NN is a close +second, requiring the same computational expense as DE. Through this +comparison, we demonstrate that, for a given data set, different models can +produce uncertainty estimates of markedly different quality. This in turn +points to a great need for principled assessment methods of UQ quality in DL +applications. + +
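+ Two of the quality metrics discussed above, sketched for concreteness: expected calibration error for predicted confidences and empirical coverage of credible intervals; the toy data are placeholders.
```python
# Expected calibration error (ECE) and empirical interval coverage.
import numpy as np

def expected_calibration_error(confidences, correct, n_bins=10):
    ece, edges = 0.0, np.linspace(0.0, 1.0, n_bins + 1)
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            ece += mask.mean() * abs(correct[mask].mean() - confidences[mask].mean())
    return ece

def interval_coverage(lower, upper, truth):
    return np.mean((truth >= lower) & (truth <= upper))

rng = np.random.default_rng(0)
conf = rng.uniform(0.5, 1.0, 1000)
correct = (rng.uniform(size=1000) < conf).astype(float)   # calibrated by construction
print("ECE:", expected_calibration_error(conf, correct))
print("coverage:", interval_coverage(np.full(100, -1.9), np.full(100, 1.9),
                                     rng.normal(size=100)))
```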
+
+
+
+
+ + ☆ Learning to Team-Based Navigation: A Review of Deep Reinforcement + Learning Techniques for Multi-Agent Pathfinding + + +
+ Multi-agent pathfinding (MAPF) is a critical field in many large-scale +robotic applications, often being the fundamental step in multi-agent systems. +The increasing complexity of MAPF in complex and crowded environments, however, +critically diminishes the effectiveness of existing solutions. In contrast to +other studies that have either presented a general overview of the recent +advancements in MAPF or extensively reviewed Deep Reinforcement Learning (DRL) +within multi-agent system settings independently, our work presented in this +review paper focuses on highlighting the integration of DRL-based approaches in +MAPF. Moreover, we aim to bridge the current gap in evaluating MAPF solutions +by addressing the lack of unified evaluation metrics and providing +comprehensive clarification on these metrics. Finally, our paper discusses the +potential of model-based DRL as a promising future direction and provides its +required foundational understanding to address current challenges in MAPF. Our +objective is to assist readers in gaining insight into the current research +direction, providing unified metrics for comparing different MAPF algorithms +and expanding their knowledge of model-based DRL to address the existing +challenges in MAPF. + +
+
+
+
+
+ + ☆ DF2: Distribution-Free Decision-Focused Learning + + +
+ Decision-focused learning (DFL) has recently emerged as a powerful approach +for predict-then-optimize problems by customizing a predictive model to a +downstream optimization task. However, existing end-to-end DFL methods are +hindered by three significant bottlenecks: model mismatch error, sample average +approximation error, and gradient approximation error. Model mismatch error +stems from the misalignment between the model's parameterized predictive +distribution and the true probability distribution. Sample average +approximation error arises when using finite samples to approximate the +expected optimization objective. Gradient approximation error occurs as DFL +relies on the KKT condition for exact gradient computation, while most methods +approximate the gradient for backpropagation in non-convex objectives. In this +paper, we present DF2 -- the first \textit{distribution-free} decision-focused +learning method explicitly designed to address these three bottlenecks. Rather +than depending on a task-specific forecaster that requires precise model +assumptions, our method directly learns the expected optimization function +during training. To efficiently learn the function in a data-driven manner, we +devise an attention-based model architecture inspired by the distribution-based +parameterization of the expected objective. Our method is, to the best of our +knowledge, the first to address all three bottlenecks within a single model. We +evaluate DF2 on a synthetic problem, a wind power bidding problem, and a +non-convex vaccine distribution problem, demonstrating the effectiveness of +DF2. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ Combining Machine Learning Classifiers for Stock Trading with Effective + Feature Extraction + + +
+ The unpredictability and volatility of the stock market render it challenging +to make a substantial profit using any generalised scheme. Many previous +studies tried different techniques to build a machine learning model, which can +make a significant profit in the US stock market by performing live trading. +However, very few studies have focused on the importance of finding the best +features for a particular trading period. Our top approach used the performance +to narrow down the features from a total of 148 to about 30. Furthermore, the +top 25 features were dynamically selected before each time training our machine +learning model. It uses ensemble learning with four classifiers: Gaussian Naive +Bayes, Decision Tree, Logistic Regression with L1 regularization, and +Stochastic Gradient Descent, to decide whether to go long or short on a +particular stock. Our best model performed daily trade between July 2011 and +January 2019, generating 54.35% profit. Finally, our work showcased that +mixtures of weighted classifiers perform better than any individual predictor +of making trading decisions in the stock market. + +
+
+
+
+
+ + ♻ ☆ Detection and classification of vocal productions in large scale audio + recordings + + +
+ We propose an automatic data processing pipeline to extract vocal productions from large-scale natural audio recordings and classify these vocal productions. The pipeline is based on a deep neural network and addresses both issues simultaneously. Through a series of computational steps (windowing, creation of a noise class, data augmentation, re-sampling, transfer learning, Bayesian optimisation), it automatically trains a neural network without requiring a large sample of labeled data or substantial computing resources. Our end-to-end methodology can handle noisy recordings made under different recording conditions. We test it on two different natural audio data sets, one from a group of Guinea baboons recorded at a primate research center and one from human babies recorded at home. The pipeline trains a model on 72 and 77 minutes of labeled audio recordings, with accuracies of 94.58% and 99.76%, respectively. It is then used to process 443 and 174 hours of natural continuous recordings, and it creates two new databases of 38.8 and 35.2 hours, respectively. We discuss the strengths and limitations of this approach, which can be applied to any massive audio recording. + +
+
+
+
+
+ + ♻ ☆ RT-1: Robotics Transformer for Real-World Control at Scale + + +
+ By transferring knowledge from large, diverse, task-agnostic datasets, modern +machine learning models can solve specific downstream tasks either zero-shot or +with small task-specific datasets to a high level of performance. While this +capability has been demonstrated in other fields such as computer vision, +natural language processing or speech recognition, it remains to be shown in +robotics, where the generalization capabilities of the models are particularly +critical due to the difficulty of collecting real-world robotic data. We argue +that one of the keys to the success of such general robotic models lies with +open-ended task-agnostic training, combined with high-capacity architectures +that can absorb all of the diverse, robotic data. In this paper, we present a +model class, dubbed Robotics Transformer, that exhibits promising scalable +model properties. We verify our conclusions in a study of different model +classes and their ability to generalize as a function of the data size, model +size, and data diversity based on a large-scale data collection on real robots +performing real-world tasks. The project's website and videos can be found at +robotics-transformer1.github.io + +
+
+ comment: See website at robotics-transformer1.github.io +
+
+
+
+
+ + ♻ ☆ Inverse Kernel Decomposition + + +
+ The state-of-the-art dimensionality reduction approaches largely rely on +complicated optimization procedures. On the other hand, closed-form approaches +requiring merely eigen-decomposition do not have enough sophistication and +nonlinearity. In this paper, we propose a novel nonlinear dimensionality +reduction method -- Inverse Kernel Decomposition (IKD) -- based on an +eigen-decomposition of the sample covariance matrix of data. The method is +inspired by Gaussian process latent variable models (GPLVMs) and has comparable +performance with GPLVMs. To deal with very noisy data with weak correlations, +we propose two solutions -- blockwise and geodesic -- to make use of locally +correlated data points and provide better and numerically more stable latent +estimations. We use synthetic datasets and four real-world datasets to show +that IKD is a better dimensionality reduction method than other +eigen-decomposition-based methods, and achieves comparable performance against +optimization-based methods with faster running speeds. Open-source IKD +implementation in Python can be accessed at this +\url{https://github.com/JerrySoybean/ikd}. + +
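+ A toy illustration of the general idea summarized above: treat the sample covariance across output dimensions as a noisy kernel matrix over latent points, invert a squared-exponential kernel to recover pairwise latent distances, and embed them with classical MDS. The actual IKD eigen-decomposition and its blockwise and geodesic variants are not reproduced here.
```python
# Recover 1-D latents from the sample covariance by inverting an SE kernel.
import numpy as np

rng = np.random.default_rng(0)
z = np.sort(rng.uniform(-2, 2, 60))                              # true 1-D latents
K = np.exp(-0.5 * (z[:, None] - z[None, :]) ** 2) + 1e-6 * np.eye(60)  # SE kernel
Y = rng.multivariate_normal(np.zeros(60), K, size=500)            # 500 output dims

C = np.clip(np.cov(Y, rowvar=False), 1e-6, 1.0)   # sample covariance ~ kernel matrix
D2 = -2.0 * np.log(C)                             # inverted SE kernel -> squared distances

# Classical MDS on the recovered distances.
n = D2.shape[0]
H = np.eye(n) - np.ones((n, n)) / n
B = -0.5 * H @ D2 @ H
eigvals, eigvecs = np.linalg.eigh(B)
z_hat = eigvecs[:, -1] * np.sqrt(max(eigvals[-1], 0.0))

print("correlation with true latents:", abs(np.corrcoef(z, z_hat)[0, 1]))
```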
+
+
+
+
+ + ♻ ☆ SLEM: Machine Learning for Path Modeling and Causal Inference with Super + Learner Equation Modeling + + +
+ Causal inference is a crucial goal of science, enabling researchers to arrive +at meaningful conclusions regarding the predictions of hypothetical +interventions using observational data. Path models, Structural Equation Models +(SEMs), and, more generally, Directed Acyclic Graphs (DAGs), provide a means to +unambiguously specify assumptions regarding the causal structure underlying a +phenomenon. Unlike DAGs, which make very few assumptions about the functional +and parametric form, SEM assumes linearity. This can result in functional +misspecification which prevents researchers from undertaking reliable effect +size estimation. In contrast, we propose Super Learner Equation Modeling, a +path modeling technique integrating machine learning Super Learner ensembles. +We empirically demonstrate its ability to provide consistent and unbiased +estimates of causal effects, its competitive performance for linear models when +compared with SEM, and highlight its superiority over SEM when dealing with +non-linear relationships. We provide open-source code, and a tutorial notebook +with example usage, accentuating the easy-to-use nature of the method. + +
+
+
+
+
+ + ♻ ☆ A method for escaping limit cycles in training GANs + + +
+ This paper mainly conducts further research to alleviate the issue of limit +cycling behavior in training generative adversarial networks (GANs) through the +proposed predictive centripetal acceleration algorithm (PCAA). Specifically, we +first derive the upper and lower bounds on the last-iterate convergence rates +of PCAA for the general bilinear game, with the upper bound notably improving +upon previous results. Then, we combine PCAA with the adaptive moment +estimation algorithm (Adam) to propose PCAA-Adam, a practical approach for +training GANs. Finally, we validate the effectiveness of the proposed algorithm +through experiments conducted on bilinear games, multivariate Gaussian +distributions, and the CelebA dataset, respectively. + +
+
+
+
+
+ + ♻ ☆ RANS-PINN based Simulation Surrogates for Predicting Turbulent Flows + + +
+ Physics-informed neural networks (PINNs) provide a framework to build +surrogate models for dynamical systems governed by differential equations. +During the learning process, PINNs incorporate a physics-based regularization +term within the loss function to enhance generalization performance. Since +simulating dynamics controlled by partial differential equations (PDEs) can be +computationally expensive, PINNs have gained popularity in learning parametric +surrogates for fluid flow problems governed by Navier-Stokes equations. In this +work, we introduce RANS-PINN, a modified PINN framework, to predict flow fields +(i.e., velocity and pressure) in high Reynolds number turbulent flow regimes. +To account for the additional complexity introduced by turbulence, RANS-PINN +employs a 2-equation eddy viscosity model based on a Reynolds-averaged +Navier-Stokes (RANS) formulation. Furthermore, we adopt a novel training +approach that ensures effective initialization and balance among the various +components of the loss function. The effectiveness of the RANS-PINN framework +is then demonstrated using a parametric PINN. + +
+
+
+
+
+ + ♻ ☆ Oracle Teacher: Leveraging Target Information for Better Knowledge + Distillation of CTC Models + + +
+ Knowledge distillation (KD), best known as an effective method for model +compression, aims at transferring the knowledge of a bigger network (teacher) +to a much smaller network (student). Conventional KD methods usually employ the +teacher model trained in a supervised manner, where output labels are treated +only as targets. Extending this supervised scheme further, we introduce a new +type of teacher model for connectionist temporal classification (CTC)-based +sequence models, namely Oracle Teacher, that leverages both the source inputs +and the output labels as the teacher model's input. Since the Oracle Teacher +learns a more accurate CTC alignment by referring to the target information, it +can provide the student with more optimal guidance. One potential risk for the +proposed approach is a trivial solution that the model's output directly copies +the target input. Based on a many-to-one mapping property of the CTC algorithm, +we present a training strategy that can effectively prevent the trivial +solution and thus enables utilizing both source and target inputs for model +training. Extensive experiments are conducted on two sequence learning tasks: +speech recognition and scene text recognition. From the experimental results, +we empirically show that the proposed model improves the students across these +tasks while achieving a considerable speed-up in the teacher model's training +time. + +
+
+ comment: Accepted by IEEE/ACM Transactions on Audio, Speech and Language + Processing +
+
+
+
+
+ + ♻ ☆ Preventing Zero-Shot Transfer Degradation in Continual Learning of + Vision-Language Models ICCV 2023 + + +
+ Continual learning (CL) can help pre-trained vision-language models +efficiently adapt to new or under-trained data distributions without +re-training. Nevertheless, during the continual training of the Contrastive +Language-Image Pre-training (CLIP) model, we observe that the model's zero-shot +transfer ability significantly degrades due to catastrophic forgetting. +Existing CL methods can mitigate forgetting by replaying previous data. +However, since the CLIP dataset is private, replay methods cannot access the +pre-training dataset. In addition, replaying data of previously learned +downstream tasks can enhance their performance but comes at the cost of +sacrificing zero-shot performance. To address this challenge, we propose a +novel method ZSCL to prevent zero-shot transfer degradation in the continual +learning of vision-language models in both feature and parameter space. In the +feature space, a reference dataset is introduced for distillation between the +current and initial models. The reference dataset should have semantic +diversity but no need to be labeled, seen in pre-training, or matched +image-text pairs. In parameter space, we prevent a large parameter shift by +averaging weights during the training. We propose a more challenging +Multi-domain Task Incremental Learning (MTIL) benchmark to evaluate different +methods, where tasks are from various domains instead of class-separated in a +single dataset. Our method outperforms other methods in the traditional +class-incremental learning setting and the MTIL by 9.7% average score. Our code +locates at https://github.com/Thunderbeee/ZSCL. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Deep Learning for Diverse Data Types Steganalysis: A Review + + +
+ Steganography and steganalysis are two interrelated aspects of the field of +information security. Steganography seeks to conceal communications, whereas +steganalysis is aimed to either find them or even, if possible, recover the +data they contain. Steganography and steganalysis have attracted a great deal +of interest, particularly from law enforcement. Steganography is often used by +cybercriminals and even terrorists to avoid being captured while in possession +of incriminating evidence, even encrypted, since cryptography is prohibited or +restricted in many countries. Therefore, knowledge of cutting-edge techniques +to uncover concealed information is crucial in exposing illegal acts. Over the +last few years, a number of strong and reliable steganography and steganalysis +techniques have been introduced in the literature. This review paper provides a +comprehensive overview of deep learning-based steganalysis techniques used to +detect hidden information within digital media. The paper covers all types of +cover in steganalysis, including image, audio, and video, and discusses the +most commonly used deep learning techniques. In addition, the paper explores +the use of more advanced deep learning techniques, such as deep transfer +learning (DTL) and deep reinforcement learning (DRL), to enhance the +performance of steganalysis systems. The paper provides a systematic review of +recent research in the field, including data sets and evaluation metrics used +in recent studies. It also presents a detailed analysis of DTL-based +steganalysis approaches and their performance on different data sets. The +review concludes with a discussion on the current state of deep learning-based +steganalysis, challenges, and future research directions. + +
+
+
+
+
+ + ♻ ☆ MAMAF-Net: Motion-Aware and Multi-Attention Fusion Network for Stroke + Diagnosis + + +
+ Stroke is a major cause of mortality and disability worldwide, and one in four people is at risk of experiencing it in their lifetime. The pre-hospital stroke assessment plays a vital role in identifying stroke patients accurately to accelerate further examination and treatment in hospitals. Accordingly, the National Institutes of Health Stroke Scale (NIHSS), the Cincinnati Pre-hospital Stroke Scale (CPSS) and the Face Arm Speed Time (F.A.S.T.) test are globally known tests for stroke assessment. However, the validity of these tests is questionable in the absence of neurologists, and access to healthcare may be limited. Therefore, in this study, we propose a motion-aware and multi-attention fusion network (MAMAF-Net) that can detect stroke from multimodal examination videos. In contrast to other studies on stroke detection from video analysis, our study is the first to propose an end-to-end solution based on multiple video recordings of each subject, with a dataset encapsulating stroke, transient ischemic attack (TIA), and healthy controls. The proposed MAMAF-Net consists of motion-aware modules to sense the mobility of patients, attention modules to fuse the multi-input video data, and 3D convolutional layers to perform diagnosis from the attention-based extracted features. Experimental results on the collected Stroke-data dataset show that the proposed MAMAF-Net achieves successful detection of stroke with 93.62% sensitivity and a 95.33% AUC score. + +
+
+
+
+
+ + ♻ ☆ Nonparametric Inference under B-bits Quantization + + +
+ Statistical inference based on lossy or incomplete samples is often needed in +research areas such as signal/image processing, medical image storage, remote +sensing, signal transmission. In this paper, we propose a nonparametric testing +procedure based on samples quantized to $B$ bits through a computationally +efficient algorithm. Under mild technical conditions, we establish the +asymptotic properties of the proposed test statistic and investigate how the +testing power changes as $B$ increases. In particular, we show that if $B$ +exceeds a certain threshold, the proposed nonparametric testing procedure +achieves the classical minimax rate of testing (Shang and Cheng, 2015) for +spline models. We further extend our theoretical investigations to a +nonparametric linearity test and an adaptive nonparametric test, expanding the +applicability of the proposed methods. Extensive simulation studies {together +with a real-data analysis} are used to demonstrate the validity and +effectiveness of the proposed tests. + +
+
+
+
+
+ + ♻ ☆ Selecting the number of clusters, clustering models, and algorithms. A + unifying approach based on the quadratic discriminant score + + +
+ Cluster analysis requires many decisions: the clustering method and the +implied reference model, the number of clusters and, often, several +hyper-parameters and algorithms' tunings. In practice, one produces several +partitions, and a final one is chosen based on validation or selection +criteria. There exist an abundance of validation methods that, implicitly or +explicitly, assume a certain clustering notion. Moreover, they are often +restricted to operate on partitions obtained from a specific method. In this +paper, we focus on groups that can be well separated by quadratic or linear +boundaries. The reference cluster concept is defined through the quadratic +discriminant score function and parameters describing clusters' size, center +and scatter. We develop two cluster-quality criteria called quadratic scores. +We show that these criteria are consistent with groups generated from a general +class of elliptically-symmetric distributions. The quest for this type of +groups is common in applications. The connection with likelihood theory for +mixture models and model-based clustering is investigated. Based on bootstrap +resampling of the quadratic scores, we propose a selection rule that allows +choosing among many clustering solutions. The proposed method has the +distinctive advantage that it can compare partitions that cannot be compared +with other state-of-the-art methods. Extensive numerical experiments and the +analysis of real data show that, even if some competing methods turn out to be +superior in some setups, the proposed methodology achieves a better overall +performance. + +
+
+ comment: Supplemental materials are included at the end of the paper +
+
+
+
+
+ + ♻ ☆ ExBEHRT: Extended Transformer for Electronic Health Records to Predict + Disease Subtypes & Progressions ICLR 2023 + + +
+ In this study, we introduce ExBEHRT, an extended version of BEHRT (BERT +applied to electronic health records), and apply different algorithms to +interpret its results. While BEHRT considers only diagnoses and patient age, we +extend the feature space to several multimodal records, namely demographics, +clinical characteristics, vital signs, smoking status, diagnoses, procedures, +medications, and laboratory tests, by applying a novel method to unify the +frequencies and temporal dimensions of the different features. We show that +additional features significantly improve model performance for various +downstream tasks in different diseases. To ensure robustness, we interpret +model predictions using an adaptation of expected gradients, which has not been +previously applied to transformers with EHR data and provides more granular +interpretations than previous approaches such as feature and token importances. +Furthermore, by clustering the model representations of oncology patients, we +show that the model has an implicit understanding of the disease and is able to +classify patients with the same cancer type into different risk groups. Given +the additional features and interpretability, ExBEHRT can help make informed +decisions about disease trajectories, diagnoses, and risk factors of various +diseases. + +
+
+ comment: ICLR 2023 Workshop on Trustworthy Machine Learning for Healthcare + (Website: https://sites.google.com/view/tml4h2023/accepted-papers ) +
+
+
+
+
+ + ♻ ☆ On the Trade-off between Over-smoothing and Over-squashing in Deep Graph + Neural Networks CIKM + + +
+ Graph Neural Networks (GNNs) have succeeded in various computer science +applications, yet deep GNNs underperform their shallow counterparts despite +deep learning's success in other domains. Over-smoothing and over-squashing are +key challenges when stacking graph convolutional layers, hindering deep +representation learning and information propagation from distant nodes. Our +work reveals that over-smoothing and over-squashing are intrinsically related +to the spectral gap of the graph Laplacian, resulting in an inevitable +trade-off between these two issues, as they cannot be alleviated +simultaneously. To achieve a suitable compromise, we propose adding and +removing edges as a viable approach. We introduce the Stochastic Jost and Liu +Curvature Rewiring (SJLR) algorithm, which is computationally efficient and +preserves fundamental properties compared to previous curvature-based methods. +Unlike existing approaches, SJLR performs edge addition and removal during GNN +training while maintaining the graph unchanged during testing. Comprehensive +comparisons demonstrate SJLR's competitive performance in addressing +over-smoothing and over-squashing. + +
+
+ comment: This paper has been accepted for publication at the 32nd ACM + International Conference on Information and Knowledge Management (CIKM) 2023 +
+
+
+
+
+ + ♻ ☆ Cross-modal Contrastive Learning for Multimodal Fake News Detection + + +
+ Automatic detection of multimodal fake news has gained widespread attention
+recently. Many existing approaches seek to fuse unimodal features to produce
+multimodal news representations. However, the potential of powerful cross-modal
+contrastive learning methods for fake news detection has not been well
+exploited. Besides, how to aggregate features from different modalities to
+boost the performance of the decision-making process is still an open question.
+To address this, we propose COOLANT, a cross-modal contrastive learning
+framework for multimodal fake news detection, aiming to achieve more accurate
+image-text alignment. To further improve the alignment precision, we leverage
+an auxiliary task to soften the loss term of negative samples during the
+contrast process. A cross-modal fusion module is developed to learn the
+cross-modality correlations. An attention mechanism with an attention guidance
+module is implemented to help effectively and interpretably aggregate the
+aligned unimodal representations and the cross-modality correlations. Finally,
+we evaluate COOLANT and conduct a comparative study on two widely used
+datasets, Twitter and Weibo. The experimental results demonstrate that COOLANT
+outperforms previous approaches by a large margin and achieves new
+state-of-the-art results on the two datasets.
+
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ PENTACET data -- 23 Million Contextual Code Comments and 250,000 SATD + comments + + +
+ Most Self-Admitted Technical Debt (SATD) research utilizes explicit SATD
+features such as 'TODO' and 'FIXME' for SATD detection. A closer look reveals
+that much SATD research uses simple SATD ('Easy to Find') code comments without
+the contextual data (preceding and succeeding source code context). This work
+addresses this gap through the PENTACET (or 5C dataset) data. PENTACET is a
+large dataset of Curated Contextual Code Comments per Contributor and the most
+extensive SATD data available. We mine 9,096 Open Source Software Java projects
+with a total of 435 million LOC. The outcome is a dataset with 23 million code
+comments, preceding and succeeding source code context for each comment, and
+more than 250,000 comments labeled as SATD, including both 'Easy to Find' and
+'Hard to Find' SATD. We believe PENTACET data will further SATD research using
+Artificial Intelligence techniques.
+
+
+ comment: Accepted in MSR 2023 Tools and Data Showcase +
+
+
+
+
+ + ♻ ☆ Faithful Knowledge Distillation + + +
+ Knowledge distillation (KD) has received much attention due to its success in +compressing networks to allow for their deployment in resource-constrained +systems. While the problem of adversarial robustness has been studied before in +the KD setting, previous works overlook what we term the relative calibration +of the student network with respect to its teacher in terms of soft +confidences. In particular, we focus on two crucial questions with regard to a +teacher-student pair: (i) do the teacher and student disagree at points close +to correctly classified dataset examples, and (ii) is the distilled student as +confident as the teacher around dataset examples? These are critical questions +when considering the deployment of a smaller student network trained from a +robust teacher within a safety-critical setting. To address these questions, we +introduce a faithful imitation framework to discuss the relative calibration of +confidences and provide empirical and certified methods to evaluate the +relative calibration of a student w.r.t. its teacher. Further, to verifiably +align the relative calibration incentives of the student to those of its +teacher, we introduce faithful distillation. Our experiments on the MNIST, +Fashion-MNIST and CIFAR-10 datasets demonstrate the need for such an analysis +and the advantages of the increased verifiability of faithful distillation over +alternative adversarial distillation methods. + +
+
+ comment: 7pgs (main content), 4 figures +
+
+
+
+
+ + ♻ ☆ Robust Graph Representation Learning for Local Corruption Recovery WWW '23 + + +
+ The performance of graph representation learning is affected by the quality
+of the graph input. While existing research usually pursues a globally smoothed
+graph embedding, we believe that rarely observed anomalies are also harmful to
+an accurate prediction. This work establishes a graph learning scheme that
+automatically detects (locally) corrupted feature attributes and recovers
+robust embeddings for prediction tasks. The detection operation leverages a
+graph autoencoder, which does not make any assumptions about the distribution
+of the local corruptions. It pinpoints the positions of the anomalous node
+attributes in an unbiased mask matrix, where robust estimations are recovered
+with a sparsity-promoting regularizer. The optimizer approaches a new embedding
+that is sparse in the framelet domain and conditionally close to input
+observations. Extensive experiments are provided to validate that our proposed
+model can recover a robust graph representation from black-box poisoning and
+achieve excellent performance.
+
+
+ comment: WWW '23: Proceedings of the ACM Web Conference 2023 +
+
+
+
+
+ + ♻ ☆ Improving the Transferability of Adversarial Examples via Direction + Tuning + + +
+ In transfer-based adversarial attacks, adversarial examples are generated
+only by surrogate models and are expected to achieve effective perturbation of
+the victim models. Although considerable effort has been devoted to improving
+the transferability of adversarial examples generated by transfer-based
+adversarial attacks, our investigation found that the large deviation between
+the actual and the steepest update directions of current transfer-based
+adversarial attacks is caused by the large update step length, so the generated
+adversarial examples cannot converge well. However, directly reducing the
+update step length leads to serious update oscillation, so the generated
+adversarial examples still cannot achieve strong transferability to the victim
+models. To address these issues, a novel transfer-based attack, namely the
+direction tuning attack, is proposed to not only decrease the update deviation
+under a large step length, but also mitigate the update oscillation under a
+small sampling step length, thereby making the generated adversarial examples
+converge well and achieve strong transferability to the victim models. In
+addition, a network pruning method is proposed to smooth the decision boundary,
+thereby further decreasing the update oscillation and enhancing the
+transferability of the generated adversarial examples. The experiment results
+on ImageNet demonstrate that the average attack success rate (ASR) of the
+adversarial examples generated by our method can be improved from 87.9\% to
+94.5\% on five victim models without defenses, and from 69.1\% to 76.2\% on
+eight advanced defense methods, in comparison with the latest gradient-based
+attacks.
+
+
+ comment: Accepted by INS 2023 +
+
+
+
+
+ + ♻ ☆ On the Design Fundamentals of Diffusion Models: A Survey + + +
+ Diffusion models are generative models, which gradually add and remove noise
+to learn the underlying distribution of training data for data generation. The
+components of diffusion models have gained significant attention, with many
+design choices proposed. Existing reviews have primarily focused on
+higher-level solutions, thereby offering less coverage of the design
+fundamentals of the components. This study seeks to address this gap by
+providing a comprehensive and coherent review of component-wise design choices
+in diffusion models. Specifically, we organize this review according to the
+three key components, namely the forward process, the reverse process, and the
+sampling procedure. This allows us to provide a fine-grained perspective of
+diffusion models, benefiting future studies in the analysis of individual
+components, the applicability of design choices, and the implementation of
+diffusion models.
+
+
+
+
+
+ + ♻ ☆ Pretraining Respiratory Sound Representations using Metadata and + Contrastive Learning + + +
+ Methods based on supervised learning using annotations in an end-to-end
+fashion have been the state-of-the-art for classification problems. However,
+they may be limited in their generalization capability, especially in the low
+data regime. In this study, we address this issue using supervised contrastive
+learning combined with available metadata to solve multiple pretext tasks that
+learn a good representation of the data. We apply our approach to respiratory
+sound classification. This task is suited for this setting, as demographic
+information such as sex and age is correlated with the presence of lung
+diseases, and learning a system that implicitly encodes this information may
+better detect anomalies. Supervised contrastive learning is a paradigm that
+learns similar representations for samples sharing the same class labels and
+dissimilar representations for samples with different class labels. The feature
+extractor learned using this paradigm extracts useful features from the data,
+and we show that it outperforms cross-entropy in classifying respiratory
+anomalies in two different datasets. We also show that learning representations
+using only metadata, without class labels, achieves performance similar to
+using cross-entropy with those labels alone. In addition, when combining class
+labels with metadata using multiple supervised contrastive learning, an
+extension of supervised contrastive learning solving an additional task of
+grouping patients within the same sex and age group, more informative features
+are learned. This work suggests the potential of using multiple metadata
+sources in supervised contrastive settings, in particular in settings with
+class imbalance and few data. Our code is released at
+https://github.com/ilyassmoummad/scl_icbhi2017
+
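+ A minimal sketch of a generic supervised contrastive loss of the kind
+described above (Khosla et al.-style); the temperature, batch construction and
+the metadata-derived label are assumptions, and the paper's multi-task
+formulation is not reproduced here.
+
+ import torch
+ import torch.nn.functional as F
+
+ def supcon_loss(z, labels, tau=0.1):
+     """Supervised contrastive loss over one batch.
+     z: (N, d) embeddings, labels: (N,) integer ids (class or metadata group)."""
+     z = F.normalize(z, dim=1)
+     sim = torch.matmul(z, z.t()) / tau
+     self_mask = torch.eye(len(z), dtype=torch.bool)
+     pos_mask = (labels[:, None] == labels[None, :]) & ~self_mask
+     sim = sim.masked_fill(self_mask, float("-inf"))          # never contrast a sample with itself
+     log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
+     pos_log_prob = log_prob.masked_fill(~pos_mask, 0.0)      # keep only positive pairs
+     return (-pos_log_prob.sum(1) / pos_mask.sum(1).clamp(min=1)).mean()
+
+ z = torch.randn(8, 16, requires_grad=True)
+ sex_age_group = torch.tensor([0, 0, 1, 1, 0, 1, 0, 1])      # hypothetical metadata label
+ supcon_loss(z, sex_age_group).backward()
+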
+
+
+
+
+ + ♻ ☆ Graph Neural Network Sensitivity Under Probabilistic Error Model + + +
+ Graph convolutional networks (GCNs) can successfully learn the graph signal
+representation by graph convolution. The graph convolution depends on the graph
+filter, which contains the topological dependency of data and propagates data
+features. However, estimation errors in the propagation matrix (e.g., the
+adjacency matrix) can have a significant impact on graph filters and GCNs. In
+this paper, we study the effect of a probabilistic graph error model on the
+performance of GCNs. We prove that the adjacency matrix under the error model
+is bounded by a function of graph size and error probability. We further
+analytically specify the upper bound of a normalized adjacency matrix with
+self-loops added. Finally, we illustrate the error bounds through experiments
+on a synthetic dataset and study the accuracy sensitivity of a simple GCN under
+this probabilistic error model.
+
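+ A minimal numpy sketch of the normalized adjacency with self-loops that the
+bound above concerns, together with a simple edge-flip error model; the flip
+probability and the random graph are illustrative assumptions, not the paper's
+exact experimental setup.
+
+ import numpy as np
+
+ def normalized_adjacency(A):
+     """Symmetrically normalized adjacency with self-loops:
+     A_hat = D_tilde^{-1/2} (A + I) D_tilde^{-1/2}."""
+     A_tilde = A + np.eye(A.shape[0])
+     D_inv_sqrt = np.diag(1.0 / np.sqrt(A_tilde.sum(axis=1)))
+     return D_inv_sqrt @ A_tilde @ D_inv_sqrt
+
+ def perturb(A, p, rng):
+     """Flip each (undirected) edge entry independently with probability p."""
+     n = A.shape[0]
+     flips = np.triu(rng.random((n, n)) < p, 1)
+     flips = flips | flips.T
+     return np.abs(A - flips.astype(float))
+
+ rng = np.random.default_rng(0)
+ A = (rng.random((20, 20)) < 0.2).astype(float)
+ A = np.triu(A, 1); A = A + A.T
+ # spectral-norm gap between clean and perturbed normalized adjacencies
+ gap = np.linalg.norm(normalized_adjacency(perturb(A, 0.05, rng)) - normalized_adjacency(A), 2)
+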
+
+
+
+
+ + ♻ ☆ HRFuser: A Multi-resolution Sensor Fusion Architecture for 2D Object + Detection SC + + +
+ Besides standard cameras, autonomous vehicles typically include multiple +additional sensors, such as lidars and radars, which help acquire richer +information for perceiving the content of the driving scene. While several +recent works focus on fusing certain pairs of sensors - such as camera with +lidar or radar - by using architectural components specific to the examined +setting, a generic and modular sensor fusion architecture is missing from the +literature. In this work, we propose HRFuser, a modular architecture for +multi-modal 2D object detection. It fuses multiple sensors in a +multi-resolution fashion and scales to an arbitrary number of input modalities. +The design of HRFuser is based on state-of-the-art high-resolution networks for +image-only dense prediction and incorporates a novel multi-window +cross-attention block as the means to perform fusion of multiple modalities at +multiple resolutions. We demonstrate via extensive experiments on nuScenes and +the adverse conditions DENSE datasets that our model effectively leverages +complementary features from additional modalities, substantially improving upon +camera-only performance and consistently outperforming state-of-the-art 3D and +2D fusion methods evaluated on 2D object detection metrics. The source code is +publicly available. + +
+
+ comment: IEEE International Conference on Intelligent Transportation Systems + (ITSC) 2023 +
+
+
+
+
+ + ♻ ☆ Learning representations that are closed-form Monge mapping optimal with + application to domain adaptation + + +
+ Optimal transport (OT) is a powerful geometric tool used to compare and align
+probability measures following the least effort principle. Despite its
+widespread use in machine learning (ML), the OT problem still bears a heavy
+computational burden, while at the same time suffering from the curse of
+dimensionality for measures supported on general high-dimensional spaces. In
+this paper, we propose to tackle these challenges using representation
+learning. In particular, we seek to learn an embedding space such that the
+samples of the two input measures become alignable in it with a simple affine
+mapping that can be calculated efficiently in closed form. We then show that
+such an approach leads to results that are comparable to solving the original
+OT problem when applied to the transfer learning task on which many OT
+baselines were previously evaluated, in both homogeneous and heterogeneous DA
+settings. The code for our contribution is available at
+\url{https://github.com/Oleffa/LaOT}.
+
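+ For context, the closed-form affine Monge map between two Gaussian
+approximations is the kind of mapping such an embedding space is meant to
+admit; estimating it from empirical means and covariances as below is an
+illustrative assumption, not the paper's training procedure.
+
+ import numpy as np
+ from scipy.linalg import sqrtm
+
+ def gaussian_monge_map(m1, S1, m2, S2):
+     """Closed-form Monge map between N(m1, S1) and N(m2, S2):
+     T(x) = m2 + A (x - m1), with A = S1^{-1/2} (S1^{1/2} S2 S1^{1/2})^{1/2} S1^{-1/2}."""
+     S1_half = np.real(sqrtm(S1))
+     S1_half_inv = np.linalg.inv(S1_half)
+     A = S1_half_inv @ np.real(sqrtm(S1_half @ S2 @ S1_half)) @ S1_half_inv
+     return lambda X: m2 + (X - m1) @ A.T
+
+ rng = np.random.default_rng(0)
+ Xs = rng.normal(size=(200, 2)) @ np.array([[1.0, 0.3], [0.0, 0.5]])            # "source" samples
+ Xt = rng.normal(size=(200, 2)) * np.array([2.0, 0.7]) + np.array([3.0, -1.0])  # "target" samples
+ T = gaussian_monge_map(Xs.mean(0), np.cov(Xs.T), Xt.mean(0), np.cov(Xt.T))
+ Xs_aligned = T(Xs)   # source pushed onto the Gaussian fit of the target
+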
+
+
+
+
+ + ♻ ☆ Constraining Linear-chain CRFs to Regular Languages + + +
+ A major challenge in structured prediction is to represent the +interdependencies within output structures. When outputs are structured as +sequences, linear-chain conditional random fields (CRFs) are a widely used +model class which can learn \textit{local} dependencies in the output. However, +the CRF's Markov assumption makes it impossible for CRFs to represent +distributions with \textit{nonlocal} dependencies, and standard CRFs are unable +to respect nonlocal constraints of the data (such as global arity constraints +on output labels). We present a generalization of CRFs that can enforce a broad +class of constraints, including nonlocal ones, by specifying the space of +possible output structures as a regular language $\mathcal{L}$. The resulting +regular-constrained CRF (RegCCRF) has the same formal properties as a standard +CRF, but assigns zero probability to all label sequences not in $\mathcal{L}$. +Notably, RegCCRFs can incorporate their constraints during training, while +related models only enforce constraints during decoding. We prove that +constrained training is never worse than constrained decoding, and show +empirically that it can be substantially better in practice. Additionally, we +demonstrate a practical benefit on downstream tasks by incorporating a RegCCRF +into a deep neural model for semantic role labeling, exceeding state-of-the-art +results on a standard dataset. + +
+
+
+
+
+ + ♻ ☆ Generating artificial digital image correlation data using + physics-guided adversarial networks + + +
+ Digital image correlation (DIC) has become a valuable tool in the evaluation
+of mechanical experiments, particularly fatigue crack growth experiments. The
+evaluation requires accurate information about the crack path and crack tip
+position, which is difficult to obtain due to inherent noise and artefacts.
+Machine learning models have been extremely successful in recognizing this
+relevant information. However, training robust models that generalize well
+requires large amounts of data, and data is typically scarce in the field of
+materials science and engineering because experiments are expensive and
+time-consuming. We present a method to generate synthetic DIC data using
+generative adversarial networks with a physics-guided discriminator. To decide
+whether data samples are real or fake, this discriminator additionally receives
+the derived von Mises equivalent strain. We show that this physics-guided
+approach leads to improved results in terms of the visual quality of samples,
+sliced Wasserstein distance, and geometry score.
+
+
+
+
+
+ + ♻ ☆ Personalised Language Modelling of Screen Characters Using Rich Metadata + Annotations + + +
+ Language models that are sensitive to external context can more effectively +capture the speaking patterns of individuals with specific characteristics or +in particular environments. However, obtaining and leveraging such annotations +can be challenging. In this work, we show how to leverage rich character and +film annotations to personalise language models in a scalable manner. Our best +model can reduce perplexity by up to 6.5% compared to a parameter-matched +language model. Our approach performs on par with speaker-specific fine-tuning +when the fine-tuning data (i.e. past dialogue) for individual speakers is +available. On top of that, it also generalises well to a scenario with no such +data, relying on combinations of demographic characteristics expressed via +metadata. Our findings are consistent across two corpora, one of which is also +a contribution of this paper: Cornell-rich contains rich manual annotations for +863 speaking characters from the Cornell Movie Dialog Corpus, including +features such as characteristic quotes and character descriptions, along with +six automatically extracted metadata features for over 95% of the featured +films. Finally, we also present a cost-benefit analysis highlighting which +annotations are most cost-effective in reducing perplexity. + +
+
+ comment: 9 pages; 4 figures; 6 tables. Preprint +
+
+
+
+
+ + ♻ ☆ Verifying the Robustness of Automatic Credibility Assessment + + +
+ Text classification methods have been widely investigated as a way to detect
+content of low credibility: fake news, social media bots, propaganda, etc.
+Quite accurate models (likely based on deep neural networks) help in moderating
+public electronic platforms and often cause content creators to face rejection
+of their submissions or removal of already published texts. Having the
+incentive to evade further detection, content creators try to come up with a
+slightly modified version of the text (known as an attack with an adversarial
+example) that exploits the weaknesses of classifiers and results in a different
+output. Here we systematically test the robustness of popular text classifiers
+against available attacking techniques and discover that, indeed, in some cases
+insignificant changes in input text can mislead the models. We also introduce
+BODEGA: a benchmark for testing both victim models and attack methods on four
+misinformation detection tasks in an evaluation framework designed to simulate
+real use-cases of content moderation. Finally, we manually analyse a subset of
+adversarial examples and check what kinds of modifications are used in
+successful attacks. The BODEGA code and data are openly shared in the hope of
+enhancing the comparability and replicability of further research in this area.
+
+
+
+
+
+ + ♻ ☆ Enhancing the Robustness via Adversarial Learning and Joint + Spatial-Temporal Embeddings in Traffic Forecasting CIKM 2023 + + +
+ Traffic forecasting is an essential problem in urban planning and computing. +The complex dynamic spatial-temporal dependencies among traffic objects (e.g., +sensors and road segments) have been calling for highly flexible models; +unfortunately, sophisticated models may suffer from poor robustness especially +in capturing the trend of the time series (1st-order derivatives with time), +leading to unrealistic forecasts. To address the challenge of balancing +dynamics and robustness, we propose TrendGCN, a new scheme that extends the +flexibility of GCNs and the distribution-preserving capacity of generative and +adversarial loss for handling sequential data with inherent statistical +correlations. On the one hand, our model simultaneously incorporates spatial +(node-wise) embeddings and temporal (time-wise) embeddings to account for +heterogeneous space-and-time convolutions; on the other hand, it uses GAN +structure to systematically evaluate statistical consistencies between the real +and the predicted time series in terms of both the temporal trending and the +complex spatial-temporal dependencies. Compared with traditional approaches +that handle step-wise predictive errors independently, our approach can produce +more realistic and robust forecasts. Experiments on six benchmark traffic +forecasting datasets and theoretical analysis both demonstrate the superiority +and the state-of-the-art performance of TrendGCN. Source code is available at +https://github.com/juyongjiang/TrendGCN. + +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Joint Multi-view Unsupervised Feature Selection and Graph Learning + + +
+ Despite significant progress, previous multi-view unsupervised feature +selection methods mostly suffer from two limitations. First, they generally +utilize either cluster structure or similarity structure to guide the feature +selection, which neglect the possibility of a joint formulation with mutual +benefits. Second, they often learn the similarity structure by either global +structure learning or local structure learning, which lack the capability of +graph learning with both global and local structural awareness. In light of +this, this paper presents a joint multi-view unsupervised feature selection and +graph learning (JMVFG) approach. Particularly, we formulate the multi-view +feature selection with orthogonal decomposition, where each target matrix is +decomposed into a view-specific basis matrix and a view-consistent cluster +indicator. The cross-space locality preservation is incorporated to bridge the +cluster structure learning in the projected space and the similarity learning +(i.e., graph learning) in the original space. Further, a unified objective +function is presented to enable the simultaneous learning of the cluster +structure, the global and local similarity structures, and the multi-view +consistency and inconsistency, upon which an alternating optimization algorithm +is developed with theoretically proved convergence. Extensive experiments on a +variety of real-world multi-view datasets demonstrate the superiority of our +approach for both the multi-view feature selection and graph learning tasks. +The code is available at https://github.com/huangdonghere/JMVFG. + +
+
+ comment: To appear in IEEE Transactions on Emerging Topics in Computational + Intelligence +
+
+
+
+
+ + ♻ ☆ Towards Causal Representation Learning and Deconfounding from Indefinite + Data + + +
+ Owing to the cross-pollination between causal discovery and deep learning,
+non-statistical data (e.g., images, text, etc.) encounters significant
+conflicts in terms of properties and methods with traditional causal data. To
+unify these data types of varying forms, we redefine causal data from two novel
+perspectives and then propose three data paradigms. Among them, indefinite data
+(like dialogues or video sources) induce low sample utilization and make the
+distribution assumption untenable, both of which leave causal representation
+learning from indefinite data largely unexplored as of yet. We design the
+causal strength variational model to address these two problems. Specifically,
+we leverage the causal strength instead of independent noise as the latent
+variable to construct the evidence lower bound. Under this design ethos, the
+causal strengths of different structures are regarded as a distribution and can
+be expressed as a 2D matrix. Moreover, considering the latent confounders, we
+disentangle the causal graph G into two relation subgraphs O and C. O contains
+pure relations between observed variables, while C represents the relations
+from latent variables to observed variables. We implement the above designs as
+a dynamic variational inference model, tailored to learn causal representation
+from indefinite data under latent confounding. Finally, we conduct
+comprehensive experiments on synthetic and real-world data to demonstrate the
+effectiveness of our method.
+
+
+
+
+
+ + ♻ ☆ Con$^{2}$DA: Simplifying Semi-supervised Domain Adaptation by Learning + Consistent and Contrastive Feature Representations NeurIPS 2021 + + +
+ In this work, we present Con$^{2}$DA, a simple framework that extends recent
+advances in semi-supervised learning to the semi-supervised domain adaptation
+(SSDA) problem. Our framework generates pairs of associated samples by
+performing stochastic data transformations to a given input. Associated data
+pairs are mapped to a feature representation space using a feature extractor.
+We use different loss functions to enforce consistency between the feature
+representations of associated data pairs of samples. We show that these learned
+representations are useful to deal with differences in data distributions in
+the domain adaptation problem. We performed experiments to study the main
+components of our model and we show that (i) learning of the consistent and
+contrastive feature representations is crucial to extract good discriminative
+features across different domains, and (ii) our model benefits from the use of
+strong augmentation policies. With these findings, our method achieves
+state-of-the-art performances in three benchmark datasets for SSDA.
+
+
+ comment: Accepted to NeurIPS 2021 Workshop on Distribution Shifts: Connecting + Methods and Applications +
+
+
+
+
+ + ♻ ☆ ECLAD: Extracting Concepts with Local Aggregated Descriptors + + +
+ Convolutional neural networks (CNNs) are increasingly being used in critical +systems, where robustness and alignment are crucial. In this context, the field +of explainable artificial intelligence has proposed the generation of +high-level explanations of the prediction process of CNNs through concept +extraction. While these methods can detect whether or not a concept is present +in an image, they are unable to determine its location. What is more, a fair +comparison of such approaches is difficult due to a lack of proper validation +procedures. To address these issues, we propose a novel method for automatic +concept extraction and localization based on representations obtained through +pixel-wise aggregations of CNN activation maps. Further, we introduce a process +for the validation of concept-extraction techniques based on synthetic datasets +with pixel-wise annotations of their main components, reducing the need for +human intervention. Extensive experimentation on both synthetic and real-world +datasets demonstrates that our method outperforms state-of-the-art +alternatives. + +
+
+ comment: 34 pages, under review +
+
+
+
+
+ + ♻ ☆ Trainable Weight Averaging: A General Approach for Subspace Training ICLR 2023 + + +
+ Training deep neural networks (DNNs) in low-dimensional subspaces is a
+promising direction for achieving efficient training and better generalization
+performance. Our previous work extracts the subspaces by performing the
+dimension reduction method over the training trajectory, which verifies that
+DNNs can be well trained in a tiny subspace. However, that method is
+inefficient for subspace extraction and numerically unstable, limiting its
+applicability to more general tasks. In this paper, we connect subspace
+training to weight averaging and propose \emph{Trainable Weight Averaging}
+(TWA), a general approach for subspace training. TWA is efficient in terms of
+subspace extraction and easy to use, making it a promising new optimizer for
+DNN training. Our design also includes an efficient scheme that allows
+parallel training across multiple nodes to handle large-scale problems and
+evenly distribute the memory and computation burden to each node. TWA can be
+used for both efficient training and generalization enhancement, for different
+neural network architectures, and for various tasks from image classification
+and object detection to natural language processing. The code of implementation
+is available at https://github.com/nblt/TWA, which includes extensive
+experiments covering various benchmark computer vision and natural language
+processing tasks with various architectures.
+
+
+ comment: Journal version in progress. Previously accepted to ICLR 2023 +
+
+
+
+
+ + ♻ ☆ A Neural-Network-Based Convex Regularizer for Image Reconstruction + + +
+ The emergence of deep-learning-based methods to solve image-reconstruction +problems has enabled a significant increase in reconstruction quality. +Unfortunately, these new methods often lack reliability and explainability, and +there is a growing interest to address these shortcomings while retaining the +boost in performance. In this work, we tackle this issue by revisiting +regularizers that are the sum of convex-ridge functions. The gradient of such +regularizers is parameterized by a neural network that has a single hidden +layer with increasing and learnable activation functions. This neural network +is trained within a few minutes as a multistep Gaussian denoiser. The numerical +experiments for denoising, CT, and MRI reconstruction show improvements over +methods that offer similar reliability guarantees. + +
+
+
+
+
+ + ♻ ☆ Robust Lane Detection through Self Pre-training with Masked Sequential + Autoencoders and Fine-tuning with Customized PolyLoss + + +
+ Lane detection is crucial for vehicle localization, which makes it the
+foundation for automated driving and many intelligent and advanced driving
+assistant systems. Available vision-based lane detection methods do not make
+full use of the valuable features and aggregate contextual information,
+especially the interrelationships between lane lines and other regions of the
+images in continuous frames. To fill this research gap and upgrade lane
+detection performance, this paper proposes a pipeline consisting of self
+pre-training with masked sequential autoencoders and fine-tuning with a
+customized PolyLoss for end-to-end neural network models using
+multi-continuous image frames. The masked sequential autoencoders are adopted
+to pre-train the neural network models with the objective of reconstructing the
+missing pixels from a randomly masked image. Then, in the fine-tuning
+segmentation phase where lane detection segmentation is performed, the
+continuous image frames serve as the inputs, and the pre-trained model weights
+are transferred and further updated using the backpropagation mechanism with a
+customized PolyLoss calculating the weighted errors between the output lane
+detection results and the labeled ground truth. Extensive experiment results
+demonstrate that, with the proposed pipeline, the lane detection model
+performance on both normal and challenging scenes can be advanced beyond the
+state-of-the-art, delivering the best testing accuracy (98.38%), precision
+(0.937), and F1-measure (0.924) on the normal scene testing set, together with
+the best overall accuracy (98.36%) and precision (0.844) on the challenging
+scene test set, while the training time can be substantially shortened.
+
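+ The customized PolyLoss used in the fine-tuning stage is not spelled out in
+the abstract; as a hedged reference point only, the standard Poly-1 formulation
+(cross-entropy plus an epsilon-weighted 1 - p_t term) can be sketched as
+follows, with the epsilon value and the toy logits being assumptions.
+
+ import torch
+ import torch.nn.functional as F
+
+ def poly1_cross_entropy(logits, target, epsilon=1.0):
+     """Poly-1 loss: cross-entropy plus epsilon * (1 - p_t), where p_t is the
+     predicted probability of the true class. Illustrative only; the paper's
+     customized PolyLoss may weight terms differently."""
+     ce = F.cross_entropy(logits, target, reduction="none")
+     pt = torch.exp(-ce)                      # probability assigned to the true class
+     return (ce + epsilon * (1.0 - pt)).mean()
+
+ logits = torch.randn(4, 2, requires_grad=True)   # e.g. lane vs. background logits
+ target = torch.tensor([0, 1, 1, 0])
+ poly1_cross_entropy(logits, target).backward()
+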
+
+ comment: 12 pages, 8 figures, accepted by journal of IEEE Transactions on + Intelligent Transportation Systems +
+
+
+
+
+ + ♻ ☆ Robust Quadruped Jumping via Deep Reinforcement Learning + + +
+ In this paper, we consider a general task of jumping varying distances and +heights for a quadrupedal robot in noisy environments, such as off of uneven +terrain and with variable robot dynamics parameters. To accurately jump in such +conditions, we propose a framework using deep reinforcement learning that +leverages and augments the complex solution of nonlinear trajectory +optimization for quadrupedal jumping. While the standalone optimization limits +jumping to take-off from flat ground and requires accurate assumptions of robot +dynamics, our proposed approach improves the robustness to allow jumping off of +significantly uneven terrain with variable robot dynamical parameters and +environmental conditions. Compared with walking and running, the realization of +aggressive jumping on hardware necessitates accounting for the motors' +torque-speed relationship as well as the robot's total power limits. By +incorporating these constraints into our learning framework, we successfully +deploy our policy sim-to-real without further tuning, fully exploiting the +available onboard power supply and motors. We demonstrate robustness to +environment noise of foot disturbances of up to 6 cm in height, or 33% of the +robot's nominal standing height, while jumping 2x the body length in distance. + +
+
+
+
+
+ + ♻ ☆ Machine learning methods for the search for L&T brown dwarfs in the data + of modern sky surveys + + +
+ According to various estimates, brown dwarfs (BD) should account for up to 25
+percent of all objects in the Galaxy. However, few of them are discovered and
+well-studied, both individually and as a population. Homogeneous and complete
+samples of brown dwarfs are needed for these kinds of studies. Due to their
+faintness, spectroscopic studies of brown dwarfs are rather laborious. For this
+reason, creating a significant reliable sample of brown dwarfs, confirmed by
+spectroscopic observations, seems unattainable at the moment. Numerous attempts
+have been made to search for and create a set of brown dwarfs using their
+colours as a decision rule applied to a vast amount of survey data. In this
+work, we use machine learning methods such as Random Forest Classifier,
+XGBoost, SVM Classifier and TabNet on PanStarrs DR1, 2MASS and WISE data to
+distinguish L and T brown dwarfs from objects of other spectral and luminosity
+classes. The explainability of the models is discussed. We also compare our
+models with classical decision rules, proving their efficiency and relevance.
+
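+ A hedged sketch of the kind of classifier setup described above, with
+synthetic stand-ins for the photometric colour features; real experiments would
+use cross-matched PanStarrs DR1/2MASS/WISE magnitudes, and the feature and
+label construction below is purely illustrative.
+
+ import numpy as np
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import classification_report
+
+ rng = np.random.default_rng(0)
+ X = rng.normal(size=(1000, 4))                      # placeholder colours, e.g. i-z, z-y, W1-W2, J-K
+ y = (X[:, 0] + 0.5 * X[:, 3] + rng.normal(scale=0.5, size=1000) > 1.0).astype(int)  # 1 = L/T dwarf
+
+ X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
+ clf = RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=0)
+ clf.fit(X_tr, y_tr)
+ print(classification_report(y_te, clf.predict(X_te)))
+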
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ LLM As DBA + + +
+ Database administrators (DBAs) play a crucial role in managing, maintaining
+and optimizing a database system to ensure data availability, performance, and
+reliability. However, it is hard and tedious for DBAs to manage a large number
+of database instances (e.g., millions of instances on cloud databases).
+Recently, large language models (LLMs) have shown great potential to understand
+valuable documents and accordingly generate reasonable answers. Thus, we
+propose D-Bot, an LLM-based database administrator that can continuously
+acquire database maintenance experience from textual sources, and provide
+reasonable, well-founded, in-time diagnosis and optimization advice for target
+databases. This paper presents a revolutionary LLM-centric framework for
+database maintenance, including (i) database maintenance knowledge detection
+from documents and tools, (ii) tree of thought reasoning for root cause
+analysis, and (iii) collaborative diagnosis among multiple LLMs. Our
+preliminary experimental results show that D-Bot can efficiently and
+effectively diagnose root causes, and our code is available at
+github.com/TsinghuaDatabaseGroup/DB-GPT.
+
+
+
+
+
+ + ♻ ☆ A Survey on Popularity Bias in Recommender Systems + + +
+ Recommender systems help people find relevant content in a personalized way. +One main promise of such systems is that they are able to increase the +visibility of items in the long tail, i.e., the lesser-known items in a +catalogue. Existing research, however, suggests that in many situations today's +recommendation algorithms instead exhibit a popularity bias, meaning that they +often focus on rather popular items in their recommendations. Such a bias may +not only lead to limited value of the recommendations for consumers and +providers in the short run, but it may also cause undesired reinforcement +effects over time. In this paper, we discuss the potential reasons for +popularity bias and we review existing approaches to detect, quantify and +mitigate popularity bias in recommender systems. Our survey therefore includes +both an overview of the computational metrics used in the literature as well as +a review of the main technical approaches to reduce the bias. We furthermore +critically discuss today's literature, where we observe that the research is +almost entirely based on computational experiments and on certain assumptions +regarding the practical effects of including long-tail items in the +recommendations. + +
+
+
+
+
+ + ♻ ☆ A Dynamics Theory of Implicit Regularization in Deep Low-Rank Matrix + Factorization + + +
+ Implicit regularization is an important way to interpret neural networks. +Recent theory starts to explain implicit regularization with the model of deep +matrix factorization (DMF) and analyze the trajectory of discrete gradient +dynamics in the optimization process. These discrete gradient dynamics are +relatively small but not infinitesimal, thus fitting well with the practical +implementation of neural networks. Currently, discrete gradient dynamics +analysis has been successfully applied to shallow networks but encounters the +difficulty of complex computation for deep networks. In this work, we introduce +another discrete gradient dynamics approach to explain implicit regularization, +i.e. landscape analysis. It mainly focuses on gradient regions, such as saddle +points and local minima. We theoretically establish the connection between +saddle point escaping (SPE) stages and the matrix rank in DMF. We prove that, +for a rank-R matrix reconstruction, DMF will converge to a second-order +critical point after R stages of SPE. This conclusion is further experimentally +verified on a low-rank matrix reconstruction problem. This work provides a new +theory to analyze implicit regularization in deep learning. + +
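+ A minimal numpy sketch of the deep matrix factorization setting discussed
+above, trained with small-but-finite gradient steps on a partially observed
+low-rank matrix; the depth, step size and initialization scale are illustrative
+choices, not the paper's experimental configuration.
+
+ import numpy as np
+
+ def chain(mats, dim):
+     """Product of a (possibly empty) list of square matrices."""
+     out = np.eye(dim)
+     for M in mats:
+         out = out @ M
+     return out
+
+ def deep_matrix_factorization(Y, mask, depth=3, lr=0.01, steps=2000, seed=0):
+     """Fit Y (observed where mask == 1) by the product W_1 ... W_depth of square
+     factors trained with plain gradient descent; small-but-finite steps mirror
+     the discrete gradient dynamics discussed above. Illustrative sketch only."""
+     dim = Y.shape[0]
+     rng = np.random.default_rng(seed)
+     Ws = [0.1 * rng.normal(size=(dim, dim)) for _ in range(depth)]
+     for _ in range(steps):
+         R = mask * (chain(Ws, dim) - Y)          # residual on observed entries
+         grads = [chain(Ws[:i], dim).T @ R @ chain(Ws[i + 1:], dim).T for i in range(depth)]
+         Ws = [W - lr * g for W, g in zip(Ws, grads)]
+     return chain(Ws, dim)
+
+ rng = np.random.default_rng(1)
+ U, V = rng.normal(size=(15, 2)), rng.normal(size=(2, 15))    # rank-2 ground truth
+ Y, mask = U @ V, (rng.random((15, 15)) < 0.6).astype(float)
+ X_hat = deep_matrix_factorization(Y, mask)
+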
+
+ comment: 15 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Branching for Motion Prediction using Motion Increments + + +
+ Human motion prediction (HMP) has emerged as a popular research topic due to +its diverse applications, but it remains a challenging task due to the +stochastic and aperiodic nature of future poses. Traditional methods rely on +hand-crafted features and machine learning techniques, which often struggle to +model the complex dynamics of human motion. Recent deep learning-based methods +have achieved success by learning spatio-temporal representations of motion, +but these models often overlook the reliability of motion data. Additionally, +the temporal and spatial dependencies of skeleton nodes are distinct. The +temporal relationship captures motion information over time, while the spatial +relationship describes body structure and the relationships between different +nodes. In this paper, we propose a novel spatio-temporal branching network +using incremental information for HMP, which decouples the learning of +temporal-domain and spatial-domain features, extracts more motion information, +and achieves complementary cross-domain knowledge learning through knowledge +distillation. Our approach effectively reduces noise interference and provides +more expressive information for characterizing motion by separately extracting +temporal and spatial features. We evaluate our approach on standard HMP +benchmarks and outperform state-of-the-art methods in terms of prediction +accuracy. + +
+
+
+
+
+ + ♻ ☆ A Cover Time Study of a non-Markovian Algorithm + + +
+ Given a traversal algorithm, the cover time is the expected number of steps
+needed to visit all nodes in a given graph. A smaller cover time means a higher
+exploration efficiency of the traversal algorithm. Although random walk
+algorithms have been studied extensively in the existing literature, there has
+been no cover time result for any non-Markovian method. In this work, we take a
+theoretical perspective and show that the negative feedback strategy (a
+count-based exploration method) is better than the naive random walk search. In
+particular, the former strategy can locally improve the search efficiency for
+an arbitrary graph. It also achieves smaller cover times for special but
+important graphs, including clique graphs, tree graphs, etc. Moreover, we make
+connections between our results and the reinforcement learning literature to
+give new insights on why classical UCB and MCTS algorithms are so useful.
+Various numerical results corroborate our theoretical findings.
+
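+ As a hedged illustration of the comparison above, one simple count-based
+("negative feedback") rule always moves to a least-visited neighbour, against a
+uniform random walk; the exact strategy analysed in the paper may differ, and
+the clique example below is only a toy setup.
+
+ import random
+
+ def cover_time(adj, start=0, negative_feedback=False, seed=0):
+     """Number of steps a walk needs to visit every node.
+     With negative_feedback=True the walker prefers less-visited neighbours
+     (a count-based exploration rule); otherwise it is a uniform random walk."""
+     rng = random.Random(seed)
+     counts = [0] * len(adj)
+     node, visited, steps = start, {start}, 0
+     counts[start] = 1
+     while len(visited) < len(adj):
+         nbrs = adj[node]
+         if negative_feedback:
+             least = min(counts[v] for v in nbrs)
+             nbrs = [v for v in nbrs if counts[v] == least]   # break ties uniformly
+         node = rng.choice(nbrs)
+         counts[node] += 1
+         visited.add(node)
+         steps += 1
+     return steps
+
+ n = 30
+ clique = [[j for j in range(n) if j != i] for i in range(n)]
+ naive = sum(cover_time(clique, seed=s) for s in range(50)) / 50
+ greedy = sum(cover_time(clique, negative_feedback=True, seed=s) for s in range(50)) / 50
+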
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ CLAMS: A Cluster Ambiguity Measure for Estimating Perceptual Variability + in Visual Clustering IEEE VIS 2023 + + +
+ Visual clustering is a common perceptual task in scatterplots that supports +diverse analytics tasks (e.g., cluster identification). However, even with the +same scatterplot, the ways of perceiving clusters (i.e., conducting visual +clustering) can differ due to the differences among individuals and ambiguous +cluster boundaries. Although such perceptual variability casts doubt on the +reliability of data analysis based on visual clustering, we lack a systematic +way to efficiently assess this variability. In this research, we study +perceptual variability in conducting visual clustering, which we call Cluster +Ambiguity. To this end, we introduce CLAMS, a data-driven visual quality +measure for automatically predicting cluster ambiguity in monochrome +scatterplots. We first conduct a qualitative study to identify key factors that +affect the visual separation of clusters (e.g., proximity or size difference +between clusters). Based on study findings, we deploy a regression module that +estimates the human-judged separability of two clusters. Then, CLAMS predicts +cluster ambiguity by analyzing the aggregated results of all pairwise +separability between clusters that are generated by the module. CLAMS +outperforms widely-used clustering techniques in predicting ground truth +cluster ambiguity. Meanwhile, CLAMS exhibits performance on par with human +annotators. We conclude our work by presenting two applications for optimizing +and benchmarking data mining techniques using CLAMS. The interactive demo of +CLAMS is available at clusterambiguity.dev. + +
+
+ comment: IEEE Transactions on Visualization and Computer Graphics (TVCG) + (Proc. IEEE VIS 2023); equally contributed by Hyeon Jeon and Ghulam Jilani + Quadri +
+
+
+
+
+ + ♻ ☆ ZADU: A Python Library for Evaluating the Reliability of Dimensionality + Reduction Embeddings IEEE VIS 2023 + + +
+ Dimensionality reduction (DR) techniques inherently distort the original +structure of input high-dimensional data, producing imperfect low-dimensional +embeddings. Diverse distortion measures have thus been proposed to evaluate the +reliability of DR embeddings. However, implementing and executing distortion +measures in practice has so far been time-consuming and tedious. To address +this issue, we present ZADU, a Python library that provides distortion +measures. ZADU is not only easy to install and execute but also enables +comprehensive evaluation of DR embeddings through three key features. First, +the library covers a wide range of distortion measures. Second, it +automatically optimizes the execution of distortion measures, substantially +reducing the running time required to execute multiple measures. Last, the +library informs how individual points contribute to the overall distortions, +facilitating the detailed analysis of DR embeddings. By simulating a real-world +scenario of optimizing DR embeddings, we verify that our optimization scheme +substantially reduces the time required to execute distortion measures. +Finally, as an application of ZADU, we present another library called ZADUVis +that allows users to easily create distortion visualizations that depict the +extent to which each region of an embedding suffers from distortions. + +
+
+ comment: 2023 IEEE Visualization and Visual Analytics (IEEE VIS 2023) Short + paper +
+
+
+
+
+ + ♻ ☆ Classes are not Clusters: Improving Label-based Evaluation of + Dimensionality Reduction IEEE VIS 2023 + + +
+ A common way to evaluate the reliability of dimensionality reduction (DR) +embeddings is to quantify how well labeled classes form compact, mutually +separated clusters in the embeddings. This approach is based on the assumption +that the classes stay as clear clusters in the original high-dimensional space. +However, in reality, this assumption can be violated; a single class can be +fragmented into multiple separated clusters, and multiple classes can be merged +into a single cluster. We thus cannot always assure the credibility of the +evaluation using class labels. In this paper, we introduce two novel quality +measures -- Label-Trustworthiness and Label-Continuity (Label-T&C) -- advancing +the process of DR evaluation based on class labels. Instead of assuming that +classes are well-clustered in the original space, Label-T&C work by (1) +estimating the extent to which classes form clusters in the original and +embedded spaces and (2) evaluating the difference between the two. A +quantitative evaluation showed that Label-T&C outperform widely used DR +evaluation measures (e.g., Trustworthiness and Continuity, Kullback-Leibler +divergence) in terms of the accuracy in assessing how well DR embeddings +preserve the cluster structure, and are also scalable. Moreover, we present +case studies demonstrating that Label-T&C can be successfully used for +revealing the intrinsic characteristics of DR techniques and their +hyperparameters. + +
+
+ comment: IEEE Transactions on Visualization and Computer Graphics (TVCG) + (Proc. IEEE VIS 2023) +
+
+
+
+
+ + ♻ ☆ Initial State Interventions for Deconfounded Imitation Learning + + +
+ Imitation learning suffers from causal confusion. This phenomenon occurs when +learned policies attend to features that do not causally influence the expert +actions but are instead spuriously correlated. Causally confused agents produce +low open-loop supervised loss but poor closed-loop performance upon deployment. +We consider the problem of masking observed confounders in a disentangled +representation of the observation space. Our novel masking algorithm leverages +the usual ability to intervene in the initial system state, avoiding any +requirement involving expert querying, expert reward functions, or causal graph +specification. Under certain assumptions, we theoretically prove that this +algorithm is conservative in the sense that it does not incorrectly mask +observations that causally influence the expert; furthermore, intervening on +the initial state serves to strictly reduce excess conservatism. The masking +algorithm is applied to behavior cloning for two illustrative control systems: +CartPole and Reacher. + +
+
+ comment: 62nd IEEE Conference on Decision and Control +
+
+
+
+
+ + ♻ ☆ Self-Supervised Coordinate Projection Network for Sparse-View Computed + Tomography + + +
+ In the present work, we propose a Self-supervised COordinate Projection
+nEtwork (SCOPE) to reconstruct an artifact-free CT image from a single
+sparse-view (SV) sinogram by solving the inverse tomography imaging problem.
+Compared with recent related works that solve similar problems using an
+implicit neural representation network (INR), our essential contribution is an
+effective and simple re-projection strategy that pushes the tomography image
+reconstruction quality beyond that of supervised deep learning CT
+reconstruction works. The proposed strategy is inspired by the simple
+relationship between linear algebra and inverse problems. To solve the
+under-determined linear equation system, we first introduce INR to constrain
+the solution space via an image continuity prior and achieve a rough solution.
+Second, we propose to generate a dense-view sinogram that improves the rank of
+the linear equation system and produces a more stable CT image solution space.
+Our experiment results demonstrate that the re-projection strategy
+significantly improves the image reconstruction quality (+3 dB in PSNR at
+least). Besides, we integrate the recent hash encoding into our SCOPE model,
+which greatly accelerates the model training. Finally, we evaluate SCOPE on
+parallel and fan X-ray beam SVCT reconstruction tasks. Experimental results
+indicate that the proposed SCOPE model outperforms two latest INR-based methods
+and two widely used supervised DL methods quantitatively and qualitatively.
+
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ Unconstrained Dynamic Regret via Sparse Coding + + +
+ Motivated by the challenge of nonstationarity in sequential decision making, +we study Online Convex Optimization (OCO) under the coupling of two problem +structures: the domain is unbounded, and the comparator sequence +$u_1,\ldots,u_T$ is arbitrarily time-varying. As no algorithm can guarantee low +regret simultaneously against all comparator sequences, handling this setting +requires moving from minimax optimality to comparator adaptivity. That is, +sensible regret bounds should depend on certain complexity measures of the +comparator relative to one's prior knowledge. + This paper achieves a new type of these adaptive regret bounds via a sparse +coding framework. The complexity of the comparator is measured by its energy +and its sparsity on a user-specified dictionary, which offers considerable +versatility. Equipped with a wavelet dictionary for example, our framework +improves the state-of-the-art bound (Jacobsen & Cutkosky, 2022) by adapting to +both ($i$) the magnitude of the comparator average $||\bar +u||=||\sum_{t=1}^Tu_t/T||$, rather than the maximum $\max_t||u_t||$; and ($ii$) +the comparator variability $\sum_{t=1}^T||u_t-\bar u||$, rather than the +uncentered sum $\sum_{t=1}^T||u_t||$. Furthermore, our analysis is simpler due +to decoupling function approximation from regret minimization. + +
+
+ comment: Small technical improvements + fixing typos +
+
+
+
+
+ + ♻ ☆ NIPD: A Federated Learning Person Detection Benchmark Based on + Real-World Non-IID Data IJCAI 23 + + +
+ Federated learning (FL), a privacy-preserving distributed machine learning
+paradigm, has been rapidly applied in wireless communication networks. FL
+enables Internet of Things (IoT) clients to obtain well-trained models while
+preventing privacy leakage. Person detection can be deployed on edge devices
+with limited computing power if combined with FL to process the video data
+directly at the edge. However, due to the different hardware and deployment
+scenarios of different cameras, the data collected by the cameras are
+non-independent and identically distributed (non-IID), and the global model
+derived from FL aggregation is less effective. Meanwhile, existing research
+lacks a public data set for real-world FL object detection, which is not
+conducive to studying the non-IID problem on IoT cameras. Therefore, we open
+source a non-IID IoT person detection (NIPD) data set, which is collected from
+five different cameras. To our knowledge, this is the first true device-based
+non-IID person detection data set. Based on this data set, we explain how to
+establish an FL experimental platform and provide a benchmark for non-IID
+person detection. NIPD is expected to promote the application of FL and the
+security of smart cities.
+
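+ A minimal sketch of the data-size-weighted aggregation (FedAvg-style) that a
+benchmark like this would exercise across cameras with non-IID data; the
+aggregation rule and the parameter names are assumptions for illustration, not
+necessarily what the NIPD benchmark uses.
+
+ import numpy as np
+
+ def fedavg(client_weights, client_sizes):
+     """Aggregate client model parameters by a data-size-weighted average.
+     client_weights: list of dicts mapping parameter name -> np.ndarray."""
+     total = float(sum(client_sizes))
+     return {name: sum((n / total) * w[name] for w, n in zip(client_weights, client_sizes))
+             for name in client_weights[0]}
+
+ # toy usage with two "cameras" holding different amounts of (non-IID) data
+ c1 = {"conv.weight": np.ones((3, 3)), "fc.bias": np.zeros(4)}
+ c2 = {"conv.weight": 3 * np.ones((3, 3)), "fc.bias": np.ones(4)}
+ aggregated = fedavg([c1, c2], client_sizes=[100, 300])   # pulled closer to client 2
+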
+
+ comment: 8 pages, 5 figures, 3 tables, FL-IJCAI 23 conference +
+
+
+
+
+ + ♻ ☆ Hard Sample Mining Enabled Supervised Contrastive Feature Learning for + Wind Turbine Pitch System Fault Diagnosis + + +
+ The efficient utilization of wind power by wind turbines relies on the +ability of their pitch systems to adjust blade pitch angles in response to +varying wind speeds. However, the presence of multiple health conditions in the +pitch system due to the long-term wear and tear poses challenges in accurately +classifying them, thus increasing the maintenance cost of wind turbines or even +damaging them. This paper proposes a novel method based on hard sample +mining-enabled supervised contrastive learning (HSMSCL) to address this +problem. The proposed method employs cosine similarity to identify hard samples +and subsequently, leverages supervised contrastive learning to learn more +discriminative representations by constructing hard sample pairs. Furthermore, +the hard sample mining framework in the proposed method also constructs hard +samples with learned representations to make the training process of the +multilayer perceptron (MLP) more challenging and make it a more effective +classifier. The proposed approach progressively improves the fault diagnosis +model by introducing hard samples in the SCL and MLP phases, thus enhancing its +performance in complex multi-class fault diagnosis tasks. + To evaluate the effectiveness of the proposed method, two real datasets +comprising wind turbine pitch system cog belt fracture data are utilized. The +fault diagnosis performance of the proposed method is compared against existing +methods, and the results demonstrate its superior performance. The proposed +approach exhibits significant improvements in fault diagnosis performance, +providing promising prospects for enhancing the reliability and efficiency of +wind turbine pitch system fault diagnosis. + +
+
+
+
+
+ + ♻ ☆ Completeness of Atomic Structure Representations + + +
+ In this paper, we address the challenge of obtaining a comprehensive and +symmetric representation of point particle groups, such as atoms in a molecule, +which is crucial in physics and theoretical chemistry. The problem has become +even more important with the widespread adoption of machine-learning techniques +in science, as it underpins the capacity of models to accurately reproduce +physical relationships while being consistent with fundamental symmetries and +conservation laws. However, the descriptors that are commonly used to represent +point clouds -- most notably those adopted to describe matter at the atomic +scale -- are unable to distinguish between special arrangements of particles. +This makes it impossible to machine learn their properties. Frameworks that are +provably complete exist but are only so in the limit in which they +simultaneously describe the mutual relationship between all atoms, which is +impractical. We present a novel approach to construct descriptors of finite +correlations based on the relative arrangement of particle triplets, which can +be employed to create symmetry-adapted models with universal approximation +capabilities. Our strategy is demonstrated on a class of atomic arrangements +that are specifically built to defy a broad class of conventional symmetric +descriptors, showcasing its potential for addressing their limitations. + +
+
+
+
+
+ + ♻ ☆ Learning to Relight Portrait Images via a Virtual Light Stage and + Synthetic-to-Real Adaptation SIGGRAPH + + +
+ Given a portrait image of a person and an environment map of the target +lighting, portrait relighting aims to re-illuminate the person in the image as +if the person appeared in an environment with the target lighting. To achieve +high-quality results, recent methods rely on deep learning. An effective +approach is to supervise the training of deep neural networks with a +high-fidelity dataset of desired input-output pairs, captured with a light +stage. However, acquiring such data requires an expensive special capture rig +and time-consuming efforts, limiting access to only a few resourceful +laboratories. To address the limitation, we propose a new approach that can +perform on par with the state-of-the-art (SOTA) relighting methods without +requiring a light stage. Our approach is based on the realization that a +successful relighting of a portrait image depends on two conditions. First, the +method needs to mimic the behaviors of physically-based relighting. Second, the +output has to be photorealistic. To meet the first condition, we propose to +train the relighting network with training data generated by a virtual light +stage that performs physically-based rendering on various 3D synthetic humans +under different environment maps. To meet the second condition, we develop a +novel synthetic-to-real approach to bring photorealism to the relighting +network output. In addition to achieving SOTA results, our approach offers +several advantages over the prior methods, including controllable glares on +glasses and more temporally-consistent results for relighting videos. + +
+
+ comment: To appear in ACM Transactions on Graphics (SIGGRAPH Asia 2022). 21 + pages, 25 figures, 7 tables. Project page: + https://research.nvidia.com/labs/dir/lumos/ +
+
+
+
+
+ + ♻ ☆ Exploring Machine Learning and Transformer-based Approaches for + Deceptive Text Classification: A Comparative Analysis + + +
+ Deceptive text classification is a critical task in natural language
+processing that aims to identify deceptive or fraudulent content. This study
+presents a comparative analysis of machine learning and transformer-based
+approaches for deceptive text classification. We investigate the effectiveness
+of traditional machine learning algorithms and state-of-the-art transformer
+models, such as BERT, XLNET, DistilBERT, and RoBERTa, in detecting deceptive
+text. A labeled dataset consisting of deceptive and non-deceptive texts is used
+for training and evaluation purposes. Through extensive experimentation, we
+compare the performance metrics, including accuracy, precision, recall, and F1
+score, of the different approaches. The results of this study shed light on the
+strengths and limitations of machine learning and transformer-based methods for
+deceptive text classification, enabling researchers and practitioners to make
+informed decisions when dealing with deceptive content.
+
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Trained Transformers Learn Linear Models In-Context + + +
+ Attention-based neural networks such as transformers have demonstrated a +remarkable ability to exhibit in-context learning (ICL): Given a short prompt +sequence of tokens from an unseen task, they can formulate relevant per-token +and next-token predictions without any parameter updates. By embedding a +sequence of labeled training data and unlabeled test data as a prompt, this +allows for transformers to behave like supervised learning algorithms. Indeed, +recent work has shown that when training transformer architectures over random +instances of linear regression problems, these models' predictions mimic those +of ordinary least squares. + Towards understanding the mechanisms underlying this phenomenon, we +investigate the dynamics of ICL in transformers with a single linear +self-attention layer trained by gradient flow on linear regression tasks. We +show that despite non-convexity, gradient flow with a suitable random +initialization finds a global minimum of the objective function. At this global +minimum, when given a test prompt of labeled examples from a new prediction +task, the transformer achieves prediction error competitive with the best +linear predictor over the test prompt distribution. We additionally +characterize the robustness of the trained transformer to a variety of +distribution shifts and show that although a number of shifts are tolerated, +shifts in the covariate distribution of the prompts are not. Motivated by this, +we consider a generalized ICL setting where the covariate distributions can +vary across prompts. We show that although gradient flow succeeds at finding a +global minimum in this setting, the trained transformer is still brittle under +mild covariate shifts. We complement this finding with experiments on large, +nonlinear transformer architectures which we show are more robust under +covariate shifts. + +
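A minimal sketch of the studied setting follows: prompts built from random linear regression tasks, a single softmax-free (linear) self-attention layer, and squared-error training on the query token. Dimensions, the initialization scale, and the merged key-query parameterization are assumptions made for illustration.

```python
import torch

d, n_ctx, batch_size, steps, lr = 5, 20, 64, 2000, 0.01
W_kq = (0.1 * torch.randn(d + 1, d + 1)).requires_grad_()   # merged key/query weights
W_v  = (0.1 * torch.randn(d + 1, d + 1)).requires_grad_()   # value weights
opt = torch.optim.SGD([W_kq, W_v], lr=lr)

def sample_prompts():
    w = torch.randn(batch_size, d, 1)                 # a fresh regression task per prompt
    x = torch.randn(batch_size, n_ctx + 1, d)
    y = (x @ w).squeeze(-1)
    z = torch.cat([x, y.unsqueeze(-1)], dim=-1)       # tokens are (x_i, y_i) pairs
    z[:, -1, -1] = 0.0                                # hide the query label
    return z, y[:, -1]

for _ in range(steps):
    z, target = sample_prompts()
    attn = (z @ W_kq) @ z.transpose(1, 2) / n_ctx     # linear (softmax-free) attention
    out = attn @ (z @ W_v)
    pred = out[:, -1, -1]                             # prediction read off the query token
    loss = ((pred - target) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()
```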
+
+ comment: 50 pages, experiments added, reference added, typo corrected +
+
+
+
+
+ + ♻ ☆ Precise High-Dimensional Asymptotics for Quantifying Heterogeneous + Transfers + + +
+ The problem of learning one task with samples from another task has received +much interest recently. In this paper, we ask a fundamental question: when is +combining data from two tasks better than learning one task alone? Intuitively, +the transfer effect from one task to another task depends on dataset shifts +such as sample sizes and covariance matrices. However, quantifying such a +transfer effect is challenging since we need to compare the risks between joint +learning and single-task learning, and the comparative advantage of one over +the other depends on the exact kind of dataset shift between both tasks. This +paper uses random matrix theory to tackle this challenge in a linear regression +setting with two tasks. We give precise asymptotics about the excess risks of +some commonly used estimators in the high-dimensional regime, when the sample +sizes increase proportionally with the feature dimension at fixed ratios. The +precise asymptotics is provided as a function of the sample sizes and +covariate/model shifts, which can be used to study transfer effects: In a +random-effects model, we give conditions to determine positive and negative +transfers between learning two tasks versus single-task learning; the +conditions reveal intricate relations between dataset shifts and transfer +effects. Simulations justify the validity of the asymptotics in finite +dimensions. Our analysis examines several functions of two different sample +covariance matrices, revealing some estimates that generalize classical results +in the random matrix theory literature, which may be of independent interest. + +
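The following small simulation, with assumed dimensions and an isotropic design, illustrates the underlying question of positive versus negative transfer by comparing single-task and pooled least squares; it is an illustration only, not the paper's random-matrix analysis.

```python
import numpy as np

rng = np.random.default_rng(0)
p, n1, n2, shift = 100, 150, 150, 0.3
beta1 = rng.normal(size=p) / np.sqrt(p)
beta2 = beta1 + shift * rng.normal(size=p) / np.sqrt(p)   # model shift between tasks

def risk(beta_hat):
    # excess risk for task 1 under an identity covariate covariance
    return np.sum((beta_hat - beta1) ** 2)

X1, X2 = rng.normal(size=(n1, p)), rng.normal(size=(n2, p))
y1 = X1 @ beta1 + rng.normal(size=n1)
y2 = X2 @ beta2 + rng.normal(size=n2)

single = np.linalg.lstsq(X1, y1, rcond=None)[0]
joint = np.linalg.lstsq(np.vstack([X1, X2]), np.concatenate([y1, y2]), rcond=None)[0]
print(f"single-task risk {risk(single):.3f}  joint risk {risk(joint):.3f}")
# Increasing `shift` moves the comparison from positive transfer to negative transfer.
```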
+
+ comment: 64 pages, 6 figures; We thoroughly revised the paper by adding new + results and reorganizing the presentation +
+
+
+
+
+ + ♻ ☆ Homophily-enhanced Structure Learning for Graph Clustering CIKM'23 + + +
+ Graph clustering is a fundamental task in graph analysis, and recent advances +in utilizing graph neural networks (GNNs) have shown impressive results. +Despite the success of existing GNN-based graph clustering methods, they often +overlook the quality of graph structure, which is inherent in real-world graphs +due to their sparse and multifarious nature, leading to subpar performance. +Graph structure learning allows refining the input graph by adding missing +links and removing spurious connections. However, previous endeavors in graph +structure learning have predominantly centered around supervised settings, and +cannot be directly applied to our specific clustering tasks due to the absence +of ground-truth labels. To bridge the gap, we propose a novel method called +\textbf{ho}mophily-enhanced structure \textbf{le}arning for graph clustering +(HoLe). Our motivation stems from the observation that subtly enhancing the +degree of homophily within the graph structure can significantly improve GNNs +and clustering outcomes. To realize this objective, we develop two +clustering-oriented structure learning modules, i.e., hierarchical correlation +estimation and cluster-aware sparsification. The former module enables a more +accurate estimation of pairwise node relationships by leveraging guidance from +latent and clustering spaces, while the latter one generates a sparsified +structure based on the similarity matrix and clustering assignments. +Additionally, we devise a joint optimization approach alternating between +training the homophily-enhanced structure learning and GNN-based clustering, +thereby enforcing their reciprocal effects. Extensive experiments on seven +benchmark datasets of various types and scales, across a range of clustering +metrics, demonstrate the superiority of HoLe against state-of-the-art +baselines. + +
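A hedged sketch of what a cluster-aware sparsification step could look like is given below: it keeps only the strongest similarities between nodes assigned to the same cluster and symmetrizes the result. The function name, the top-k rule, and the symmetrization are assumptions, not HoLe's exact module.

```python
import numpy as np

def cluster_aware_sparsify(similarity, assignments, k=5):
    """Keep the k strongest intra-cluster similarities per node (illustrative)."""
    n = similarity.shape[0]
    adj = np.zeros_like(similarity, dtype=float)
    same = assignments[:, None] == assignments[None, :]
    masked = np.where(same, similarity, -np.inf)      # only intra-cluster edges survive
    np.fill_diagonal(masked, -np.inf)
    for i in range(n):
        top = np.argsort(masked[i])[-k:]
        keep = top[masked[i, top] > -np.inf]
        adj[i, keep] = similarity[i, keep]
    return np.maximum(adj, adj.T)                     # symmetrize the refined structure
```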
+
+ comment: 11 pages with 7 figures. Accepted by CIKM'23 +
+
+
+
+
+ + ♻ ☆ A Law of Data Separation in Deep Learning + + +
+ While deep learning has enabled significant advances in many areas of
+science, its black-box nature hinders architecture design for future artificial
+intelligence applications and interpretation for high-stakes decision making.
+We addressed this issue by studying the fundamental question of how deep neural
+networks process data in the intermediate layers. Our finding is a simple and
+quantitative law that governs how deep neural networks separate data according
+to class membership throughout all layers for classification. This law shows
+that each layer improves data separation at a constant geometric rate, and its
+emergence is observed in a collection of network architectures and datasets
+during training. This law offers practical guidelines for designing
+architectures, improving model robustness and out-of-sample performance, as
+well as interpreting the predictions.
+
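One simple way to probe such a law on a trained network is to compute a separation statistic, such as the ratio of within-class to between-class scatter, for the features of every layer; the metric below is an illustrative choice rather than the paper's exact quantity.

```python
import numpy as np

def separation_fuzziness(features, labels):
    """Within-class over between-class scatter; smaller means better-separated classes."""
    mu = features.mean(axis=0)
    within, between = 0.0, 0.0
    for c in np.unique(labels):
        fc = features[labels == c]
        mu_c = fc.mean(axis=0)
        within += ((fc - mu_c) ** 2).sum()
        between += len(fc) * ((mu_c - mu) ** 2).sum()
    return within / between

# Toy check on two synthetic classes; on a real network one would evaluate this
# per layer. If the law holds, its logarithm decreases roughly linearly with depth.
feats = np.vstack([np.random.randn(50, 8) + 3, np.random.randn(50, 8)])
labs = np.array([0] * 50 + [1] * 50)
print(separation_fuzziness(feats, labs))
```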
+
+ comment: Accepted at PNAS +
+
+
+
+
+ + ♻ ☆ A Survey on Training Challenges in Generative Adversarial Networks for + Biomedical Image Analysis + + +
+ In biomedical image analysis, the applicability of deep learning methods is
+directly impacted by the quantity of image data available. This is due to deep
+learning models requiring large image datasets to provide high-level
+performance. Generative Adversarial Networks (GANs) have been widely utilized
+to address data limitations through the generation of synthetic biomedical
+images. GANs consist of two models: the generator, which learns how to produce
+synthetic images based on the feedback it receives, and the discriminator,
+which classifies an image as synthetic or real and provides feedback to the
+generator. Throughout the training process, a GAN can experience several
+technical challenges that impede the generation of suitable synthetic imagery.
+First, the mode collapse problem, whereby the generator either produces an
+identical image or produces a uniform image from distinct input features.
+Second, the non-convergence problem, whereby the gradient descent optimizer
+fails to reach a Nash equilibrium. Third, the vanishing gradient problem,
+whereby unstable training behavior occurs because the discriminator achieves
+optimal classification performance, resulting in no meaningful feedback being
+provided to the generator. These problems result in the production of synthetic
+imagery that is blurry, unrealistic, and less diverse. To date, there has been
+no survey article outlining the impact of these technical challenges in the
+context of the biomedical imagery domain. This work presents a review and
+taxonomy based on solutions to the training problems of GANs in the biomedical
+imaging domain. This survey highlights important challenges and outlines future
+research directions about the training of GANs in the domain of biomedical
+imagery.
+
+
+ comment: Submitted to the AI Review Journal +
+
+
+
+
+
+
+
+ + Multimedia 10 + +
+
+
+ + ☆ Audio-Visual Spatial Integration and Recursive Attention for Robust + Sound Source Localization ACM MM 2023 + + +
+ The objective of the sound source localization task is to enable machines to
+detect the location of sound-making objects within a visual scene. While the
+audio modality provides spatial cues to locate the sound source, existing
+approaches only use audio as an auxiliary role to compare spatial regions of
+the visual modality. Humans, on the other hand, utilize both audio and visual
+modalities as spatial cues to locate sound sources. In this paper, we propose
+an audio-visual spatial integration network that integrates spatial cues from
+both modalities to mimic human behavior when detecting sound-making objects.
+Additionally, we introduce a recursive attention network to mimic human
+behavior of iterative focusing on objects, resulting in more accurate attention
+regions. To effectively encode spatial information from both modalities, we
+propose audio-visual pair matching loss and spatial region alignment loss. By
+utilizing the spatial cues of audio-visual modalities and recursively focusing
+on objects, our method can perform more robust sound source localization.
+Comprehensive experimental results on the Flickr SoundNet and VGG-Sound Source
+datasets demonstrate the superiority of our proposed method over existing
+approaches. Our code is available at: https://github.com/VisualAIKHU/SIRA-SSL
+
+
+ comment: Camera-Ready, ACM MM 2023 +
+
+
+
+
+ + ☆ Versatile Face Animator: Driving Arbitrary 3D Facial Avatar in RGBD + Space ACM MM2023 + + +
+ Creating realistic 3D facial animation is crucial for various applications in +the movie production and gaming industry, especially with the burgeoning demand +in the metaverse. However, prevalent methods such as blendshape-based +approaches and facial rigging techniques are time-consuming, labor-intensive, +and lack standardized configurations, making facial animation production +challenging and costly. In this paper, we propose a novel self-supervised +framework, Versatile Face Animator, which combines facial motion capture with +motion retargeting in an end-to-end manner, eliminating the need for +blendshapes or rigs. Our method has the following two main characteristics: 1) +we propose an RGBD animation module to learn facial motion from raw RGBD videos +by hierarchical motion dictionaries and animate RGBD images rendered from 3D +facial mesh coarse-to-fine, enabling facial animation on arbitrary 3D +characters regardless of their topology, textures, blendshapes, and rigs; and +2) we introduce a mesh retarget module to utilize RGBD animation to create 3D +facial animation by manipulating facial mesh with controller transformations, +which are estimated from dense optical flow fields and blended together with +geodesic-distance-based weights. Comprehensive experiments demonstrate the +effectiveness of our proposed framework in generating impressive 3D facial +animation results, highlighting its potential as a promising solution for the +cost-effective and efficient production of facial animation in the metaverse. + +
+
+ comment: Accepted by ACM MM2023 +
+
+
+
+
+ + ☆ ViGT: Proposal-free Video Grounding with Learnable Token in Transformer SC + + +
+ The video grounding (VG) task aims to locate the queried action or event in +an untrimmed video based on rich linguistic descriptions. Existing +proposal-free methods are trapped in complex interaction between video and +query, overemphasizing cross-modal feature fusion and feature correlation for +VG. In this paper, we propose a novel boundary regression paradigm that +performs regression token learning in a transformer. Particularly, we present a +simple but effective proposal-free framework, namely Video Grounding +Transformer (ViGT), which predicts the temporal boundary using a learnable +regression token rather than multi-modal or cross-modal features. In ViGT, the +benefits of a learnable token are manifested as follows. (1) The token is +unrelated to the video or the query and avoids data bias toward the original +video and query. (2) The token simultaneously performs global context +aggregation from video and query features. First, we employed a sharing feature +encoder to project both video and query into a joint feature space before +performing cross-modal co-attention (i.e., video-to-query attention and +query-to-video attention) to highlight discriminative features in each +modality. Furthermore, we concatenated a learnable regression token [REG] with +the video and query features as the input of a vision-language transformer. +Finally, we utilized the token [REG] to predict the target moment and visual +features to constrain the foreground and background probabilities at each +timestamp. The proposed ViGT performed well on three public datasets: ANet +Captions, TACoS and YouCookII. Extensive ablation studies and qualitative +analysis further validated the interpretability of ViGT. + +
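A condensed sketch of the regression-token idea is shown below: a learnable [REG] embedding is prepended to the fused video and query features, passed through a transformer encoder, and mapped to a normalized (start, end) pair. Layer counts, dimensions, and the assumption that co-attention has already been applied are illustrative choices, not ViGT's exact architecture.

```python
import torch
import torch.nn as nn

class RegTokenGrounder(nn.Module):
    def __init__(self, dim=256, num_layers=4, num_heads=8):
        super().__init__()
        self.reg_token = nn.Parameter(torch.zeros(1, 1, dim))          # learnable [REG]
        layer = nn.TransformerEncoderLayer(dim, num_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers)
        self.head = nn.Sequential(nn.Linear(dim, dim), nn.ReLU(), nn.Linear(dim, 2))

    def forward(self, video_feats, query_feats):
        # video_feats: (B, T, dim), query_feats: (B, L, dim), already co-attended
        reg = self.reg_token.expand(video_feats.size(0), -1, -1)
        x = torch.cat([reg, video_feats, query_feats], dim=1)
        x = self.encoder(x)
        return self.head(x[:, 0]).sigmoid()            # normalized (start, end)

model = RegTokenGrounder()
out = model(torch.randn(2, 64, 256), torch.randn(2, 12, 256))
print(out.shape)   # torch.Size([2, 2])
```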
+
+ comment: This paper has been accepted by SCIENCE CHINA Information Sciences +
+
+
+
+
+ + ☆ Audio is all in one: speech-driven gesture synthetics using WavLM + pre-trained model + + +
+ The generation of co-speech gestures for digital humans is an emerging area
+in the field of virtual human creation. Prior research has made progress by
+using acoustic and semantic information as input and adopting classification
+methods to identify the speaker's identity and emotion for driving co-speech
+gesture generation. However, this endeavour still faces significant challenges.
+These challenges go beyond the intricate interplay between co-speech gestures,
+speech acoustics, and semantics; they also encompass the complexities
+associated with personality, emotion, and other obscure but important factors.
+This paper introduces "diffmotion-v2," a speech-conditional diffusion-based and
+non-autoregressive transformer-based generative model with the WavLM
+pre-trained model. It can produce individual and stylized full-body co-speech
+gestures using only raw speech audio, eliminating the need for complex
+multimodal processing and manual annotation. Firstly, considering that speech
+audio not only contains acoustic and semantic features but also conveys
+personality traits, emotions, and more subtle information related to
+accompanying gestures, we pioneer the adaptation of WavLM, a large-scale
+pre-trained model, to extract low-level and high-level audio information.
+Secondly, we introduce an adaptive layer norm architecture in the
+transformer-based layer to learn the relationship between speech information
+and accompanying gestures. Extensive subjective evaluation experiments are
+conducted on the Trinity, ZEGGS, and BEAT datasets to confirm WavLM's and the
+model's ability to synthesize natural co-speech gestures with various styles.
+
+
+ comment: 10 pages, 5 figures, 1 table +
+
+
+
+
+ + ☆ Semantics2Hands: Transferring Hand Motion Semantics between Avatars + + +
+ Human hands, the primary means of non-verbal communication, convey intricate +semantics in various scenarios. Due to the high sensitivity of individuals to +hand motions, even minor errors in hand motions can significantly impact the +user experience. Real applications often involve multiple avatars with varying +hand shapes, highlighting the importance of maintaining the intricate semantics +of hand motions across the avatars. Therefore, this paper aims to transfer the +hand motion semantics between diverse avatars based on their respective hand +models. To address this problem, we introduce a novel anatomy-based semantic +matrix (ASM) that encodes the semantics of hand motions. The ASM quantifies the +positions of the palm and other joints relative to the local frame of the +corresponding joint, enabling precise retargeting of hand motions. +Subsequently, we obtain a mapping function from the source ASM to the target +hand joint rotations by employing an anatomy-based semantics reconstruction +network (ASRN). We train the ASRN using a semi-supervised learning strategy on +the Mixamo and InterHand2.6M datasets. We evaluate our method in intra-domain +and cross-domain hand motion retargeting tasks. The qualitative and +quantitative results demonstrate the significant superiority of our ASRN over +the state-of-the-arts. + +
+
+ comment: Accepted to MM 2023, 9 pages, 10 figures. Project page: + https://abcyzj.github.io/S2H/ +
+
+
+
+
+ + ♻ ☆ Deep Learning for Diverse Data Types Steganalysis: A Review + + +
+ Steganography and steganalysis are two interrelated aspects of the field of +information security. Steganography seeks to conceal communications, whereas +steganalysis is aimed to either find them or even, if possible, recover the +data they contain. Steganography and steganalysis have attracted a great deal +of interest, particularly from law enforcement. Steganography is often used by +cybercriminals and even terrorists to avoid being captured while in possession +of incriminating evidence, even encrypted, since cryptography is prohibited or +restricted in many countries. Therefore, knowledge of cutting-edge techniques +to uncover concealed information is crucial in exposing illegal acts. Over the +last few years, a number of strong and reliable steganography and steganalysis +techniques have been introduced in the literature. This review paper provides a +comprehensive overview of deep learning-based steganalysis techniques used to +detect hidden information within digital media. The paper covers all types of +cover in steganalysis, including image, audio, and video, and discusses the +most commonly used deep learning techniques. In addition, the paper explores +the use of more advanced deep learning techniques, such as deep transfer +learning (DTL) and deep reinforcement learning (DRL), to enhance the +performance of steganalysis systems. The paper provides a systematic review of +recent research in the field, including data sets and evaluation metrics used +in recent studies. It also presents a detailed analysis of DTL-based +steganalysis approaches and their performance on different data sets. The +review concludes with a discussion on the current state of deep learning-based +steganalysis, challenges, and future research directions. + +
+
+
+
+
+ + ♻ ☆ Towards Unified Text-based Person Retrieval: A Large-scale + Multi-Attribute and Language Search Benchmark + + +
+ In this paper, we introduce a large Multi-Attribute and Language Search +dataset for text-based person retrieval, called MALS, and explore the +feasibility of performing pre-training on both attribute recognition and +image-text matching tasks in one stone. In particular, MALS contains 1,510,330 +image-text pairs, which is about 37.5 times larger than prevailing CUHK-PEDES, +and all images are annotated with 27 attributes. Considering the privacy +concerns and annotation costs, we leverage the off-the-shelf diffusion models +to generate the dataset. To verify the feasibility of learning from the +generated data, we develop a new joint Attribute Prompt Learning and Text +Matching Learning (APTM) framework, considering the shared knowledge between +attribute and text. As the name implies, APTM contains an attribute prompt +learning stream and a text matching learning stream. (1) The attribute prompt +learning leverages the attribute prompts for image-attribute alignment, which +enhances the text matching learning. (2) The text matching learning facilitates +the representation learning on fine-grained details, and in turn, boosts the +attribute prompt learning. Extensive experiments validate the effectiveness of +the pre-training on MALS, achieving state-of-the-art retrieval performance via +APTM on three challenging real-world benchmarks. In particular, APTM achieves a +consistent improvement of +6.96%, +7.68%, and +16.95% Recall@1 accuracy on +CUHK-PEDES, ICFG-PEDES, and RSTPReid datasets by a clear margin, respectively. + +
+
+
+
+
+ + ♻ ☆ Training Multimedia Event Extraction With Generated Images and Captions + + +
+ Contemporary news reporting increasingly features multimedia content, +motivating research on multimedia event extraction. However, the task lacks +annotated multimodal training data and artificially generated training data +suffer from distribution shift from real-world data. In this paper, we propose +Cross-modality Augmented Multimedia Event Learning (CAMEL), which successfully +utilizes artificially generated multimodal training data and achieves +state-of-the-art performance. We start with two labeled unimodal datasets in +text and image respectively, and generate the missing modality using +off-the-shelf image generators like Stable Diffusion and image captioners like +BLIP. After that, we train the network on the resultant multimodal datasets. In +order to learn robust features that are effective across domains, we devise an +iterative and gradual training strategy. Substantial experiments show that +CAMEL surpasses state-of-the-art (SOTA) baselines on the M2E2 benchmark. On +multimedia events in particular, we outperform the prior SOTA by 4.2% F1 on +event mention identification and by 9.8% F1 on argument identification, which +indicates that CAMEL learns synergistic representations from the two +modalities. Our work demonstrates a recipe to unleash the power of synthetic +training data in structured prediction. + +
+
+
+
+
+ + ♻ ☆ PersonalTailor: Personalizing 2D Pattern Design from 3D Garment Point + Clouds + + +
+ Garment pattern design aims to convert a 3D garment to the corresponding 2D
+panels and their sewing structure. Existing methods rely either on template
+fitting with heuristics and prior assumptions, or on model learning with
+complicated shape parameterization. Importantly, both approaches do not allow
+for personalization of the output garment, for which there is increasing demand
+today. To meet this demand, we introduce PersonalTailor: a personalized 2D
+pattern design method, where the user can input specific constraints or demands
+(in language or sketch) for personal 2D panel fabrication from 3D point clouds.
+PersonalTailor first learns multi-modal panel embeddings based on unsupervised
+cross-modal association and attentive fusion. It then predicts binary panel
+masks individually using a transformer encoder-decoder framework. Extensive
+experiments show that our PersonalTailor excels on both personalized and
+standard pattern fabrication tasks.
+
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ A Closer Look into Recent Video-based Learning Research: A Comprehensive + Review of Video Characteristics, Tools, Technologies, and Learning + Effectiveness + + +
+ People increasingly use videos on the Web as a source for learning. To
+support this way of learning, researchers and developers are continuously
+developing tools, proposing guidelines, analyzing data, and conducting
+experiments. However, it is still not clear what characteristics a video should
+have to be an effective learning medium. In this paper, we present a
+comprehensive review of 257 articles on video-based learning for the period
+from 2016 to 2021. One of the aims of the review is to identify the video
+characteristics that have been explored by previous work. Based on our
+analysis, we suggest a taxonomy which organizes the video characteristics and
+contextual aspects into eight categories: (1) audio features, (2) visual
+features, (3) textual features, (4) instructor behavior, (5) learner
+activities, (6) interactive features (quizzes, etc.), (7) production style, and
+(8) instructional design. Also, we identify four representative research
+directions: (1) proposals of tools to support video-based learning, (2) studies
+with controlled experiments, (3) data analysis studies, and (4) proposals of
+design guidelines for learning videos. We find that the most explored
+characteristics are textual features followed by visual features, learner
+activities, and interactive features. Text of transcripts, video frames, and
+images (figures and illustrations) are most frequently used by tools that
+support learning through videos. The learner activity is heavily explored
+through log files in data analysis studies, and interactive features have been
+frequently scrutinized in controlled experiments. We complement our review by
+contrasting research findings that investigate the impact of video
+characteristics on learning effectiveness, report on tasks and technologies
+used to develop tools that support learning, and summarize trends in design
+guidelines for producing learning videos.
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 33 + +
+
+
+ + ☆ EXPRESSO: A Benchmark and Analysis of Discrete Expressive Speech + Resynthesis + + +
+ Recent work has shown that it is possible to resynthesize high-quality speech +based, not on text, but on low bitrate discrete units that have been learned in +a self-supervised fashion and can therefore capture expressive aspects of +speech that are hard to transcribe (prosody, voice styles, non-verbal +vocalization). The adoption of these methods is still limited by the fact that +most speech synthesis datasets are read, severely limiting spontaneity and +expressivity. Here, we introduce Expresso, a high-quality expressive speech +dataset for textless speech synthesis that includes both read speech and +improvised dialogues rendered in 26 spontaneous expressive styles. We +illustrate the challenges and potentials of this dataset with an expressive +resynthesis benchmark where the task is to encode the input in low-bitrate +units and resynthesize it in a target voice while preserving content and style. +We evaluate resynthesis quality with automatic metrics for different +self-supervised discrete encoders, and explore tradeoffs between quality, +bitrate and invariance to speaker and style. All the dataset, evaluation +metrics and baseline models are open source + +
+
+
+
+
+ + ☆ A Preliminary Study of the Intrinsic Relationship between Complexity and + Alignment + + +
+ Training large language models (LLMs) with open-domain instruction data has +yielded remarkable success in aligning to end tasks and user preferences. +Extensive research has highlighted that enhancing the quality and diversity of +instruction data consistently improves performance. However, the impact of data +complexity, as a crucial metric, remains relatively unexplored in three +aspects: (1) scaling law, where the sustainability of performance improvements +with increasing complexity is uncertain, (2) additional tokens, whether the +improvement brought by complexity comes from introducing more training tokens, +and (3) curriculum tuning, where the potential advantages of incorporating +instructions ranging from easy to difficult are not yet fully understood. In +this paper, we propose \textit{tree-instruct} to systematically enhance the +complexity of instruction data in a controllable manner. This approach adds a +specified number of nodes into the instruction semantic tree, yielding new +instruction data based on the modified tree. By adjusting the number of added +nodes, we can control the difficulty level in the modified instruction data. +Our preliminary experiments reveal the following insights: (1) Increasing +complexity consistently leads to sustained performance improvements. For +instance, using 1,000 instruction data and 10 nodes resulted in a substantial +24\% increase in win rate. (2) Under the same token budget, a few complex +instructions outperform diverse yet simple instructions. (3) Curriculum +instruction tuning might not yield the anticipated results; focusing on +increasing complexity appears to be the key. + +
+
+
+
+
+ + ☆ Finding Already Debunked Narratives via Multistage Retrieval: Enabling + Cross-Lingual, Cross-Dataset and Zero-Shot Learning + + +
+ The task of retrieving already debunked narratives aims to detect stories +that have already been fact-checked. The successful detection of claims that +have already been debunked not only reduces the manual efforts of professional +fact-checkers but can also contribute to slowing the spread of misinformation. +Mainly due to the lack of readily available data, this is an understudied +problem, particularly when considering the cross-lingual task, i.e. the +retrieval of fact-checking articles in a language different from the language +of the online post being checked. This paper fills this gap by (i) creating a +novel dataset to enable research on cross-lingual retrieval of already debunked +narratives, using tweets as queries to a database of fact-checking articles; +(ii) presenting an extensive experiment to benchmark fine-tuned and +off-the-shelf multilingual pre-trained Transformer models for this task; and +(iii) proposing a novel multistage framework that divides this cross-lingual +debunk retrieval task into refinement and re-ranking stages. Results show that +the task of cross-lingual retrieval of already debunked narratives is +challenging and off-the-shelf Transformer models fail to outperform a strong +lexical-based baseline (BM25). Nevertheless, our multistage retrieval framework +is robust, outperforming BM25 in most scenarios and enabling cross-domain and +zero-shot learning, without significantly harming the model's performance. + +
+
+
+
+
+ + ☆ AST-MHSA : Code Summarization using Multi-Head Self-Attention + + +
+ Code summarization aims to generate concise natural language descriptions for +source code. The prevailing approaches adopt transformer-based encoder-decoder +architectures, where the Abstract Syntax Tree (AST) of the source code is +utilized for encoding structural information. However, ASTs are much longer +than the corresponding source code, and existing methods ignore this size +constraint by directly feeding the entire linearized AST into the encoders. +This simplistic approach makes it challenging to extract truly valuable +dependency relations from the overlong input sequence and leads to significant +computational overhead due to self-attention applied to all nodes in the AST. + To address this issue effectively and efficiently, we present a model, +AST-MHSA that uses multi-head attention to extract the important semantic +information from the AST. The model consists of two main components: an encoder +and a decoder. The encoder takes as input the abstract syntax tree (AST) of the +code and generates a sequence of hidden states. The decoder then takes these +hidden states as input and generates a natural language summary of the code. + The multi-head attention mechanism allows the model to learn different +representations of the input code, which can be combined to generate a more +comprehensive summary. The model is trained on a dataset of code and summaries, +and the parameters of the model are optimized to minimize the loss between the +generated summaries and the ground-truth summaries. + +
+
+
+
+
+ + ☆ IIHT: Medical Report Generation with Image-to-Indicator Hierarchical + Transformer + + +
+ Automated medical report generation has become increasingly important in +medical analysis. It can produce computer-aided diagnosis descriptions and thus +significantly alleviate the doctors' work. Inspired by the huge success of +neural machine translation and image captioning, various deep learning methods +have been proposed for medical report generation. However, due to the inherent +properties of medical data, including data imbalance and the length and +correlation between report sequences, the generated reports by existing methods +may exhibit linguistic fluency but lack adequate clinical accuracy. In this +work, we propose an image-to-indicator hierarchical transformer (IIHT) +framework for medical report generation. It consists of three modules, i.e., a +classifier module, an indicator expansion module and a generator module. The +classifier module first extracts image features from the input medical images +and produces disease-related indicators with their corresponding states. The +disease-related indicators are subsequently utilised as input for the indicator +expansion module, incorporating the "data-text-data" strategy. The +transformer-based generator then leverages these extracted features along with +image features as auxiliary information to generate final reports. Furthermore, +the proposed IIHT method is feasible for radiologists to modify disease +indicators in real-world scenarios and integrate the operations into the +indicator expansion module for fluent and accurate medical report generation. +Extensive experiments and comparisons with state-of-the-art methods under +various evaluation metrics demonstrate the great performance of the proposed +method. + +
+
+
+
+
+ + ☆ LASIGE and UNICAGE solution to the NASA LitCoin NLP Competition + + +
+ Biomedical Natural Language Processing (NLP) tends to become cumbersome for
+most researchers, frequently due to the amount and heterogeneity of text to be
+processed. To address this challenge, the industry is continuously developing
+highly efficient tools and creating more flexible engineering solutions. This
+work presents the integration between industry data engineering solutions for
+efficient data processing and academic systems developed for Named Entity
+Recognition (LasigeUnicage\_NER) and Relation Extraction (BiOnt). Our design
+reflects an integration of those components with external knowledge in the form
+of additional training data from other datasets and biomedical ontologies. We
+used this pipeline in the 2022 LitCoin NLP Challenge, where our team
+LasigeUnicage was awarded the 7th Prize out of approximately 200 participating
+teams, reflecting a successful collaboration between academia (LASIGE) and
+industry (Unicage). The software supporting this work is available at
+\url{https://github.com/lasigeBioTM/Litcoin-Lasige_Unicage}.
+
+
+
+
+
+ + ☆ You Only Prompt Once: On the Capabilities of Prompt Learning on Large + Language Models to Tackle Toxic Content + + +
+ The spread of toxic content online is an important problem that has adverse
+effects on user experience online and in our society at large. Motivated by the
+importance and impact of the problem, research focuses on developing solutions
+to detect toxic content, usually leveraging machine learning (ML) models
+trained on human-annotated datasets. While these efforts are important, these
+models usually do not generalize well and they cannot cope with new trends
+(e.g., the emergence of new toxic terms). Currently, we are witnessing a shift
+in the approach to tackling societal issues online, particularly leveraging
+large language models (LLMs) like GPT-3 or T5 that are trained on vast corpora
+and have strong generalizability. In this work, we investigate how we can use
+LLMs and prompt learning to tackle the problem of toxic content, particularly
+focusing on three tasks: 1) Toxicity Classification, 2) Toxic Span Detection,
+and 3) Detoxification. We perform an extensive evaluation over five model
+architectures and eight datasets demonstrating that LLMs with prompt learning
+can achieve similar or even better performance compared to models trained on
+these specific tasks. We find that prompt learning achieves around 10\%
+improvement in the toxicity classification task compared to the baselines,
+while for the toxic span detection task we find better performance than the
+best baseline (0.643 vs. 0.640 in terms of $F_1$-score). Finally, for the
+detoxification task, we find that prompt learning can successfully reduce the
+average toxicity score (from 0.775 to 0.213) while preserving semantic meaning.
+
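For concreteness, the snippet below builds the kind of zero-/few-shot prompt such a setup might use for the toxicity classification task; the wording of the template is an assumption and not the paper's exact prompt.

```python
def toxicity_prompt(text, examples=()):
    """Assemble a classification prompt, optionally with few-shot demonstrations."""
    lines = ["Decide whether the following comment is toxic. Answer 'toxic' or 'non-toxic'."]
    for ex_text, ex_label in examples:               # optional demonstrations
        lines.append(f"Comment: {ex_text}\nAnswer: {ex_label}")
    lines.append(f"Comment: {text}\nAnswer:")        # the instance to classify
    return "\n\n".join(lines)

print(toxicity_prompt("You are all wonderful people!"))
```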
+
+ comment: To Appear in the 45th IEEE Symposium on Security and Privacy, May + 20-23, 2024 +
+
+
+
+
+ + ☆ Do Language Models Refer? + + +
+ What do language models (LMs) do with language? Everyone agrees that they +produce sequences of (mostly) coherent sentences. But are they saying anything +with those strings or simply babbling in a convincing simulacrum of language +use? This is a vague question, and there are many ways of making it precise. +Here we will address one aspect of the question, namely, whether LMs' words +refer: that is, whether the outputs of LMs achieve "word-to-world" connections. +There is prima facie reason to think they do not since LMs do not interact with +the world in the way that ordinary language users do. Drawing on insights from +the externalist tradition in philosophy of language, we argue that appearances +are misleading and that there is good reason to think that LMs can refer. + +
+
+
+
+
+ + ☆ Exploring Linguistic Similarity and Zero-Shot Learning for Multilingual + Translation of Dravidian Languages + + +
+ Current research in zero-shot translation is plagued by several issues such
+as high compute requirements, increased training time and off-target
+translations. Proposed remedies often come at the cost of additional data or
+compute requirements. Pivot-based neural machine translation is preferred over
+a single-encoder model for most settings despite the increased training and
+evaluation time. In this work, we overcome the shortcomings of zero-shot
+translation by taking advantage of transliteration and linguistic similarity.
+We build a single encoder-decoder neural machine translation system for
+Dravidian-Dravidian multilingual translation and perform zero-shot translation.
+We compare the data vs zero-shot accuracy tradeoff and evaluate the performance
+of our vanilla method against the current state-of-the-art pivot-based method.
+We also test the theory that morphologically rich languages require large
+vocabularies by restricting the vocabulary using an optimal transport based
+technique. Our model manages to achieve scores within 3 BLEU of large-scale
+pivot-based models when it is trained on 50\% of the language directions.
+
+
+
+
+
+ + ☆ Bringing order into the realm of Transformer-based language models for + artificial intelligence and law + + +
+ Transformer-based language models (TLMs) have widely been recognized to be a +cutting-edge technology for the successful development of deep-learning-based +solutions to problems and applications that require natural language processing +and understanding. Like for other textual domains, TLMs have indeed pushed the +state-of-the-art of AI approaches for many tasks of interest in the legal +domain. Despite the first Transformer model being proposed about six years ago, +there has been a rapid progress of this technology at an unprecedented rate, +whereby BERT and related models represent a major reference, also in the legal +domain. This article provides the first systematic overview of TLM-based +methods for AI-driven problems and tasks in the legal sphere. A major goal is +to highlight research advances in this field so as to understand, on the one +hand, how the Transformers have contributed to the success of AI in supporting +legal processes, and on the other hand, what are the current limitations and +opportunities for further research development. + +
+
+ comment: Accepted for publication with Artificial Intelligence and Law, + Springer Nature +
+
+
+
+
+ + ☆ LLM As DBA + + +
+ Database administrators (DBAs) play a crucial role in managing, maintaining
+and optimizing a database system to ensure data availability, performance, and
+reliability. However, it is hard and tedious for DBAs to manage a large number
+of database instances (e.g., millions of instances on cloud databases).
+Recently, large language models (LLMs) have shown great potential to understand
+valuable documents and accordingly generate reasonable answers. Thus, we
+propose D-Bot, an LLM-based database administrator that can continuously
+acquire database maintenance experience from textual sources, and provide
+reasonable, well-founded, in-time diagnosis and optimization advice for target
+databases. This paper presents a revolutionary LLM-centric framework for
+database maintenance, including (i) database maintenance knowledge detection
+from documents and tools, (ii) tree of thought reasoning for root cause
+analysis, and (iii) collaborative diagnosis among multiple LLMs. Our
+preliminary experimental results show that D-Bot can efficiently and
+effectively diagnose the root causes, and our code is available at
+github.com/TsinghuaDatabaseGroup/DB-GPT.
+
+
+
+
+
+ + ☆ Exploring Machine Learning and Transformer-based Approaches for + Deceptive Text Classification: A Comparative Analysis + + +
+ Deceptive text classification is a critical task in natural language +processing that aims to identify deceptive or fraudulent content. This study +presents a comparative analysis of machine learning and transformer-based +approaches for deceptive text classification. We investigate the effectiveness +of traditional machine learning algorithms and state-of-the-art transformer +models, such as BERT, XLNET, DistilBERT, and RoBERTa, in detecting deceptive +text. A labeled dataset consisting of deceptive and non-deceptive texts is used +for training and evaluation purposes. Through extensive experimentation, we +compare the performance metrics, including accuracy, precision, recall, and F1 +score, of the different approaches. The results of this study shed light on the +strengths and limitations of machine learning and transformer-based methods for +deceptive text classification, enabling researchers and practitioners to make +informed decisions when dealing with deceptive content + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ WeaverBird: Empowering Financial Decision-Making with Large Language + Model, Knowledge Base, and Search Engine + + +
+ We present WeaverBird, an intelligent dialogue system designed specifically +for the finance domain. Our system harnesses a large language model of GPT +architecture that has been tuned using extensive corpora of finance-related +text. As a result, our system possesses the capability to understand complex +financial queries, such as "How should I manage my investments during +inflation?", and provide informed responses. Furthermore, our system +incorporates a local knowledge base and a search engine to retrieve relevant +information. The final responses are conditioned on the search results and +include proper citations to the sources, thus enjoying an enhanced credibility. +Through a range of finance-related questions, we have demonstrated the superior +performance of our system compared to other models. To experience our system +firsthand, users can interact with our live demo at +https://weaverbird.ttic.edu, as well as watch our 2-min video illustration at +https://www.youtube.com/watch?v=yofgeqnlrMc. + +
+
+
+
+
+ + ☆ Metacognitive Prompting Improves Understanding in Large Language Models + + +
+ In Large Language Models (LLMs), there have been consistent advancements in +task-specific performance, largely influenced by effective prompt design. While +recent research on prompting has enhanced the reasoning capabilities of LLMs, a +gap remains in further improving their understanding abilities. In this study, +we introduce metacognitive prompting (MP), a strategy inspired by human +introspective reasoning processes. Using MP, LLMs undergo a systematic series +of structured, self-aware evaluations, drawing on both their vast inherent +knowledge and new insights. Our experiments involve five prevalent LLMs: +Llama2, Vicuna, PaLM, GPT-3.5, and GPT-4, all of which span various general +natural language understanding (NLU) tasks from the GLUE and SuperGLUE +benchmarks. Results indicate that, although GPT-4 consistently excels in most +tasks, PaLM, when equipped with MP, approaches its performance level. +Furthermore, across models and datasets, MP consistently outperforms existing +prompting methods, including standard and chain-of-thought prompting. This +study underscores the potential to amplify the understanding abilities of LLMs +and highlights the benefits of mirroring human introspective reasoning in NLU +tasks. + +
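As an illustration of the idea, the snippet below assembles a prompt with staged self-evaluation; the stage wording is an assumption, not the paper's exact metacognitive prompting template.

```python
def metacognitive_prompt(task_instruction, text):
    """Build a prompt that asks the model to evaluate its own reasoning in stages."""
    stages = [
        "1. Restate the input in your own words.",
        "2. Make a preliminary judgement.",
        "3. Critically evaluate that judgement and note possible errors.",
        "4. Give your final answer.",
        "5. State your confidence in the final answer.",
    ]
    return f"{task_instruction}\n\nInput: {text}\n\n" + "\n".join(stages)

print(metacognitive_prompt("Decide if the two sentences are paraphrases.",
                           "A: The cat sat. B: A cat was sitting."))
```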
+
+ comment: 9 pages, in submission +
+
+
+
+
+ + ☆ Classification of Human- and AI-Generated Texts: Investigating Features + for ChatGPT + + +
+ Recently, generative AIs like ChatGPT have become available to the wide +public. These tools can for instance be used by students to generate essays or +whole theses. But how does a teacher know whether a text is written by a +student or an AI? In our work, we explore traditional and new features to (1) +detect text generated by AI from scratch and (2) text rephrased by AI. Since we +found that classification is more difficult when the AI has been instructed to +create the text in a way that a human would not recognize that it was generated +by an AI, we also investigate this more advanced case. For our experiments, we +produced a new text corpus covering 10 school topics. Our best systems to +classify basic and advanced human-generated/AI-generated texts have F1-scores +of over 96%. Our best systems for classifying basic and advanced +human-generated/AI-rephrased texts have F1-scores of more than 78%. The systems +use a combination of perplexity, semantic, list lookup, error-based, +readability, AI feedback, and text vector features. Our results show that the +new features substantially help to improve the performance of many classifiers. +Our best basic text rephrasing detection system even outperforms GPTZero by +183.8% relative in F1-score. + +
+
+
+
+
+ + ☆ Developing an Informal-Formal Persian Corpus + + +
+ Informal language is a style of spoken or written language frequently used in
+casual conversations, social media, weblogs, emails and text messages. In
+informal writing, the language undergoes some lexical and/or syntactic changes
+that vary among different languages. Persian is one of the languages with many
+differences between its formal and informal styles of writing, thus developing
+informal language processing tools, such as an informal-to-formal converter,
+seems necessary for this language. Such a converter needs a large aligned
+parallel corpus of colloquial-formal sentences, which can also be useful for
+linguists to extract a regulated grammar and orthography for colloquial Persian
+as is done for the formal language. In this paper we explain our methodology in
+building a parallel corpus of 50,000 sentence pairs with alignments at the
+word/phrase level. We attempted to cover almost all kinds of lexical and
+syntactic changes between informal and formal Persian; therefore, we both
+explored and collected sentences from different resources of informal scripts
+and followed the phonological and morphological patterns of change, in order to
+find as many instances as possible. The resulting corpus has about 530,000
+alignments and a dictionary containing 49,397 word and phrase pairs.
+
+
+ comment: 16 pages, 1 Figure and 3 tables +
+
+
+
+
+ + ☆ Few-Shot Data-to-Text Generation via Unified Representation and + Multi-Source Learning + + +
+ We present a novel approach for structured data-to-text generation that +addresses the limitations of existing methods that primarily focus on specific +types of structured data. Our proposed method aims to improve performance in +multi-task training, zero-shot and few-shot scenarios by providing a unified +representation that can handle various forms of structured data such as tables, +knowledge graph triples, and meaning representations. We demonstrate that our +proposed approach can effectively adapt to new structured forms, and can +improve performance in comparison to current methods. For example, our method +resulted in a 66% improvement in zero-shot BLEU scores when transferring models +trained on table inputs to a knowledge graph dataset. Our proposed method is an +important step towards a more general data-to-text generation framework. + +
+
+
+
+
+ + ☆ Investigating disaster response through social media data and the + Susceptible-Infected-Recovered (SIR) model: A case study of 2020 Western U.S. + wildfire season + + +
+ Effective disaster response is critical for affected communities. Responders
+and decision-makers would benefit from reliable, timely measures of the issues
+impacting their communities during a disaster, and social media offers a
+potentially rich data source. Social media can reflect public concerns and
+demands during a disaster, offering valuable insights for decision-makers to
+understand evolving situations and optimize resource allocation. We used
+Bidirectional Encoder Representations from Transformers (BERT) topic modeling
+to cluster topics from Twitter data. Then, we conducted a temporal-spatial
+analysis to examine the distribution of these topics across different regions
+during the 2020 western U.S. wildfire season. Our results show that Twitter
+users mainly focused on three topics: "health impact," "damage," and
+"evacuation." We used the Susceptible-Infected-Recovered (SIR) theory to
+explore the magnitude and velocity of topic diffusion on Twitter. The results
+displayed a clear relationship between topic trends and wildfire propagation
+patterns. The estimated parameters obtained from the SIR model in selected
+cities revealed that residents exhibited a high level of concern about several
+topics during the wildfire. Our study details how the SIR model and topic
+modeling using social media data can provide decision-makers with a
+quantitative approach to measure disaster response and support their
+decision-making processes.
+
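The standard SIR dynamics referred to above can be written as three coupled ODEs; the short simulation below, with assumed rates and horizon, shows the curves one would fit to the observed tweet volume of a topic.

```python
import numpy as np
from scipy.integrate import odeint

def sir(y, t, beta, gamma):
    s, i, r = y
    # susceptible -> infected at rate beta*s*i, infected -> recovered at rate gamma*i
    return [-beta * s * i, beta * s * i - gamma * i, gamma * i]

t = np.linspace(0, 60, 61)                 # days in the wildfire season (assumed horizon)
y0 = [0.99, 0.01, 0.0]                     # susceptible, infected, recovered fractions
beta, gamma = 0.4, 0.1                     # assumed transmission and recovery rates
s, i, r = odeint(sir, y0, t, args=(beta, gamma)).T
print(i.max())                             # peak share of users actively tweeting the topic
# In the study, beta and gamma would instead be estimated by fitting i(t) to the
# observed daily share of tweets on a topic (e.g. with scipy.optimize.curve_fit).
```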
+
+
+
+
+ + ☆ A Novel Self-training Approach for Low-resource Speech Recognition + + +
+ In this paper, we propose a self-training approach for automatic speech +recognition (ASR) for low-resource settings. While self-training approaches +have been extensively developed and evaluated for high-resource languages such +as English, their applications to low-resource languages like Punjabi have been +limited, despite the language being spoken by millions globally. The scarcity +of annotated data has hindered the development of accurate ASR systems, +especially for low-resource languages (e.g., Punjabi and M\=aori languages). To +address this issue, we propose an effective self-training approach that +generates highly accurate pseudo-labels for unlabeled low-resource speech. Our +experimental analysis demonstrates that our approach significantly improves +word error rate, achieving a relative improvement of 14.94% compared to a +baseline model across four real speech datasets. Further, our proposed approach +reports the best results on the Common Voice Punjabi dataset. + +
+
+ comment: Accepted to Interspeech 2023 +
+
+
+
+
+ + ♻ ☆ Synthesizing Mixed-type Electronic Health Records using Diffusion Models + + +
+ Electronic Health Records (EHRs) contain sensitive patient information, which +presents privacy concerns when sharing such data. Synthetic data generation is +a promising solution to mitigate these risks, often relying on deep generative +models such as Generative Adversarial Networks (GANs). However, recent studies +have shown that diffusion models offer several advantages over GANs, such as +generation of more realistic synthetic data and stable training in generating +data modalities, including image, text, and sound. In this work, we investigate +the potential of diffusion models for generating realistic mixed-type tabular +EHRs, comparing TabDDPM model with existing methods on four datasets in terms +of data quality, utility, privacy, and augmentation. Our experiments +demonstrate that TabDDPM outperforms the state-of-the-art models across all +evaluation metrics, except for privacy, which confirms the trade-off between +privacy and utility. + +
+
+ comment: Page 2, Figure 1 is updated +
+
+
+
+
+ + ♻ ☆ VT-CLIP: Enhancing Vision-Language Models with Visual-guided Texts + + +
+ Contrastive Language-Image Pre-training (CLIP) has drawn increasing attention +recently for its transferable visual representation learning. However, due to +the semantic gap within datasets, CLIP's pre-trained image-text alignment +becomes sub-optimal on downstream tasks, which severely harms its transferring +performance. To better adapt the cross-modality embedding space, we propose to +enhance CLIP via Visual-guided Texts, named VT-CLIP. Specifically, we guide +textual features of different categories to adaptively explore informative +regions on the image and aggregate visual features by attention mechanisms. In +this way, the texts become visual-guided, namely, more semantically correlated +with downstream images, which greatly benefits the category-wise matching +process. In few-shot settings, we evaluate our VT-CLIP on 11 well-known +classification datasets to demonstrate its effectiveness. + +
+
+
+
+
+ + ♻ ☆ GPT-4 Can't Reason + + +
+ GPT-4 was released in March 2023 to wide acclaim, marking a very substantial +improvement across the board over GPT-3.5 (OpenAI's previously best model, +which had powered the initial release of ChatGPT). However, despite the +genuinely impressive improvement, there are good reasons to be highly skeptical +of GPT-4's ability to reason. This position paper discusses the nature of +reasoning; criticizes the current formulation of reasoning problems in the NLP +community, as well as the way in which LLM reasoning performance is currently +evaluated; introduces a small collection of 21 diverse reasoning problems; and +performs a detailed qualitative evaluation of GPT-4's performance on those +problems. Based on this analysis, the paper concludes that, despite its +occasional flashes of analytical brilliance, GPT-4 at present is utterly +incapable of reasoning. + +
+
+
+
+
+ + ♻ ☆ Extending an Event-type Ontology: Adding Verbs and Classes Using + Fine-tuned LLMs Suggestions ACL 2023 + + +
+ In this project, we have investigated the use of advanced machine learning
+methods, specifically fine-tuned large language models, for pre-annotating data
+for a lexical extension task, namely adding descriptive words (verbs) to an
+existing (but incomplete, as of yet) ontology of event types. Several research
+questions have been focused on, from the investigation of possible heuristics
+to provide at least hints to annotators about which verbs to include and which
+are outside the current version of the ontology, to the possible use of the
+automatic scores to help the annotators to be more efficient in finding a
+threshold for identifying verbs that cannot be assigned to any existing class
+and are therefore to be used as seeds for a new class. We have also carefully
+examined the correlation of the automatic scores with the human annotation.
+While the correlation turned out to be strong, its influence on the annotation
+proper is modest due to its near linearity, even though the mere fact of such
+pre-annotation leads to relatively short annotation times.
+
+
+ comment: Published at LAW-XVII @ ACL 2023 +
+
+
+
+
+ + ♻ ☆ From Retrieval to Generation: Efficient and Effective Entity Set + Expansion + + +
+ Entity Set Expansion (ESE) is a critical task aiming to expand entities of +the target semantic class described by a small seed entity set. Most existing +ESE methods are retrieval-based frameworks that need to extract the contextual +features of entities and calculate the similarity between seed entities and +candidate entities. To achieve the two purposes, they should iteratively +traverse the corpus and the entity vocabulary provided in the datasets, +resulting in poor efficiency and scalability. The experimental results indicate +that the time consumed by the retrieval-based ESE methods increases linearly +with entity vocabulary and corpus size. In this paper, we firstly propose a +generative ESE framework, Generative Entity Set Expansion (GenExpan), which +utilizes a generative pre-trained language model to accomplish ESE task. +Specifically, a prefix tree is employed to guarantee the validity of entity +generation, and automatically generated class names are adopted to guide the +model to generate target entities. Moreover, we propose Knowledge Calibration +and Generative Ranking to further bridge the gap between generic knowledge of +the language model and the goal of ESE task. Experiments on publicly available +datasets show that GenExpan is efficient and effective. For efficiency, +expansion time consumed by GenExpan is independent of entity vocabulary and +corpus size, and GenExpan achieves an average 600% speedup compared to strong +baselines. For expansion performance, our framework outperforms previous +state-of-the-art ESE methods. + +
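A generic sketch of the prefix-tree-constrained decoding idea described above. Everything here is an illustrative assumption rather than the paper's setup: the `gpt2` checkpoint, the toy entity list, and the prompt are placeholders; the sketch only shows how a token-level trie combined with Hugging Face's `prefix_allowed_tokens_fn` restricts generation to valid entity strings.

```python
# Trie-constrained generation sketch (GenExpan-style entity decoding).
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Build a token-level prefix tree (trie) over the allowed entity strings.
entities = ["Canada", "France", "Germany"]          # hypothetical candidate entities
trie = {}
for ent in entities:
    ids = tokenizer.encode(" " + ent) + [tokenizer.eos_token_id]
    node = trie
    for tok in ids:
        node = node.setdefault(tok, {})

prompt = "Countries such as United States, Japan and"   # hypothetical prompt
prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
prompt_len = prompt_ids.shape[1]

def allowed_tokens(batch_id, input_ids):
    """Restrict decoding to paths that exist in the entity trie."""
    node = trie
    for tok in input_ids[prompt_len:].tolist():
        node = node.get(tok, {})
    return list(node.keys()) or [tokenizer.eos_token_id]

out = model.generate(prompt_ids, max_new_tokens=8,
                     prefix_allowed_tokens_fn=allowed_tokens)
print(tokenizer.decode(out[0, prompt_len:], skip_special_tokens=True))
```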
+
+
+
+
+ + ♻ ☆ Strahler Number of Natural Language Sentences in Comparison with Random + Trees + + +
+ The Strahler number was originally proposed to characterize the complexity of +river bifurcation and has found various applications. This article proposes +computation of the Strahler number's upper and lower limits for natural +language sentence tree structures. Through empirical measurements across +grammatically annotated data, the Strahler number of natural language sentences +is shown to be almost 3 or 4, similarly to the case of river bifurcation as +reported by Strahler (1957). From the theory behind the number, we show that it +is one kind of lower limit on the amount of memory required to process +sentences. We consider the Strahler number to provide reasoning that explains +reports showing that the number of required memory areas to process sentences +is 3 to 4 for parsing (Abney and Johnson, 1991; Schuler et al., 2010), and +reports indicating a psychological "magical number" of 3 to 5 (Cowan, 2001). An +analytical and empirical analysis shows that the Strahler number is not +constant but grows logarithmically; therefore, the Strahler number of sentences +derives from the range of sentence lengths. Furthermore, the Strahler number is +not different for random trees, which could suggest that its origin is not +specific to natural language. + +
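For readers unfamiliar with the measure, below is a minimal sketch of computing the Strahler number of a generic tree; the toy tree is illustrative, while the paper applies the measure to syntactic trees of sentences.

```python
# Strahler number of an n-ary tree, computed bottom-up.
def strahler(children):
    """children: list of subtrees, each itself a list (a leaf is an empty list)."""
    if not children:
        return 1
    orders = sorted((strahler(c) for c in children), reverse=True)
    # If the two largest child orders are equal, the order increases by one.
    if len(orders) >= 2 and orders[0] == orders[1]:
        return orders[0] + 1
    return orders[0]

# A small example: a root with two children, one of which has two leaf children.
tree = [[[], []], []]
print(strahler(tree))  # -> 2
```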
+
+
+
+
+ + ♻ ☆ Domain Mastery Benchmark: An Ever-Updating Benchmark for Evaluating + Holistic Domain Knowledge of Large Language Model--A Preliminary Release + + +
+ Domain knowledge refers to the in-depth understanding, expertise, and familiarity with a specific subject, industry, field, or area of special interest. Existing benchmarks all lack an overall design for domain knowledge evaluation. Believing that the real ability of domain language understanding can only be fairly evaluated by a comprehensive and in-depth benchmark, we introduce DomMa, a Domain Mastery Benchmark. DomMa targets testing Large Language Models (LLMs) on their domain knowledge understanding; it features extensive domain coverage, large data volume, and a continually updated data set based on the 112 Chinese first-level subject classifications. DomMa consists of 100,000 questions in both Chinese and English, sourced from graduate entrance examinations and undergraduate exams at Chinese colleges. We also propose designs to make the benchmark and evaluation process more suitable for LLMs. + +
+
+ comment: The paper has been updated, but we mistakenly submitted it as a new arXiv paper instead of replacing this one; the new version is at arXiv:2306.05783 +
+
+
+
+
+ + ♻ ☆ Progressive-Hint Prompting Improves Reasoning in Large Language Models + + +
+ The performance of Large Language Models (LLMs) in reasoning tasks depends +heavily on prompt design, with Chain-of-Thought (CoT) and self-consistency +being critical methods that enhance this ability. However, these methods do not +fully exploit the answers generated by the LLM to guide subsequent responses. +This paper proposes a new prompting method, named Progressive-Hint Prompting +(PHP), that enables automatic multiple interactions between users and LLMs by +using previously generated answers as hints to progressively guide toward the +correct answers. PHP is orthogonal to CoT and self-consistency, making it easy +to combine with state-of-the-art techniques to further improve performance. We +conducted extensive and comprehensive experiments on seven benchmarks. The +results show that PHP significantly improves accuracy while remaining highly +efficient. For instance, with text-davinci-003, we observed a 4.2% improvement +on GSM8K with greedy decoding compared to Complex CoT, and a 46.17% reduction +in sample paths with self-consistency. With GPT-4 and PHP, we achieve +state-of-the-art performances on SVAMP (89.1% -> 91.9%), GSM8K (92% -> 95.5%), +AQuA (76.4% -> 79.9%) and MATH (50.3% -> 53.9%). + +
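A hedged sketch of the progressive-hint loop described above: previously generated answers are fed back into the question as hints until two consecutive answers agree. The `ask_llm` callable and the hint wording are illustrative assumptions, not the paper's exact prompts.

```python
# Progressive-Hint Prompting (PHP) style interaction loop (illustrative sketch).
from typing import Callable, List

def progressive_hint(question: str, ask_llm: Callable[[str], str],
                     max_rounds: int = 5) -> str:
    """Ask repeatedly, appending previous answers as hints, until answers agree."""
    hints: List[str] = []
    answer = ""
    for _ in range(max_rounds):
        hint = f" (Hint: the answer is near {', '.join(hints)}.)" if hints else ""
        new_answer = ask_llm(question + hint)
        if hints and new_answer == answer:   # two consecutive rounds agree -> stop
            return new_answer
        answer = new_answer
        hints.append(answer)
    return answer
```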
+
+ comment: Tech Report +
+
+
+
+
+ + ♻ ☆ Towards Multiple References Era -- Addressing Data Leakage and Limited + Reference Diversity in NLG Evaluation + + +
+ N-gram matching-based evaluation metrics, such as BLEU and chrF, are widely +utilized across a range of natural language generation (NLG) tasks. However, +recent studies have revealed a weak correlation between these matching-based +metrics and human evaluations, especially when compared with neural-based +metrics like BLEURT. In this paper, we conjecture that the performance +bottleneck in matching-based metrics may be caused by the limited diversity of +references. To address this issue, we propose to utilize \textit{multiple +references} to enhance the consistency between these metrics and human +evaluations. Within the WMT Metrics benchmarks, we observe that the +multi-references F200spBLEU surpasses the conventional single-reference one by +an accuracy improvement of 7.2\%. Remarkably, it also exceeds the neural-based +BERTscore by an accuracy enhancement of 3.9\%. Moreover, we observe that the +data leakage issue in large language models (LLMs) can be mitigated to a large +extent by our multi-reference metric. We release the code and data at +\url{https://github.com/SefaZeng/LLM-Ref} + +
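A small illustration of single- versus multi-reference scoring with sacrebleu. The paper's F200spBLEU additionally uses the FLORES-200 sentencepiece tokenizer, which is omitted here for simplicity; the sentences are toy examples.

```python
# Single-reference vs. multi-reference BLEU with sacrebleu.
import sacrebleu

hyps = ["the cat sat on the mat"]
refs = [
    ["the cat sat on the mat"],       # reference set 1, aligned with hyps
    ["a cat was sitting on the mat"], # reference set 2, aligned with hyps
]

single_ref = sacrebleu.corpus_bleu(hyps, refs[:1])
multi_ref = sacrebleu.corpus_bleu(hyps, refs)
print(f"1 ref:  {single_ref.score:.1f}")
print(f"2 refs: {multi_ref.score:.1f}")
```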
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ A Compact End-to-End Model with Local and Global Context for Spoken + Language Identification INTERSPEECH 2023 + + +
+ We introduce TitaNet-LID, a compact end-to-end neural network for Spoken +Language Identification (LID) that is based on the ContextNet architecture. +TitaNet-LID employs 1D depth-wise separable convolutions and +Squeeze-and-Excitation layers to effectively capture local and global context +within an utterance. Despite its small size, TitaNet-LID achieves performance +similar to state-of-the-art models on the VoxLingua107 dataset while being 10 +times smaller. Furthermore, it can be easily adapted to new acoustic conditions +and unseen languages through simple fine-tuning, achieving a state-of-the-art +accuracy of 88.2% on the FLEURS benchmark. Our model is scalable and can +achieve a better trade-off between accuracy and speed. TitaNet-LID performs +well even on short utterances less than 5s in length, indicating its robustness +to input length. + +
+
+ comment: Accepted to INTERSPEECH 2023 +
+
+
+
+
+ + ♻ ☆ Predicting Perfect Quality Segments in MT Output with Fine-Tuned OpenAI + LLM: Is it possible to capture editing distance patterns from historical + data? + + +
+ Translation Quality Estimation (TQE) is an important step before deploying an output translation into use. TQE is also critical in assessing machine translation (MT) and human translation (HT) quality without seeing the reference translations. In this work, we examine whether state-of-the-art large language models (LLMs) can be fine-tuned for the TQE task and how capable they are. We take ChatGPT as one example and approach TQE as a binary classification task. Using English to Italian, German, French, Japanese, Dutch, Portuguese, Turkish, and Chinese training corpora, our experimental results show that ChatGPT fine-tuned via its API can achieve a relatively high score on predicting translation quality, i.e. whether the translation needs to be edited, though there remains considerable room to improve the accuracy. An English-Italian bilingual abstract is available in the paper. + +
+
+ comment: 7 pages, 11 figures, under-review to ItalianNLP-2023 +
+
+
+
+
+ + ♻ ☆ Which Features are Learned by CodeBert: An Empirical Study of the + BERT-based Source Code Representation Learning + + +
+ Bidirectional Encoder Representations from Transformers (BERT) were proposed for natural language processing (NLP) and have shown promising results. Recently, researchers have applied BERT to source-code representation learning and reported encouraging results on several downstream tasks. However, in this paper, we show that current methods cannot effectively understand the logic of source code: the learned representations rely heavily on programmer-defined variable and function names. We design and implement a set of experiments to demonstrate our conjecture and provide insights for future work. + +
+
+ comment: 1 table, 2 figures +
+
+
+
+
+ + ♻ ☆ Causality Guided Disentanglement for Cross-Platform Hate Speech + Detection + + +
+ Social media platforms, despite their value in promoting open discourse, are +often exploited to spread harmful content. Current deep learning and natural +language processing models used for detecting this harmful content overly rely +on domain-specific terms affecting their capabilities to adapt to generalizable +hate speech detection. This is because they tend to focus too narrowly on +particular linguistic signals or the use of certain categories of words. +Another significant challenge arises when platforms lack high-quality annotated +data for training, leading to a need for cross-platform models that can adapt +to different distribution shifts. Our research introduces a cross-platform hate +speech detection model capable of being trained on one platform's data and +generalizing to multiple unseen platforms. To achieve good generalizability +across platforms, one way is to disentangle the input representations into +invariant and platform-dependent features. We also argue that learning causal +relationships, which remain constant across diverse environments, can +significantly aid in understanding invariant representations in hate speech. By +disentangling input into platform-dependent features (useful for predicting +hate targets) and platform-independent features (used to predict the presence +of hate), we learn invariant representations resistant to distribution shifts. +These features are then used to predict hate speech across unseen platforms. +Our extensive experiments across four platforms highlight our model's enhanced +efficacy compared to existing state-of-the-art methods in detecting generalized +hate speech. + +
+
+
+
+
+ + ♻ ☆ There is more than one kind of robustness: Fooling Whisper with + adversarial examples + + +
+ Whisper is a recent Automatic Speech Recognition (ASR) model displaying impressive robustness to both out-of-distribution inputs and random noise. In this work, we show that this robustness does not carry over to adversarial noise. We show that we can degrade Whisper performance dramatically, or even transcribe a target sentence of our choice, by generating very small input perturbations with a Signal-to-Noise Ratio of 35-45 dB. We also show that by fooling the Whisper language detector we can very easily degrade the performance of multilingual models. These vulnerabilities of a widely popular open-source model have practical security implications and emphasize the need for adversarially robust ASR. + +
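A minimal sketch of the SNR constraint mentioned above: the perturbation is rescaled so that it sits at a chosen Signal-to-Noise Ratio relative to the clean audio. The perturbation here is random noise purely for illustration; the actual attack would obtain it from gradients of the ASR loss.

```python
# Rescale a perturbation to a target SNR (in dB) relative to the clean signal.
import numpy as np

def scale_to_snr(signal: np.ndarray, noise: np.ndarray, snr_db: float) -> np.ndarray:
    """Return noise rescaled so that 10*log10(P_signal / P_noise) == snr_db."""
    p_signal = np.mean(signal ** 2)
    p_noise = np.mean(noise ** 2)
    target_p_noise = p_signal / (10 ** (snr_db / 10))
    return noise * np.sqrt(target_p_noise / p_noise)

audio = np.random.randn(16000).astype(np.float32)   # 1 s of fake 16 kHz audio
delta = np.random.randn(16000).astype(np.float32)   # placeholder perturbation
delta = scale_to_snr(audio, delta, snr_db=40.0)
adversarial = audio + delta
```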
+
+ comment: Accepted at InterSpeech 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 124 + +
+
+
+ + ☆ Iterative Reweighted Least Squares Networks With Convergence Guarantees + for Solving Inverse Imaging Problems + + +
+ In this work we present a novel optimization strategy for image +reconstruction tasks under analysis-based image regularization, which promotes +sparse and/or low-rank solutions in some learned transform domain. We +parameterize such regularizers using potential functions that correspond to +weighted extensions of the $\ell_p^p$-vector and $\mathcal{S}_p^p$ +Schatten-matrix quasi-norms with $0 < p \le 1$. Our proposed minimization +strategy extends the Iteratively Reweighted Least Squares (IRLS) method, +typically used for synthesis-based $\ell_p$ and $\mathcal{S}_p$ norm and +analysis-based $\ell_1$ and nuclear norm regularization. We prove that under +mild conditions our minimization algorithm converges linearly to a stationary +point, and we provide an upper bound for its convergence rate. Further, to +select the parameters of the regularizers that deliver the best results for the +problem at hand, we propose to learn them from training data by formulating the +supervised learning process as a stochastic bilevel optimization problem. We +show that thanks to the convergence guarantees of our proposed minimization +strategy, such optimization can be successfully performed with a +memory-efficient implicit back-propagation scheme. We implement our learned +IRLS variants as recurrent networks and assess their performance on the +challenging image reconstruction tasks of non-blind deblurring, +super-resolution and demosaicking. The comparisons against other existing +learned reconstruction approaches demonstrate that our overall method is very +competitive and in many cases outperforms existing unrolled networks, whose +number of parameters is orders of magnitude higher than in our case. + +
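As background for the method above, here is a textbook IRLS iteration for an analysis-sparse least-squares problem with an $\ell_p^p$-type penalty; the toy operator and data are illustrative, and the paper's weighted extensions, convergence analysis, and learned parameters are not reproduced.

```python
# Plain IRLS for  min_x 0.5*||Ax - y||^2 + lam * sum_i (|(Lx)_i|^2 + eps)^(p/2).
import numpy as np

def irls(A, y, L, lam=0.1, p=0.8, eps=1e-6, iters=50):
    x = np.linalg.lstsq(A, y, rcond=None)[0]
    for _ in range(iters):
        w = (np.abs(L @ x) ** 2 + eps) ** (p / 2 - 1)   # per-coefficient weights
        # Solve the reweighted normal equations.
        H = A.T @ A + lam * p * (L.T * w) @ L
        x = np.linalg.solve(H, A.T @ y)
    return x

rng = np.random.default_rng(0)
A = rng.standard_normal((40, 20))
L = np.diff(np.eye(20), axis=0)                 # finite-difference analysis operator
x_true = np.zeros(20); x_true[5:10] = 1.0       # piecewise-constant ground truth
y = A @ x_true + 0.01 * rng.standard_normal(40)
print(np.round(irls(A, y, L), 2))
```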
+
+ comment: arXiv admin note: text overlap with arXiv:2304.10536 +
+
+
+
+
+ + ☆ PlankAssembly: Robust 3D Reconstruction from Three Orthographic Views + with Learnt Shape Programs ICCV 2023 + + +
+ In this paper, we develop a new method to automatically convert 2D line +drawings from three orthographic views into 3D CAD models. Existing methods for +this problem reconstruct 3D models by back-projecting the 2D observations into +3D space while maintaining explicit correspondence between the input and +output. Such methods are sensitive to errors and noises in the input, thus +often fail in practice where the input drawings created by human designers are +imperfect. To overcome this difficulty, we leverage the attention mechanism in +a Transformer-based sequence generation model to learn flexible mappings +between the input and output. Further, we design shape programs which are +suitable for generating the objects of interest to boost the reconstruction +accuracy and facilitate CAD modeling applications. Experiments on a new +benchmark dataset show that our method significantly outperforms existing ones +when the inputs are noisy or incomplete. + +
+
+ comment: To Appear in ICCV 2023. The first three authors contributed equally + to this work. The project page is at + https://manycore-research.github.io/PlankAssembly +
+
+
+
+
+ + ☆ Neural Progressive Meshes SIGGRAPH 2023 + + +
+ The recent proliferation of 3D content that can be consumed on hand-held +devices necessitates efficient tools for transmitting large geometric data, +e.g., 3D meshes, over the Internet. Detailed high-resolution assets can pose a +challenge to storage as well as transmission bandwidth, and level-of-detail +techniques are often used to transmit an asset using an appropriate bandwidth +budget. It is especially desirable for these methods to transmit data +progressively, improving the quality of the geometry with more data. Our key +insight is that the geometric details of 3D meshes often exhibit similar local +patterns even across different shapes, and thus can be effectively represented +with a shared learned generative space. We learn this space using a +subdivision-based encoder-decoder architecture trained in advance on a large +collection of surfaces. We further observe that additional residual features +can be transmitted progressively between intermediate levels of subdivision +that enable the client to control the tradeoff between bandwidth cost and +quality of reconstruction, providing a neural progressive mesh representation. +We evaluate our method on a diverse set of complex 3D shapes and demonstrate +that it outperforms baselines in terms of compression ratio and reconstruction +quality. + +
+
+ comment: SIGGRAPH 2023 +
+
+
+
+
+ + ☆ Zero Grads Ever Given: Learning Local Surrogate Losses for + Non-Differentiable Graphics + + +
+ Gradient-based optimization is now ubiquitous across graphics, but +unfortunately can not be applied to problems with undefined or zero gradients. +To circumvent this issue, the loss function can be manually replaced by a +"surrogate" that has similar minima but is differentiable. Our proposed +framework, ZeroGrads, automates this process by learning a neural approximation +of the objective function, the surrogate, which in turn can be used to +differentiate through arbitrary black-box graphics pipelines. We train the +surrogate on an actively smoothed version of the objective and encourage +locality, focusing the surrogate's capacity on what matters at the current +training episode. The fitting is performed online, alongside the parameter +optimization, and self-supervised, without pre-computed data or pre-trained +models. As sampling the objective is expensive (it requires a full rendering or +simulator run), we devise an efficient sampling scheme that allows for +tractable run-times and competitive performance at little overhead. We +demonstrate optimizing diverse non-convex, non-differentiable black-box +problems in graphics, such as visibility in rendering, discrete parameter +spaces in procedural modelling or optimal control in physics-driven animation. +In contrast to more traditional algorithms, our approach scales well to higher +dimensions, which we demonstrate on problems with up to 35k interlinked +variables. + +
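A toy sketch of the surrogate idea (not the paper's implementation): a small MLP is fitted online to noisy local samples of a non-differentiable black-box objective, and the parameters are then updated through the surrogate's gradient. The black-box function, sampling radius, and optimizer settings are illustrative assumptions.

```python
# Learn a local neural surrogate of a non-differentiable objective and
# optimize the parameters through it.
import torch

def black_box(theta):                            # non-differentiable objective
    return torch.floor(10 * (theta - 0.3) ** 2).sum()

theta = torch.zeros(2, requires_grad=True)
surrogate = torch.nn.Sequential(torch.nn.Linear(2, 64), torch.nn.ReLU(),
                                torch.nn.Linear(64, 1))
opt_theta = torch.optim.Adam([theta], lr=1e-2)
opt_surr = torch.optim.Adam(surrogate.parameters(), lr=1e-3)

for step in range(2000):
    # 1) Fit the surrogate locally around the current parameters.
    samples = theta.detach() + 0.1 * torch.randn(32, 2)
    values = torch.stack([black_box(s) for s in samples]).unsqueeze(1)
    opt_surr.zero_grad()
    torch.nn.functional.mse_loss(surrogate(samples), values).backward()
    opt_surr.step()
    # 2) Step the parameters using the surrogate's gradient.
    opt_theta.zero_grad()
    surrogate(theta.unsqueeze(0)).sum().backward()
    opt_theta.step()

print(theta.detach())   # should drift roughly toward the minimum near 0.3
```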
+
+
+
+
+ + ☆ Follow Anything: Open-set detection, tracking, and following in + real-time + + +
+ Tracking and following objects of interest is critical to several robotics use cases, ranging from industrial automation to logistics and warehousing, to healthcare and security. In this paper, we present a robotic system to detect, track, and follow any object in real-time. Our approach, dubbed ``follow anything'' (FAn), is an open-vocabulary and multimodal model -- it is not restricted to concepts seen at training time and can be applied to novel classes at inference time using text, images, or click queries. Leveraging rich visual descriptors from large-scale pre-trained models (foundation models), FAn can detect and segment objects by matching multimodal queries (text, images, clicks) against an input image sequence. These detected and segmented objects are tracked across image frames, all while accounting for occlusion and object re-emergence. We demonstrate FAn on a real-world robotic system (a micro aerial vehicle) and report its ability to seamlessly follow the objects of interest in a real-time control loop. FAn can be deployed on a laptop with a lightweight (6-8 GB) graphics card, achieving a throughput of 6-20 frames per second. To enable rapid adoption, deployment, and extensibility, we open-source all our code on our project webpage at https://github.com/alaamaalouf/FollowAnything . We also encourage the reader to watch our 5-minute explainer video at https://www.youtube.com/watch?v=6Mgt3EPytrw . + +
+
+ comment: Project webpage: https://github.com/alaamaalouf/FollowAnything + Explainer video: https://www.youtube.com/watch?v=6Mgt3EPytrw +
+
+
+
+
+ + ☆ MapTRv2: An End-to-End Framework for Online Vectorized HD Map + Construction + + +
+ High-definition (HD) maps provide abundant and precise static environmental information of the driving scene, serving as a fundamental and indispensable component for planning in autonomous driving systems. In this paper, we present \textbf{Map} \textbf{TR}ansformer, an end-to-end framework for online vectorized HD map construction. We propose a unified permutation-equivalent modeling approach, i.e., modeling each map element as a point set with a group of equivalent permutations, which accurately describes the shape of the map element and stabilizes the learning process. We design a hierarchical query embedding scheme to flexibly encode structured map information and perform hierarchical bipartite matching for map element learning. To speed up convergence, we further introduce auxiliary one-to-many matching and dense supervision. The proposed method copes well with various map elements of arbitrary shapes. It runs at real-time inference speed and achieves state-of-the-art performance on both the nuScenes and Argoverse2 datasets. Abundant qualitative results show stable and robust map construction quality in complex and varied driving scenes. Code and more demos are available at \url{https://github.com/hustvl/MapTR} to facilitate further studies and applications. + +
+
+ comment: Code available at https://github.com/hustvl/MapTR . arXiv admin note: + substantial text overlap with arXiv:2208.14437 +
+
+
+
+
+ + ☆ FrozenRecon: Pose-free 3D Scene Reconstruction with Frozen Depth Models ICCV 2023 + + +
+ 3D scene reconstruction is a long-standing vision task. Existing approaches +can be categorized into geometry-based and learning-based methods. The former +leverages multi-view geometry but can face catastrophic failures due to the +reliance on accurate pixel correspondence across views. The latter was +proffered to mitigate these issues by learning 2D or 3D representation +directly. However, without a large-scale video or 3D training data, it can +hardly generalize to diverse real-world scenarios due to the presence of tens +of millions or even billions of optimization parameters in the deep network. +Recently, robust monocular depth estimation models trained with large-scale +datasets have been proven to possess weak 3D geometry prior, but they are +insufficient for reconstruction due to the unknown camera parameters, the +affine-invariant property, and inter-frame inconsistency. Here, we propose a +novel test-time optimization approach that can transfer the robustness of +affine-invariant depth models such as LeReS to challenging diverse scenes while +ensuring inter-frame consistency, with only dozens of parameters to optimize +per video frame. Specifically, our approach involves freezing the pre-trained +affine-invariant depth model's depth predictions, rectifying them by optimizing +the unknown scale-shift values with a geometric consistency alignment module, +and employing the resulting scale-consistent depth maps to robustly obtain +camera poses and achieve dense scene reconstruction, even in low-texture +regions. Experiments show that our method achieves state-of-the-art +cross-dataset reconstruction on five zero-shot testing datasets. + +
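A toy sketch of the scale-shift rectification idea: a frozen affine-invariant depth prediction d is mapped to s*d + t, and only the two scalars are optimized against a consistency target. The target and loss below are stand-ins; the paper uses a geometric consistency alignment module across video frames.

```python
# Optimize only a per-frame scale and shift on top of frozen depth predictions.
import torch

depth_pred = torch.rand(1, 1, 120, 160)          # frozen network output (fake)
depth_ref = 2.0 * depth_pred + 0.5               # pretend "consistent" target

scale = torch.nn.Parameter(torch.ones(1))
shift = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.Adam([scale, shift], lr=1e-2)

for step in range(500):
    opt.zero_grad()
    aligned = scale * depth_pred + shift
    loss = torch.mean((aligned - depth_ref) ** 2)  # placeholder consistency loss
    loss.backward()
    opt.step()

print(scale.item(), shift.item())                  # approximately 2.0 and 0.5
```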
+
+ comment: Accepted to ICCV 2023. Project webpage is at: + https://aim-uofa.github.io/FrozenRecon/ +
+
+
+
+
+ + ☆ Rethinking Integration of Prediction and Planning in Deep Learning-Based + Automated Driving Systems: A Review + + +
+ Automated driving has the potential to revolutionize personal, public, and +freight mobility. Besides the enormous challenge of perception, i.e. accurately +perceiving the environment using available sensor data, automated driving +comprises planning a safe, comfortable, and efficient motion trajectory. To +promote safety and progress, many works rely on modules that predict the future +motion of surrounding traffic. Modular automated driving systems commonly +handle prediction and planning as sequential separate tasks. While this +accounts for the influence of surrounding traffic on the ego-vehicle, it fails +to anticipate the reactions of traffic participants to the ego-vehicle's +behavior. Recent works suggest that integrating prediction and planning in an +interdependent joint step is necessary to achieve safe, efficient, and +comfortable driving. While various models implement such integrated systems, a +comprehensive overview and theoretical understanding of different principles +are lacking. We systematically review state-of-the-art deep learning-based +prediction, planning, and integrated prediction and planning models. Different +facets of the integration ranging from model architecture and model design to +behavioral aspects are considered and related to each other. Moreover, we +discuss the implications, strengths, and limitations of different integration +methods. By pointing out research gaps, describing relevant future challenges, +and highlighting trends in the research field, we identify promising directions +for future research. + +
+
+
+
+
+ + ☆ Deformable Mixer Transformer with Gating for Multi-Task Learning of + Dense Prediction AAAI 2023 + + +
+ CNNs and Transformers have their own advantages and both have been widely used for dense prediction in multi-task learning (MTL). Most of the current studies on MTL solely rely on CNN or Transformer. In this work, we present a novel MTL model by combining the merits of deformable CNN and query-based Transformer with shared gating for multi-task learning of dense prediction. This combination may offer a simple and efficient solution owing to its powerful and flexible task-specific learning and its advantages of lower cost, less complexity and fewer parameters than traditional MTL methods. We introduce the deformable mixer Transformer with gating (DeMTG), a simple and effective up-to-date encoder-decoder architecture that incorporates convolution and attention mechanisms in a unified network for MTL. It is carefully designed to exploit the advantages of each block and to provide deformable and comprehensive features for all tasks from both local and global perspectives. First, the deformable mixer encoder contains two types of operators: the channel-aware mixing operator leveraged to allow communication among different channels, and the spatial-aware deformable operator with deformable convolution applied to efficiently sample more informative spatial locations. Second, the task-aware gating transformer decoder is used to perform the task-specific predictions, in which a task interaction block integrated with self-attention is applied to capture task interaction features, and a task query block integrated with gating attention is leveraged to select the corresponding task-specific features. Further, the experimental results demonstrate that the proposed DeMTG uses fewer GFLOPs and significantly outperforms current Transformer-based and CNN-based competitive models on a variety of metrics on three dense prediction datasets. Our code and models are available at https://github.com/yangyangxu0/DeMTG. + +
+
+ comment: Comments: submitted to IJCV; an extension to our previous AAAI 2023 + paper arXiv:2301.03461 +
+
+
+
+
+ + ☆ Shadow Datasets, New challenging datasets for Causal Representation + Learning + + +
+ Discovering causal relations among semantic factors is an emerging topic in representation learning. Most causal representation learning (CRL) methods are fully supervised, which is impractical due to costly labeling. To resolve this restriction, weakly supervised CRL methods were introduced. To evaluate CRL performance, four existing datasets, Pendulum, Flow, CelebA(BEARD) and CelebA(SMILE), are utilized. However, existing CRL datasets are limited to simple graphs with few generative factors. Thus we propose two new datasets with a larger number of diverse generative factors and more sophisticated causal graphs. In addition, for the current real datasets, CelebA(BEARD) and CelebA(SMILE), the originally proposed causal graphs are not aligned with the dataset distributions; we therefore propose modifications to them. + +
+
+
+
+
+ + ☆ Masked Diffusion as Self-supervised Representation Learner + + +
+ Denoising diffusion probabilistic models have recently demonstrated +state-of-the-art generative performance and been used as strong pixel-level +representation learners. This paper decomposes the interrelation between the +generative capability and representation learning ability inherent in diffusion +models. We present masked diffusion model (MDM), a scalable self-supervised +representation learner that substitutes the conventional additive Gaussian +noise of traditional diffusion with a masking mechanism. Our proposed approach +convincingly surpasses prior benchmarks, demonstrating remarkable advancements +in both medical and natural image semantic segmentation tasks, particularly +within the context of few-shot scenario. + +
+
+
+
+
+ + ☆ Hard No-Box Adversarial Attack on Skeleton-Based Human Action + Recognition with Skeleton-Motion-Informed Gradient + + +
+ Recently, methods for skeleton-based human activity recognition have been +shown to be vulnerable to adversarial attacks. However, these attack methods +require either the full knowledge of the victim (i.e. white-box attacks), +access to training data (i.e. transfer-based attacks) or frequent model queries +(i.e. black-box attacks). All their requirements are highly restrictive, +raising the question of how detrimental the vulnerability is. In this paper, we +show that the vulnerability indeed exists. To this end, we consider a new +attack task: the attacker has no access to the victim model or the training +data or labels, where we coin the term hard no-box attack. Specifically, we +first learn a motion manifold where we define an adversarial loss to compute a +new gradient for the attack, named skeleton-motion-informed (SMI) gradient. Our +gradient contains information of the motion dynamics, which is different from +existing gradient-based attack methods that compute the loss gradient assuming +each dimension in the data is independent. The SMI gradient can augment many +gradient-based attack methods, leading to a new family of no-box attack +methods. Extensive evaluation and comparison show that our method imposes a +real threat to existing classifiers. They also show that the SMI gradient +improves the transferability and imperceptibility of adversarial samples in +both no-box and transfer-based black-box settings. + +
+
+
+
+
+ + ☆ 2D3D-MATR: 2D-3D Matching Transformer for Detection-free Registration + between Images and Point Clouds ICCV 2023 + + +
+ The commonly adopted detect-then-match approach to registration finds +difficulties in the cross-modality cases due to the incompatible keypoint +detection and inconsistent feature description. We propose, 2D3D-MATR, a +detection-free method for accurate and robust registration between images and +point clouds. Our method adopts a coarse-to-fine pipeline where it first +computes coarse correspondences between downsampled patches of the input image +and the point cloud and then extends them to form dense correspondences between +pixels and points within the patch region. The coarse-level patch matching is +based on transformer which jointly learns global contextual constraints with +self-attention and cross-modality correlations with cross-attention. To resolve +the scale ambiguity in patch matching, we construct a multi-scale pyramid for +each image patch and learn to find for each point patch the best matching image +patch at a proper resolution level. Extensive experiments on two public +benchmarks demonstrate that 2D3D-MATR outperforms the previous state-of-the-art +P2-Net by around $20$ percentage points on inlier ratio and over $10$ points on +registration recall. Our code and models are available at +\url{https://github.com/minhaolee/2D3DMATR}. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ AD-CLIP: Adapting Domains in Prompt Space Using CLIP ICCV + + +
+ Although deep learning models have shown impressive performance on supervised +learning tasks, they often struggle to generalize well when the training +(source) and test (target) domains differ. Unsupervised domain adaptation (DA) +has emerged as a popular solution to this problem. However, current DA +techniques rely on visual backbones, which may lack semantic richness. Despite +the potential of large-scale vision-language foundation models like CLIP, their +effectiveness for DA has yet to be fully explored. To address this gap, we +introduce AD-CLIP, a domain-agnostic prompt learning strategy for CLIP that +aims to solve the DA problem in the prompt space. We leverage the frozen vision +backbone of CLIP to extract both image style (domain) and content information, +which we apply to learn prompt tokens. Our prompts are designed to be +domain-invariant and class-generalizable, by conditioning prompt learning on +image style and content features simultaneously. We use standard supervised +contrastive learning in the source domain, while proposing an entropy +minimization strategy to align domains in the embedding space given the target +domain data. We also consider a scenario where only target domain samples are +available during testing, without any source domain data, and propose a +cross-domain style mapping network to hallucinate domain-agnostic tokens. Our +extensive experiments on three benchmark DA datasets demonstrate the +effectiveness of AD-CLIP compared to existing literature. + +
+
+ comment: 10 pages, 8 figures, 4 tables. Accepted at OOD-CV, ICCV Workshop, + 2023 +
+
+
+
+
+ + ☆ Attention-based 3D CNN with Multi-layer Features for Alzheimer's Disease + Diagnosis using Brain Images + + +
+ Structural MRI and PET imaging play an important role in the diagnosis of Alzheimer's disease (AD), showing morphological changes and glucose metabolism changes in the brain, respectively. In some cognitive impairment patients the manifestations in the brain image are relatively inconspicuous; for example, accurate diagnosis from sMRI alone remains difficult in clinical practice. With the emergence of deep learning, convolutional neural networks (CNNs) have become a valuable method for AD-aided diagnosis, but some CNN methods cannot effectively learn the features of brain images, so AD diagnosis still presents challenges. In this work, we propose an end-to-end 3D CNN framework for AD diagnosis based on ResNet, which integrates multi-layer features obtained under an attention mechanism to better capture subtle differences in brain images. The attention maps show that our model can focus on key brain regions related to the disease diagnosis. Our method was verified in ablation experiments with two imaging modalities on 792 subjects from the ADNI database, where AD diagnostic accuracies of 89.71% and 91.18% were achieved based on sMRI and PET respectively, outperforming some state-of-the-art methods. + +
+
+ comment: 4 pages, 4 figures +
+
+
+
+
+ + ☆ Counterfactual Cross-modality Reasoning for Weakly Supervised Video + Moment Localization ACM MM 2023 + + +
+ Video moment localization aims to retrieve the target segment of an untrimmed video according to the natural language query. Weakly supervised methods have gained attention recently, as the precise temporal location of the target segment is not always available. However, one of the greatest challenges encountered by weakly supervised methods lies in the mismatch between the video and language induced by the coarse temporal annotations. To refine the vision-language alignment, recent works contrast the cross-modality similarities driven by reconstructing masked queries between positive and negative video proposals. However, the reconstruction may be influenced by the latent spurious correlation between the unmasked and the masked parts, which distorts the restoring process and further degrades the efficacy of contrastive learning since the masked words are not completely reconstructed from the cross-modality knowledge. In this paper, we discover and mitigate this spurious correlation through a novel proposed counterfactual cross-modality reasoning method. Specifically, we first formulate query reconstruction as an aggregated causal effect of cross-modality and query knowledge. Then by introducing counterfactual cross-modality knowledge into this aggregation, the spurious impact of the unmasked part contributing to the reconstruction is explicitly modeled. Finally, by suppressing the unimodal effect of the masked query, we can rectify the reconstructions of video proposals to perform reasonable contrastive learning. Extensive experimental evaluations demonstrate the effectiveness of our proposed method. The code is available at \href{https://github.com/sLdZ0306/CCR}{https://github.com/sLdZ0306/CCR}. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ IIHT: Medical Report Generation with Image-to-Indicator Hierarchical + Transformer + + +
+ Automated medical report generation has become increasingly important in +medical analysis. It can produce computer-aided diagnosis descriptions and thus +significantly alleviate the doctors' work. Inspired by the huge success of +neural machine translation and image captioning, various deep learning methods +have been proposed for medical report generation. However, due to the inherent +properties of medical data, including data imbalance and the length and +correlation between report sequences, the generated reports by existing methods +may exhibit linguistic fluency but lack adequate clinical accuracy. In this +work, we propose an image-to-indicator hierarchical transformer (IIHT) +framework for medical report generation. It consists of three modules, i.e., a +classifier module, an indicator expansion module and a generator module. The +classifier module first extracts image features from the input medical images +and produces disease-related indicators with their corresponding states. The +disease-related indicators are subsequently utilised as input for the indicator +expansion module, incorporating the "data-text-data" strategy. The +transformer-based generator then leverages these extracted features along with +image features as auxiliary information to generate final reports. Furthermore, +the proposed IIHT method is feasible for radiologists to modify disease +indicators in real-world scenarios and integrate the operations into the +indicator expansion module for fluent and accurate medical report generation. +Extensive experiments and comparisons with state-of-the-art methods under +various evaluation metrics demonstrate the great performance of the proposed +method. + +
+
+
+
+
+ + ☆ Self-Supervised Monocular Depth Estimation by Direction-aware Cumulative + Convolution Network ICCV2023 + + +
+ Monocular depth estimation is known as an ill-posed task in which objects in +a 2D image usually do not contain sufficient information to predict their +depth. Thus, it acts differently from other tasks (e.g., classification and +segmentation) in many ways. In this paper, we find that self-supervised +monocular depth estimation shows a direction sensitivity and environmental +dependency in the feature representation. But the current backbones borrowed +from other tasks pay less attention to handling different types of +environmental information, limiting the overall depth accuracy. To bridge this +gap, we propose a new Direction-aware Cumulative Convolution Network (DaCCN), +which improves the depth feature representation in two aspects. First, we +propose a direction-aware module, which can learn to adjust the feature +extraction in each direction, facilitating the encoding of different types of +information. Secondly, we design a new cumulative convolution to improve the +efficiency for aggregating important environmental information. Experiments +show that our method achieves significant improvements on three widely used +benchmarks, KITTI, Cityscapes, and Make3D, setting a new state-of-the-art +performance on the popular benchmarks with all three types of self-supervision. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ☆ Object Goal Navigation with Recursive Implicit Maps IROS 2023 + + +
+ Object goal navigation aims to navigate an agent to locations of a given +object category in unseen environments. Classical methods explicitly build maps +of environments and require extensive engineering while lacking semantic +information for object-oriented exploration. On the other hand, end-to-end +learning methods alleviate manual map design and predict actions using implicit +representations. Such methods, however, lack an explicit notion of geometry and +may have limited ability to encode navigation history. In this work, we propose +an implicit spatial map for object goal navigation. Our implicit map is +recursively updated with new observations at each step using a transformer. To +encourage spatial reasoning, we introduce auxiliary tasks and train our model +to reconstruct explicit maps as well as to predict visual features, semantic +labels and actions. Our method significantly outperforms the state of the art +on the challenging MP3D dataset and generalizes well to the HM3D dataset. We +successfully deploy our model on a real robot and achieve encouraging object +goal navigation results in real scenes using only a few real-world +demonstrations. Code, trained models and videos are available at +\url{https://www.di.ens.fr/willow/research/onav_rim/}. + +
+
+ comment: Accepted to IROS 2023 +
+
+
+
+
+ + ☆ NUPES : Non-Uniform Post-Training Quantization via Power Exponent Search + + +
+ Deep neural network (DNN) deployment has been confined to larger hardware +devices due to their expensive computational requirements. This challenge has +recently reached another scale with the emergence of large language models +(LLMs). In order to reduce both their memory footprint and latency, a promising +technique is quantization. It consists in converting floating point +representations to low bit-width fixed point representations, usually by +assuming a uniform mapping onto a regular grid. This process, referred to in +the literature as uniform quantization, may however be ill-suited as most DNN +weights and activations follow a bell-shaped distribution. This is even worse +on LLMs whose weight distributions are known to exhibit large, high impact, +outlier values. In this work, we propose an improvement over the most commonly +adopted way to tackle this limitation in deep learning models quantization, +namely, non-uniform quantization. NUPES leverages automorphisms to preserve the +scalar multiplications. Such transformations are derived from power functions. +However, the optimization of the exponent parameter and weight values remains a +challenging and novel problem which could not be solved with previous post +training optimization techniques which only learn to round up or down weight +values in order to preserve the predictive function. We circumvent this +limitation with a new paradigm: learning new quantized weights over the entire +quantized space. Similarly, we enable the optimization of the power exponent, +i.e. the optimization of the quantization operator itself during training by +alleviating all the numerical instabilities. The resulting predictive function +is compatible with integer-only low-bit inference. We show the ability of the +method to achieve state-of-the-art compression rates in both, data-free and +data-driven configurations. + +
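A hedged sketch of power-exponent (non-uniform) quantization: weights are warped by a signed power function, quantized on a uniform grid, and mapped back, so the effective grid is denser near zero. The exponent, bit-width, and toy weights are illustrative assumptions; NUPES additionally learns the exponent and the quantized weights, which is not shown here.

```python
# Non-uniform quantization through a signed power function (illustrative only).
import numpy as np

def power_quantize(w, bits=4, alpha=0.5):
    s = np.max(np.abs(w))
    u = np.sign(w) * (np.abs(w) / s) ** alpha        # compress with power function
    levels = 2 ** (bits - 1) - 1
    q = np.round(u * levels) / levels                # uniform grid in warped space
    return np.sign(q) * (np.abs(q) ** (1 / alpha)) * s  # map back to weight space

w = np.random.randn(1000) * 0.05                     # bell-shaped toy weights
w_q = power_quantize(w)
print("max abs error:", np.max(np.abs(w - w_q)))
```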
+
+
+
+
+ + ☆ Test-Time Selection for Robust Skin Lesion Analysis MICCAI 2023 + + +
+ Skin lesion analysis models are biased by artifacts placed during image +acquisition, which influence model predictions despite carrying no clinical +information. Solutions that address this problem by regularizing models to +prevent learning those spurious features achieve only partial success, and +existing test-time debiasing techniques are inappropriate for skin lesion +analysis due to either making unrealistic assumptions on the distribution of +test data or requiring laborious annotation from medical practitioners. We +propose TTS (Test-Time Selection), a human-in-the-loop method that leverages +positive (e.g., lesion area) and negative (e.g., artifacts) keypoints in test +samples. TTS effectively steers models away from exploiting spurious +artifact-related correlations without retraining, and with less annotation +requirements. Our solution is robust to a varying availability of annotations, +and different levels of bias. We showcase on the ISIC2019 dataset (for which we +release a subset of annotated images) how our model could be deployed in the +real-world for mitigating bias. + +
+
+ comment: Accepted at ISIC Workshop @ MICCAI 2023 +
+
+
+
+
+ + ☆ Category Feature Transformer for Semantic Segmentation + + +
+ Aggregation of multi-stage features has been revealed to play a significant +role in semantic segmentation. Unlike previous methods employing point-wise +summation or concatenation for feature aggregation, this study proposes the +Category Feature Transformer (CFT) that explores the flow of category embedding +and transformation among multi-stage features through the prevalent multi-head +attention mechanism. CFT learns unified feature embeddings for individual +semantic categories from high-level features during each aggregation process +and dynamically broadcasts them to high-resolution features. Integrating the +proposed CFT into a typical feature pyramid structure exhibits superior +performance over a broad range of backbone networks. We conduct extensive +experiments on popular semantic segmentation benchmarks. Specifically, the +proposed CFT obtains a compelling 55.1% mIoU with greatly reduced model +parameters and computations on the challenging ADE20K dataset. + +
+
+
+
+
+ + ☆ Exploring Linguistic Similarity and Zero-Shot Learning for Multilingual + Translation of Dravidian Languages + + +
+ Current research in zero-shot translation is plagued by several issues such as high compute requirements, increased training time and off-target translations. Proposed remedies often come at the cost of additional data or compute requirements. Pivot-based neural machine translation is preferred over a single-encoder model for most settings despite the increased training and evaluation time. In this work, we overcome the shortcomings of zero-shot translation by taking advantage of transliteration and linguistic similarity. We build a single encoder-decoder neural machine translation system for Dravidian-Dravidian multilingual translation and perform zero-shot translation. We compare the data vs zero-shot accuracy tradeoff and evaluate the performance of our vanilla method against the current state-of-the-art pivot-based method. We also test the theory that morphologically rich languages require large vocabularies by restricting the vocabulary using an optimal-transport-based technique. Our model manages to achieve scores within 3 BLEU of large-scale pivot-based models when it is trained on 50\% of the language directions. + +
+
+
+
+
+ + ☆ Cross-Domain Product Representation Learning for Rich-Content E-Commerce ICCV23 + + +
+ The proliferation of short video and live-streaming platforms has revolutionized how consumers engage in online shopping. Instead of browsing product pages, consumers are now turning to rich-content e-commerce, where they can purchase products through dynamic and interactive media like short videos and live streams. This emerging form of online shopping has introduced technical challenges, as products may be presented differently across various media domains. Therefore, a unified product representation is essential for achieving cross-domain product recognition to ensure an optimal user search experience and effective product recommendations. Despite the urgent industrial need for a unified cross-domain product representation, previous studies have predominantly focused only on product pages without taking into account short videos and live streams. To fill the gap in the rich-content e-commerce area, in this paper, we introduce a large-scale cRoss-dOmain Product rEcognition dataset, called ROPE. ROPE covers a wide range of product categories and contains over 180,000 products, corresponding to millions of short videos and live streams. It is the first dataset to cover product pages, short videos, and live streams simultaneously, providing the basis for establishing a unified product representation across different media domains. Furthermore, we propose a Cross-dOmain Product rEpresentation framework, namely COPE, which unifies product representations in different domains through multimodal learning including text and vision. Extensive experiments on downstream tasks demonstrate the effectiveness of COPE in learning a joint feature space for all product domains. + +
+
+ comment: ICCV23 +
+
+
+
+
+ + ☆ Deep Richardson-Lucy Deconvolution for Low-Light Image Deblurring + + +
+ Images taken under the low-light condition often contain blur and saturated +pixels at the same time. Deblurring images with saturated pixels is quite +challenging. Because of the limited dynamic range, the saturated pixels are +usually clipped in the imaging process and thus cannot be modeled by the linear +blur model. Previous methods use manually designed smooth functions to +approximate the clipping procedure. Their deblurring processes often require +empirically defined parameters, which may not be the optimal choices for +different images. In this paper, we develop a data-driven approach to model the +saturated pixels by a learned latent map. Based on the new model, the non-blind +deblurring task can be formulated into a maximum a posterior (MAP) problem, +which can be effectively solved by iteratively computing the latent map and the +latent image. Specifically, the latent map is computed by learning from a map +estimation network (MEN), and the latent image estimation process is +implemented by a Richardson-Lucy (RL)-based updating scheme. To estimate +high-quality deblurred images without amplified artifacts, we develop a prior +estimation network (PEN) to obtain prior information, which is further +integrated into the RL scheme. Experimental results demonstrate that the +proposed method performs favorably against state-of-the-art algorithms both +quantitatively and qualitatively on synthetic and real-world images. + +
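For context, the classic Richardson-Lucy update that the paper builds on looks as follows; the learned latent map (MEN) and prior network (PEN) are omitted, and the Gaussian blur kernel and toy image are illustrative.

```python
# Plain Richardson-Lucy deconvolution iteration.
import numpy as np
from scipy.signal import fftconvolve

def richardson_lucy(blurred, psf, iters=30, eps=1e-12):
    estimate = np.full_like(blurred, 0.5)
    psf_flip = psf[::-1, ::-1]
    for _ in range(iters):
        denom = fftconvolve(estimate, psf, mode="same") + eps
        estimate *= fftconvolve(blurred / denom, psf_flip, mode="same")
    return estimate

# Toy example: a rectangle blurred by a Gaussian point spread function.
x, y = np.meshgrid(np.arange(-7, 8), np.arange(-7, 8))
psf = np.exp(-(x**2 + y**2) / 8.0); psf /= psf.sum()
sharp = np.zeros((64, 64)); sharp[20:40, 25:35] = 1.0
blurred = fftconvolve(sharp, psf, mode="same")
restored = richardson_lucy(blurred, psf)
```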
+
+ comment: Accepted by IJCV +
+
+
+
+
+ + ☆ Robust Asymmetric Loss for Multi-Label Long-Tailed Learning + + +
+ In real medical data, training samples typically show long-tailed distributions with multiple labels. The class distribution of medical data has a long-tailed shape, in which the incidence of different diseases varies considerably, and at the same time it is not unusual for images taken from symptomatic patients to show multiple diseases (multi-label). Therefore, in this paper, we concurrently address these two issues by putting forth a robust asymmetric loss based on a polynomial function. Since our loss tackles both long-tailed and multi-label classification problems simultaneously, it leads to a complex design of the loss function with a large number of hyper-parameters. Although a model can be highly fine-tuned due to a large number of hyper-parameters, it is difficult to optimize all hyper-parameters at the same time, and there might be a risk of overfitting a model. Therefore, we regularize the loss function using the Hill loss approach, which makes it less sensitive to the numerous hyper-parameters and reduces the risk of overfitting the model. For this reason, the proposed loss is a generic method that can be applied to most medical image classification tasks and does not make the training process more time-consuming. We demonstrate that the proposed robust asymmetric loss performs favorably on long-tailed multi-label medical image classification as well as on various long-tailed single-label datasets. Notably, our method achieves Top-5 results on the CXR-LT dataset of the ICCV CVAMD 2023 competition. We open-source our implementation of the robust asymmetric loss in the public repository: https://github.com/kalelpark/RAL. + +
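As a point of reference only, here is a standard asymmetric multi-label loss of the kind the paper builds on; the paper's polynomial robustification and Hill-style regularization are not reproduced, and all hyper-parameters are illustrative.

```python
# Baseline asymmetric multi-label loss (ASL-style), given purely for context.
import torch

def asymmetric_loss(logits, targets, gamma_pos=0.0, gamma_neg=4.0, clip=0.05):
    p = torch.sigmoid(logits)
    p_shift = (p - clip).clamp(min=0.0)                 # probability margin for negatives
    loss_pos = targets * (1 - p) ** gamma_pos * torch.log(p.clamp(min=1e-8))
    loss_neg = (1 - targets) * p_shift ** gamma_neg * torch.log((1 - p_shift).clamp(min=1e-8))
    return -(loss_pos + loss_neg).mean()

logits = torch.randn(8, 14)                             # 8 images, 14 disease labels
targets = torch.randint(0, 2, (8, 14)).float()
print(asymmetric_loss(logits, targets))
```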
+
+
+
+
+ + ☆ Is there progress in activity progress prediction? ICCV + + +
+ Activity progress prediction aims to estimate what percentage of an activity has been completed. Currently this is done with machine learning approaches, trained and evaluated on complicated and realistic video datasets. The videos in these datasets vary drastically in length and appearance, and some of the activities have unanticipated developments, making activity progression difficult to estimate. In this work, we examine the results obtained by existing progress prediction methods on these datasets. We find that current progress prediction methods seem not to extract useful visual information for the progress prediction task. Therefore, these methods fail to exceed simple frame-counting baselines. We design a precisely controlled dataset for activity progress prediction, and on this synthetic dataset we show that the considered methods can make use of the visual information when it directly relates to the progress prediction. We conclude that the progress prediction task is ill-posed on the currently used real-world datasets. Moreover, to fairly measure activity progression we advise considering a simple but effective frame-counting baseline. + +
+
+ comment: Accepted at ICCVw-2023 (AI for Creative Video Editing and + Understanding, ICCV workshop 2023) +
+
+
+
+
+ + ☆ Critical Points ++: An Agile Point Cloud Importance Measure for Robust + Classification, Adversarial Defense and Explainable AI + + +
+ The ability to cope accurately and fast with Out-Of-Distribution (OOD) +samples is crucial in real-world safety demanding applications. In this work we +first study the interplay between critical points of 3D point clouds and OOD +samples. Our findings are that common corruptions and outliers are often +interpreted as critical points. We generalize the notion of critical points +into importance measures. We show that training a classification network based +only on less important points dramatically improves robustness, at a cost of +minor performance loss on the clean set. We observe that normalized entropy is +highly informative for corruption analysis. An adaptive threshold based on +normalized entropy is suggested for selecting the set of uncritical points. Our +proposed importance measure is extremely fast to compute. We show it can be +used for a variety of applications, such as Explainable AI (XAI), Outlier +Removal, Uncertainty Estimation, Robust Classification and Adversarial Defense. +We reach SOTA results on the two latter tasks. + +
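A hedged illustration of a critical-point-style importance measure with normalized entropy, in the spirit of the abstract: for a PointNet-like global max pooling, count how many feature channels each point "wins" and compute the normalized entropy of that distribution. The exact measure used in the paper may differ; the features here are random.

```python
# Channel-winning counts as a point importance measure, plus normalized entropy.
import numpy as np

rng = np.random.default_rng(0)
features = rng.standard_normal((1024, 256))      # (num_points, feature_dims), fake
winners = features.argmax(axis=0)                # which point is max per channel

counts = np.bincount(winners, minlength=features.shape[0]).astype(float)
p = counts / counts.sum()
p_nonzero = p[p > 0]
normalized_entropy = -(p_nonzero * np.log(p_nonzero)).sum() / np.log(len(p))
print(f"normalized entropy: {normalized_entropy:.3f}")

# Points that win few channels count as "less important"; the abstract reports
# that training only on such points improves robustness.
importance = counts
```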
+
+
+
+
+ + ☆ Look at the Neighbor: Distortion-aware Unsupervised Domain Adaptation + for Panoramic Semantic Segmentation ICCV 2023 + + +
+ Endeavors have been recently made to transfer knowledge from the labeled +pinhole image domain to the unlabeled panoramic image domain via Unsupervised +Domain Adaptation (UDA). The aim is to tackle the domain gaps caused by the +style disparities and distortion problem from the non-uniformly distributed +pixels of equirectangular projection (ERP). Previous works typically focus on +transferring knowledge based on geometric priors with specially designed +multi-branch network architectures. As a result, considerable computational +costs are induced, and meanwhile, their generalization abilities are profoundly +hindered by the variation of distortion among pixels. In this paper, we find +that the pixels' neighborhood regions of the ERP indeed introduce less +distortion. Intuitively, we propose a novel UDA framework that can effectively +address the distortion problems for panoramic semantic segmentation. In +comparison, our method is simpler, easier to implement, and more +computationally efficient. Specifically, we propose distortion-aware attention +(DA) capturing the neighboring pixel distribution without using any geometric +constraints. Moreover, we propose a class-wise feature aggregation (CFA) module +to iteratively update the feature representations with a memory bank. As such, +the feature similarity between two domains can be consistently optimized. +Extensive experiments show that our method achieves new state-of-the-art +performance while remarkably reducing 80% parameters. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ YOLO-MS: Rethinking Multi-Scale Representation Learning for Real-time + Object Detection + + +
+ We aim at providing the object detection community with an efficient and +performant object detector, termed YOLO-MS. The core design is based on a +series of investigations on how convolutions with different kernel sizes affect +the detection performance of objects at different scales. The outcome is a new +strategy that can strongly enhance multi-scale feature representations of +real-time object detectors. To verify the effectiveness of our strategy, we +build a network architecture, termed YOLO-MS. We train our YOLO-MS on the MS +COCO dataset from scratch without relying on any other large-scale datasets, +like ImageNet, or pre-trained weights. Without bells and whistles, our YOLO-MS +outperforms the recent state-of-the-art real-time object detectors, including +YOLO-v7 and RTMDet, when using a comparable number of parameters and FLOPs. +Taking the XS version of YOLO-MS as an example, with only 4.5M learnable +parameters and 8.7G FLOPs, it can achieve an AP score of 43%+ on MS COCO, which +is about 2%+ higher than RTMDet with the same model size. Moreover, our work +can also be used as a plug-and-play module for other YOLO models. Typically, +our method significantly improves the AP of YOLOv8 from 37%+ to 40%+ with even +fewer parameters and FLOPs. Code is available at +https://github.com/FishAndWasabi/YOLO-MS. + +
+
+
+
+
+ + ☆ Reviewing 3D Object Detectors in the Context of High-Resolution 3+1D + Radar CVPR 2023 + + +
+ Recent developments and the beginning market introduction of high-resolution +imaging 4D (3+1D) radar sensors have initialized deep learning-based radar +perception research. We investigate deep learning-based models operating on +radar point clouds for 3D object detection. 3D object detection on lidar point +cloud data is a mature area of 3D vision. Many different architectures have +been proposed, each with strengths and weaknesses. Due to similarities between +3D lidar point clouds and 3+1D radar point clouds, those existing 3D object +detectors are a natural basis to start deep learning-based 3D object detection +on radar data. Thus, the first step is to analyze the detection performance of +the existing models on the new data modality and evaluate them in depth. In +order to apply existing 3D point cloud object detectors developed for lidar +point clouds to the radar domain, they need to be adapted first. While some +detectors, such as PointPillars, have already been adapted to be applicable to +radar data, we have adapted others, e.g., Voxel R-CNN, SECOND, PointRCNN, and +PV-RCNN. To this end, we conduct a cross-model validation (evaluating a set of +models on one particular data set) as well as a cross-data set validation +(evaluating all models in the model set on several data sets). The +high-resolution radar data used are the View-of-Delft and Astyx data sets. +Finally, we evaluate several adaptations of the models and their training +procedures. We also discuss major factors influencing the detection performance +on radar data and propose possible solutions indicating potential future +research avenues. + +
+
+ comment: Published at CVPR 2023 Workshop on 3D Vision and Robotics + (https://drive.google.com/file/d/1xj4R5ucH3PaR7QdRDJbbkjS-3iBUsruR/view) +
+
+
+
+
+ + ☆ Surface Masked AutoEncoder: Self-Supervision for Cortical Imaging Data + + +
+ Self-supervision has been widely explored as a means of addressing the lack +of inductive biases in vision transformer architectures, which limits +generalisation when networks are trained on small datasets. This is crucial in +the context of cortical imaging, where phenotypes are complex and +heterogeneous, but the available datasets are limited in size. This paper +builds upon recent advancements in translating vision transformers to surface +meshes and investigates the potential of Masked AutoEncoder (MAE) +self-supervision for cortical surface learning. By reconstructing surface data +from a masked version of the input, the proposed method effectively models +cortical structure to learn strong representations that translate to improved +performance in downstream tasks. We evaluate our approach on cortical phenotype +regression using the developing Human Connectome Project (dHCP) and demonstrate +that pre-training leads to a 26\% improvement in performance, with an 80\% +faster convergence, compared to models trained from scratch. Furthermore, we +establish that pre-training vision transformer models on large datasets, such +as the UK Biobank (UKB), enables the acquisition of robust representations for +finetuning in low-data scenarios. Our code and pre-trained models are publicly +available at \url{https://github.com/metrics-lab/surface-vision-transformers}. + +
+
+
+
+
+ + ☆ KS-APR: Keyframe Selection for Robust Absolute Pose Regression + + +
+ Markerless Mobile Augmented Reality (AR) aims to anchor digital content in +the physical world without using specific 2D or 3D objects. Absolute Pose +Regressors (APR) are end-to-end machine learning solutions that infer the +device's pose from a single monocular image. Thanks to their low computation +cost, they can be directly executed on the constrained hardware of mobile AR +devices. However, APR methods tend to yield significant inaccuracies for input +images that are too distant from the training set. This paper introduces +KS-APR, a pipeline that assesses the reliability of an estimated pose with +minimal overhead by combining the inference results of the APR and the prior +images in the training set. Mobile AR systems tend to rely upon visual-inertial +odometry to track the relative pose of the device during the experience. As +such, KS-APR favours reliability over frequency, discarding unreliable poses. +This pipeline can integrate most existing APR methods to improve accuracy by +filtering unreliable images with their pose estimates. We implement the +pipeline on three types of APR models on indoor and outdoor datasets. The +median error on position and orientation is reduced for all models, and the +proportion of large errors is minimized across datasets. Our method enables +state-of-the-art APRs such as DFNetdm to outperform single-image and sequential +APR methods. These results demonstrate the scalability and effectiveness of +KS-APR for visual localization tasks that do not require one-shot decisions. + +
+
+
+
+
+ + ☆ Transforming Breast Cancer Diagnosis: Towards Real-Time Ultrasound to + Mammogram Conversion for Cost-Effective Diagnosis + + +
+ Ultrasound (US) imaging is better suited for intraoperative settings because it is real-time and more portable than other imaging techniques, such as mammography. However, US images are characterized by lower spatial resolution and noise-like artifacts. This research aims to address these limitations by providing surgeons with mammogram-like image quality in real-time from noisy US images. Unlike previous approaches for improving US image quality that aim to reduce artifacts by treating them as speckle noise, we recognize their value as informative wave interference patterns (WIP). To achieve this, we utilize the Stride software to numerically solve the forward model, generating ultrasound images from mammogram images by solving wave equations. Additionally, we leverage the power of domain adaptation to enhance the realism of the simulated ultrasound images. Then, we utilize generative adversarial networks (GANs) to tackle the inverse problem of generating mammogram-quality images from ultrasound images. The resultant images have considerably more discernible details than the original US images.
+
+
+
+
+ + ☆ A Generalized Physical-knowledge-guided Dynamic Model for Underwater + Image Enhancement + + +
+ Underwater images often suffer from color distortion and low contrast, resulting in various image types, due to the scattering and absorption of light by water. Moreover, it is difficult to obtain high-quality paired training samples for a generalized model. To tackle these challenges, we design a Generalized Underwater image enhancement method via a Physical-knowledge-guided Dynamic Model (GUPDM for short), consisting of three parts: Atmosphere-based Dynamic Structure (ADS), Transmission-guided Dynamic Structure (TDS), and Prior-based Multi-scale Structure (PMS). In particular, to cover complex underwater scenes, this study changes the global atmosphere light and the transmission to simulate various underwater image types (e.g., the underwater image color ranging from yellow to blue) through the formation model. We then design ADS and TDS that use dynamic convolutions to adaptively extract prior information from underwater images and generate parameters for PMS. These two modules enable the network to select appropriate parameters for various water types adaptively. Besides, the multi-scale feature extraction module in PMS uses convolution blocks with different kernel sizes, obtains weights for each feature map via a channel attention block, and fuses them to boost the receptive field of the network. The source code will be available at \href{https://github.com/shiningZZ/GUPDM}{https://github.com/shiningZZ/GUPDM}.
+
+ comment: Accepted by ACMMM 2023 +
+
+
+
+
+ + ☆ Benchmarking Algorithmic Bias in Face Recognition: An Experimental + Approach Using Synthetic Faces and Human Evaluation + + +
+ We propose an experimental method for measuring bias in face recognition +systems. Existing methods to measure bias depend on benchmark datasets that are +collected in the wild and annotated for protected (e.g., race, gender) and +non-protected (e.g., pose, lighting) attributes. Such observational datasets +only permit correlational conclusions, e.g., "Algorithm A's accuracy is +different on female and male faces in dataset X.". By contrast, experimental +methods manipulate attributes individually and thus permit causal conclusions, +e.g., "Algorithm A's accuracy is affected by gender and skin color." + Our method is based on generating synthetic faces using a neural face +generator, where each attribute of interest is modified independently while +leaving all other attributes constant. Human observers crucially provide the +ground truth on perceptual identity similarity between synthetic image pairs. +We validate our method quantitatively by evaluating race and gender biases of +three research-grade face recognition models. Our synthetic pipeline reveals +that for these algorithms, accuracy is lower for Black and East Asian +population subgroups. Our method can also quantify how perceptual changes in +attributes affect face identity distances reported by these models. Our large +synthetic dataset, consisting of 48,000 synthetic face image pairs (10,200 +unique synthetic faces) and 555,000 human annotations (individual attributes +and pairwise identity comparisons) is available to researchers in this +important area. + +
+
+ comment: Accepted to ICCV2023; 18 figures
+
+
+
+
+ + ☆ Deep Fusion Transformer Network with Weighted Vector-Wise Keypoints + Voting for Robust 6D Object Pose Estimation ICCV2023 + + +
+ One critical challenge in 6D object pose estimation from a single RGBD image +is efficient integration of two different modalities, i.e., color and depth. In +this work, we tackle this problem by a novel Deep Fusion Transformer~(DFTr) +block that can aggregate cross-modality features for improving pose estimation. +Unlike existing fusion methods, the proposed DFTr can better model +cross-modality semantic correlation by leveraging their semantic similarity, +such that globally enhanced features from different modalities can be better +integrated for improved information extraction. Moreover, to further improve +robustness and efficiency, we introduce a novel weighted vector-wise voting +algorithm that employs a non-iterative global optimization strategy for precise +3D keypoint localization while achieving near real-time inference. Extensive +experiments show the effectiveness and strong generalization capability of our +proposed 3D keypoint voting algorithm. Results on four widely used benchmarks +also demonstrate that our method outperforms the state-of-the-art methods by +large margins. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Ensemble Modeling for Multimodal Visual Action Recognition + + +
+ In this work, we propose an ensemble modeling approach for multimodal action +recognition. We independently train individual modality models using a variant +of focal loss tailored to handle the long-tailed distribution of the MECCANO +[21] dataset. Based on the underlying principle of focal loss, which captures +the relationship between tail (scarce) classes and their prediction +difficulties, we propose an exponentially decaying variant of focal loss for +our current task. It initially emphasizes learning from the hard misclassified +examples and gradually adapts to the entire range of examples in the dataset. +This annealing process encourages the model to strike a balance between +focusing on the sparse set of hard samples, while still leveraging the +information provided by the easier ones. Additionally, we opt for the late +fusion strategy to combine the resultant probability distributions from RGB and +Depth modalities for final action prediction. Experimental evaluations on the +MECCANO dataset demonstrate the effectiveness of our approach. + +
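To make the loss design concrete, below is a small, self-contained sketch of a focal loss whose focusing parameter decays exponentially over epochs, plus a late-fusion step that averages per-modality probabilities. The decay schedule, class count, and equal fusion weights are illustrative assumptions, not the values used in the report.

```python
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, gamma):
    """Multi-class focal loss: (1 - p_t)^gamma weighted cross-entropy."""
    log_p = F.log_softmax(logits, dim=-1)
    log_pt = log_p.gather(1, targets.unsqueeze(1)).squeeze(1)
    pt = log_pt.exp()
    return ((1.0 - pt) ** gamma * -log_pt).mean()

def decayed_gamma(epoch, gamma0=2.0, decay=0.1):
    """Hypothetical schedule: focus on hard examples early, relax over training."""
    return gamma0 * torch.exp(torch.tensor(-decay * epoch)).item()

def late_fusion(rgb_logits, depth_logits):
    """Late fusion: average the per-modality class probability distributions."""
    return 0.5 * (F.softmax(rgb_logits, -1) + F.softmax(depth_logits, -1))

logits = torch.randn(8, 61)              # 61 classes is an assumed placeholder
targets = torch.randint(0, 61, (8,))
loss = focal_loss(logits, targets, decayed_gamma(epoch=3))
fused = late_fusion(torch.randn(8, 61), torch.randn(8, 61))
```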
+
+ comment: Technical Report accepted at the Multimodal Action Recognition + Challenge on the MECCANO Dataset - ICIAP 2023 +
+
+
+
+
+ + ☆ Speech-Driven 3D Face Animation with Composite and Regional Facial + Movements + + +
+ Speech-driven 3D face animation poses significant challenges due to the +intricacy and variability inherent in human facial movements. This paper +emphasizes the importance of considering both the composite and regional +natures of facial movements in speech-driven 3D face animation. The composite +nature pertains to how speech-independent factors globally modulate +speech-driven facial movements along the temporal dimension. Meanwhile, the +regional nature alludes to the notion that facial movements are not globally +correlated but are actuated by local musculature along the spatial dimension. +It is thus indispensable to incorporate both natures for engendering vivid +animation. To address the composite nature, we introduce an adaptive modulation +module that employs arbitrary facial movements to dynamically adjust +speech-driven facial movements across frames on a global scale. To accommodate +the regional nature, our approach ensures that each constituent of the facial +features for every frame focuses on the local spatial movements of 3D faces. +Moreover, we present a non-autoregressive backbone for translating audio to 3D +facial movements, which maintains high-frequency nuances of facial movements +and facilitates efficient inference. Comprehensive experiments and user studies +demonstrate that our method surpasses contemporary state-of-the-art approaches +both qualitatively and quantitatively. + +
+
+ comment: Accepted by MM 2023, 9 pages, 7 figures +
+
+
+
+
+ + ☆ Adaptive Low Rank Adaptation of Segment Anything to Salient Object + Detection + + +
+ Foundation models, such as OpenAI's GPT-3 and GPT-4, Meta's LLaMA, and +Google's PaLM2, have revolutionized the field of artificial intelligence. A +notable paradigm shift has been the advent of the Segment Anything Model (SAM), +which has exhibited a remarkable capability to segment real-world objects, +trained on 1 billion masks and 11 million images. Although SAM excels in +general object segmentation, it lacks the intrinsic ability to detect salient +objects, resulting in suboptimal performance in this domain. To address this +challenge, we present the Segment Salient Object Model (SSOM), an innovative +approach that adaptively fine-tunes SAM for salient object detection by +harnessing the low-rank structure inherent in deep learning. Comprehensive +qualitative and quantitative evaluations across five challenging RGB benchmark +datasets demonstrate the superior performance of our approach, surpassing +state-of-the-art methods. + +
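As a generic illustration of the low-rank adaptation idea referenced above (the exact adapter placement and ranks used for SAM are not given here and are assumptions), one can wrap a frozen linear projection with trainable low-rank factors:

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Low-rank adapter around a frozen linear layer: y = Wx + scale * B A x."""

    def __init__(self, base: nn.Linear, rank: int = 4, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():            # freeze the pretrained weight
            p.requires_grad = False
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))  # zero init: no change at start
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + (x @ self.A.T @ self.B.T) * self.scale

# Wrapping one attention projection of a hypothetical ViT-style image encoder:
proj = nn.Linear(768, 768)
adapted = LoRALinear(proj, rank=8)
out = adapted(torch.randn(2, 196, 768))             # (batch, tokens, dim)
```

Only the small `A` and `B` matrices receive gradients, which is what makes this kind of fine-tuning cheap compared with updating the full backbone.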
+
+ comment: 13 pages, 0 figures +
+
+
+
+
+ + ☆ Progressive Spatio-temporal Perception for Audio-Visual Question + Answering ACM MM 2023 + + +
+ The Audio-Visual Question Answering (AVQA) task aims to answer questions about different visual objects, sounds, and their associations in videos. Such naturally multi-modal videos are composed of rich and complex dynamic audio-visual components, most of which could be unrelated to the given questions, or even act as interference in answering the content of interest. Conversely, focusing only on the question-aware audio-visual content removes this influence while enabling the model to answer more efficiently. In this paper, we propose a Progressive Spatio-Temporal Perception Network (PSTP-Net), which contains three modules that progressively identify key spatio-temporal regions w.r.t. questions. Specifically, a temporal segment selection module is first introduced to select the most relevant audio-visual segments related to the given question. Then, a spatial region selection module is utilized to choose the most relevant regions associated with the question from the selected temporal segments. To further refine the selection of features, an audio-guided visual attention module is employed to perceive the association between audio and selected spatial regions. Finally, the spatio-temporal features from these modules are integrated for answering the question. Extensive experimental results on the public MUSIC-AVQA and AVQA datasets provide compelling evidence of the effectiveness and efficiency of PSTP-Net. Code is available at: \href{https://github.com/GeWu-Lab/PSTP-Net}{https://github.com/GeWu-Lab/PSTP-Net}
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ SC3K: Self-supervised and Coherent 3D Keypoints Estimation from Rotated, + Noisy, and Decimated Point Cloud Data ICCV + + +
+ This paper proposes a new method to infer keypoints from arbitrary object categories in practical scenarios where point cloud data (PCD) are noisy, down-sampled and arbitrarily rotated. Our proposed model adheres to the following principles: i) keypoints inference is fully unsupervised (no annotation given), ii) keypoints position error should be low and resilient to PCD perturbations (robustness), iii) keypoints should not change their indexes across intra-class objects (semantic coherence), iv) keypoints should be close to the PCD surface (compactness). We achieve these desiderata by proposing a new self-supervised training strategy for keypoints estimation that does not assume any a priori knowledge of the object class, and a model architecture with coupled auxiliary losses that promotes the desired keypoints properties. We compare the keypoints estimated by the proposed approach with those of the state-of-the-art unsupervised approaches. The experiments show that our approach outperforms them, estimating keypoints with improved coverage (+9.41%) and semantic consistency (+4.66%) that best characterize the object's 3D shape for downstream tasks. Code and data are available at: https://github.com/IITPAVIS/SC3K
+
+ comment: This paper has been accepted in International Conference on Computer + Vision (ICCV) 2023. For code and data, please refer to the following GitHub + page: https://github.com/IITPAVIS/SC3K +
+
+
+
+
+ + ☆ A Comparative Assessment of Multi-view fusion learning for Crop + Classification + + +
+ With a rapidly increasing amount and diversity of remote sensing (RS) data +sources, there is a strong need for multi-view learning modeling. This is a +complex task when considering the differences in resolution, magnitude, and +noise of RS data. The typical approach for merging multiple RS sources has been +input-level fusion, but other - more advanced - fusion strategies may +outperform this traditional approach. This work assesses different fusion +strategies for crop classification in the CropHarvest dataset. The fusion +methods proposed in this work outperform models based on individual views and +previous fusion methods. We do not find one single fusion method that +consistently outperforms all other approaches. Instead, we present a comparison +of multi-view fusion methods for three different datasets and show that, +depending on the test region, different methods obtain the best performance. +Despite this, we suggest a preliminary criterion for the selection of fusion +methods. + +
+
+ comment: Accepted at IEEE International Geoscience and Remote Sensing + Symposium 2023 +
+
+
+
+
+ + ☆ Enhancing Low-light Light Field Images with A Deep Compensation + Unfolding Network + + +
+ This paper presents a novel and interpretable end-to-end learning framework, +called the deep compensation unfolding network (DCUNet), for restoring light +field (LF) images captured under low-light conditions. DCUNet is designed with +a multi-stage architecture that mimics the optimization process of solving an +inverse imaging problem in a data-driven fashion. The framework uses the +intermediate enhanced result to estimate the illumination map, which is then +employed in the unfolding process to produce a new enhanced result. +Additionally, DCUNet includes a content-associated deep compensation module at +each optimization stage to suppress noise and illumination map estimation +errors. To properly mine and leverage the unique characteristics of LF images, +this paper proposes a pseudo-explicit feature interaction module that +comprehensively exploits redundant information in LF images. The experimental +results on both simulated and real datasets demonstrate the superiority of our +DCUNet over state-of-the-art methods, both qualitatively and quantitatively. +Moreover, DCUNet preserves the essential geometric structure of enhanced LF +images much better. The code will be publicly available at +https://github.com/lyuxianqiang/LFLL-DCU. + +
+
+
+
+
+ + ☆ Learning Gabor Texture Features for Fine-Grained Recognition ICCV2023 + + +
+ Extracting and using class-discriminative features is critical for fine-grained recognition. Existing works have demonstrated the possibility of applying deep CNNs to exploit features that distinguish similar classes. However, CNNs suffer from problems including frequency bias and loss of detailed local information, which restricts the performance of recognizing fine-grained categories. To address the challenge, we propose a novel texture branch as complementary to the CNN branch for feature extraction. We innovatively utilize Gabor filters as a powerful extractor to exploit texture features, motivated by the capability of Gabor filters in effectively capturing multi-frequency features and detailed local information. We implement several designs to enhance the effectiveness of Gabor filters, including imposing constraints on parameter values and developing a learning method to determine the optimal parameters. Moreover, we introduce a statistical feature extractor to utilize informative statistical information from the signals captured by Gabor filters, and a gate selection mechanism to enable efficient computation by only considering qualified regions as input for texture extraction. Through the integration of features from the Gabor-filter-based texture branch and CNN-based semantic branch, we achieve comprehensive information extraction. We demonstrate the efficacy of our method on multiple datasets, including CUB-200-2011, NA-bird, Stanford Dogs, and GTOS-mobile. State-of-the-art performance is achieved using our approach.
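For readers unfamiliar with Gabor features, here is a small sketch of a fixed Gabor filter bank with simple mean/variance statistics of the responses. In the paper the filter parameters are learned and constrained; the fixed wavelengths, orientations, and SciPy-based convolution below are assumptions made for illustration.

```python
import numpy as np
from scipy.signal import convolve2d

def gabor_kernel(size=15, wavelength=4.0, theta=0.0, sigma=3.0, gamma=0.5):
    """Real (cosine) Gabor kernel: an oriented band-pass filter for texture."""
    half = size // 2
    y, x = np.mgrid[-half:half + 1, -half:half + 1].astype(np.float64)
    xr = x * np.cos(theta) + y * np.sin(theta)
    yr = -x * np.sin(theta) + y * np.cos(theta)
    envelope = np.exp(-(xr**2 + gamma**2 * yr**2) / (2 * sigma**2))
    return envelope * np.cos(2 * np.pi * xr / wavelength)

def gabor_features(image, orientations=4, wavelengths=(2.0, 4.0, 8.0)):
    """Mean/variance statistics of Gabor responses (a simple statistical extractor)."""
    feats = []
    for lam in wavelengths:
        for k in range(orientations):
            kern = gabor_kernel(wavelength=lam, theta=k * np.pi / orientations)
            resp = convolve2d(image, kern, mode="same")
            feats += [resp.mean(), resp.var()]
    return np.array(feats)

img = np.random.rand(64, 64)          # stand-in grayscale patch
print(gabor_features(img).shape)      # (24,) = 3 wavelengths x 4 orientations x 2 stats
```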
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ☆ Robust Localization with Visual-Inertial Odometry Constraints for + Markerless Mobile AR + + +
+ Visual Inertial Odometry (VIO) is an essential component of modern Augmented +Reality (AR) applications. However, VIO only tracks the relative pose of the +device, leading to drift over time. Absolute pose estimation methods infer the +device's absolute pose, but their accuracy depends on the input quality. This +paper introduces VIO-APR, a new framework for markerless mobile AR that +combines an absolute pose regressor (APR) with a local VIO tracking system. +VIO-APR uses VIO to assess the reliability of the APR and the APR to identify +and compensate for VIO drift. This feedback loop results in more accurate +positioning and more stable AR experiences. To evaluate VIO-APR, we created a +dataset that combines camera images with ARKit's VIO system output for six +indoor and outdoor scenes of various scales. Over this dataset, VIO-APR +improves the median accuracy of popular APR by up to 36\% in position and 29\% +in orientation, increases the percentage of frames in the high ($0.25 m, +2^{\circ}$) accuracy level by up to 112\% and reduces the percentage of frames +predicted below the low ($5 m, 10^\circ$) accuracy greatly. We implement +VIO-APR into a mobile AR application using Unity to demonstrate its +capabilities. VIO-APR results in noticeably more accurate localization and a +more stable overall experience. + +
+
+
+
+
+ + ☆ Product Review Image Ranking for Fashion E-commerce SIGIR + + +
+ In a fashion e-commerce platform where customers can't physically examine the +products on their own, being able to see other customers' text and image +reviews of the product is critical while making purchase decisions. Given the +high reliance on these reviews, over the years we have observed customers +proactively sharing their reviews. With an increase in the coverage of User +Generated Content (UGC), there has been a corresponding increase in the number +of customer images. It is thus imperative to display the most relevant images +on top as it may influence users' online shopping choices and behavior. In this +paper, we propose a simple yet effective training procedure for ranking +customer images. We created a dataset consisting of Myntra (A Major Indian +Fashion e-commerce company) studio posts and highly engaged (upvotes/downvotes) +UGC images as our starting point and used selected distortion techniques on the +images of the above dataset to bring their quality at par with those of bad UGC +images. We train our network to rank bad-quality images lower than high-quality +ones. Our proposed method outperforms the baseline models on two metrics, +namely correlation coefficient, and accuracy, by substantial margins. + +
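A minimal sketch of the pairwise ranking idea described above: a scorer is trained so that an original image ranks above its synthetically distorted copy. The toy linear scorer, the Gaussian "distortion", and the margin value are placeholders, not the production setup.

```python
import torch
import torch.nn as nn

# Pairwise ranking: a studio / highly engaged UGC image should outscore a distorted copy.
scorer = nn.Sequential(nn.Flatten(), nn.Linear(3 * 224 * 224, 1))   # stand-in for a CNN scorer
criterion = nn.MarginRankingLoss(margin=1.0)

good = torch.randn(4, 3, 224, 224)
bad = good + 0.3 * torch.randn_like(good)      # synthetic distortion (blur/compression in practice)

s_good, s_bad = scorer(good), scorer(bad)
target = torch.ones_like(s_good)               # +1 means "the first input should rank higher"
loss = criterion(s_good, s_bad, target)
loss.backward()
```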
+
+ comment: Accepted in Proceedings of ACM SIGIR Workshop on eCommerce (SIGIR + eCom'22) +
+
+
+
+
+ + ☆ HGDNet: A Height-Hierarchy Guided Dual-Decoder Network for Single View + Building Extraction and Height Estimation + + +
+ Unifying the correlative single-view satellite image building extraction and height estimation tasks indicates a promising way to share representations and acquire a generalist model for large-scale urban 3D reconstruction. However, the common spatial misalignment between building footprints and stereo-reconstructed nDSM height labels incurs degraded performance on both tasks. To address this issue, we propose a Height-hierarchy Guided Dual-decoder Network (HGDNet) to estimate building height. Under the guidance of the synthesized discrete height-hierarchy nDSM, an auxiliary height-hierarchical building extraction branch enhances the height estimation branch with implicit constraints, yielding an accuracy improvement of more than 6% on the DFC 2023 track2 dataset. An additional two-stage cascade architecture is adopted to achieve more accurate building extraction. Experiments on the DFC 2023 Track 2 dataset show the superiority of the proposed method in building height estimation ({\delta}1:0.8012) and instance extraction (AP50:0.7730), and the final average score of 0.7871 ranks first in the test phase.
+
+
+
+
+ + ☆ Interaction-aware Joint Attention Estimation Using People Attributes ICCV2023 + + +
+ This paper proposes joint attention estimation in a single image. Different from related work in which only the gaze-related attributes of people are independently employed, (i) their locations and actions are also employed as contextual cues for weighting their attributes, and (ii) interactions among all of these attributes are explicitly modeled in our method. For the interaction modeling, we propose a novel Transformer-based attention network to encode joint attention as low-dimensional features. We introduce a specialized MLP head with positional embedding to the Transformer so that it predicts pixelwise confidence of joint attention for generating the confidence heatmap. This pixelwise prediction improves the heatmap accuracy by avoiding the ill-posed problem in which the high-dimensional heatmap is predicted from the low-dimensional features. The estimated joint attention is further improved by being integrated with general image-based attention estimation. Our method outperforms SOTA methods quantitatively in comparative experiments. Code: https://anonymous.4open.science/r/anonymized_codes-ECA4.
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ☆ Flexible Isosurface Extraction for Gradient-Based Mesh Optimization SIGGRAPH 2023 + + +
+ This work considers gradient-based mesh optimization, where we iteratively +optimize for a 3D surface mesh by representing it as the isosurface of a scalar +field, an increasingly common paradigm in applications including +photogrammetry, generative modeling, and inverse physics. Existing +implementations adapt classic isosurface extraction algorithms like Marching +Cubes or Dual Contouring; these techniques were designed to extract meshes from +fixed, known fields, and in the optimization setting they lack the degrees of +freedom to represent high-quality feature-preserving meshes, or suffer from +numerical instabilities. We introduce FlexiCubes, an isosurface representation +specifically designed for optimizing an unknown mesh with respect to geometric, +visual, or even physical objectives. Our main insight is to introduce +additional carefully-chosen parameters into the representation, which allow +local flexible adjustments to the extracted mesh geometry and connectivity. +These parameters are updated along with the underlying scalar field via +automatic differentiation when optimizing for a downstream task. We base our +extraction scheme on Dual Marching Cubes for improved topological properties, +and present extensions to optionally generate tetrahedral and +hierarchically-adaptive meshes. Extensive experiments validate FlexiCubes on +both synthetic benchmarks and real-world applications, showing that it offers +significant improvements in mesh quality and geometric fidelity. + +
+
+ comment: SIGGRAPH 2023. Project page: + https://research.nvidia.com/labs/toronto-ai/flexicubes/ +
+
+
+
+
+ + ☆ TriDo-Former: A Triple-Domain Transformer for Direct PET Reconstruction + from Low-Dose Sinograms + + +
+ To obtain high-quality positron emission tomography (PET) images while minimizing radiation exposure, various methods have been proposed for reconstructing standard-dose PET (SPET) images from low-dose PET (LPET) sinograms directly. However, current methods often neglect boundaries during sinogram-to-image reconstruction, resulting in high-frequency distortion in the frequency domain and diminished or fuzzy edges in the reconstructed images. Furthermore, the convolutional architectures, which are commonly used, lack the ability to model long-range non-local interactions, potentially leading to inaccurate representations of global structures. To alleviate these problems, we propose a transformer-based model that unites the triple domains of sinogram, image, and frequency for direct PET reconstruction, namely TriDo-Former. Specifically, the TriDo-Former consists of two cascaded networks, i.e., a sinogram enhancement transformer (SE-Former) for denoising the input LPET sinograms and a spatial-spectral reconstruction transformer (SSR-Former) for reconstructing SPET images from the denoised sinograms. Different from the vanilla transformer that splits an image into 2D patches, based specifically on the PET imaging mechanism, our SE-Former divides the sinogram into 1D projection view angles to maintain its inner structure while denoising, preventing the noise in the sinogram from propagating into the image domain. Moreover, to mitigate high-frequency distortion and improve reconstruction details, we integrate global frequency parsers (GFPs) into SSR-Former. The GFP serves as a learnable frequency filter that globally adjusts the frequency components in the frequency domain, enforcing the network to restore high-frequency details resembling real SPET images. Validations on a clinical dataset demonstrate that our TriDo-Former outperforms the state-of-the-art methods qualitatively and quantitatively.
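One plausible reading of the "learnable frequency filter" idea is an element-wise, learnable reweighting of the 2D Fourier spectrum of a feature map. The sketch below is such a generic module, not the GFP as implemented in TriDo-Former, and the tensor sizes are arbitrary.

```python
import torch
import torch.nn as nn

class GlobalFrequencyParser(nn.Module):
    """Learnable element-wise filter applied to the 2D FFT of a feature map."""

    def __init__(self, channels: int, height: int, width: int):
        super().__init__()
        # rfft2 keeps width // 2 + 1 frequency bins along the last axis
        self.weight = nn.Parameter(torch.ones(channels, height, width // 2 + 1))

    def forward(self, x):                                   # x: (B, C, H, W)
        spec = torch.fft.rfft2(x, norm="ortho")             # to the frequency domain
        spec = spec * self.weight                            # reweight each frequency bin
        return torch.fft.irfft2(spec, s=x.shape[-2:], norm="ortho")

gfp = GlobalFrequencyParser(64, 32, 32)
y = gfp(torch.randn(2, 64, 32, 32))                         # same shape as the input
```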
+
+
+
+
+ + ☆ Pseudo-label Alignment for Semi-supervised Instance Segmentation ICCV 2023 + + +
+ Pseudo-labeling is significant for semi-supervised instance segmentation, +which generates instance masks and classes from unannotated images for +subsequent training. However, in existing pipelines, pseudo-labels that contain +valuable information may be directly filtered out due to mismatches in class +and mask quality. To address this issue, we propose a novel framework, called +pseudo-label aligning instance segmentation (PAIS), in this paper. In PAIS, we +devise a dynamic aligning loss (DALoss) that adjusts the weights of +semi-supervised loss terms with varying class and mask score pairs. Through +extensive experiments conducted on the COCO and Cityscapes datasets, we +demonstrate that PAIS is a promising framework for semi-supervised instance +segmentation, particularly in cases where labeled data is severely limited. +Notably, with just 1\% labeled data, PAIS achieves 21.2 mAP (based on +Mask-RCNN) and 19.9 mAP (based on K-Net) on the COCO dataset, outperforming the +current state-of-the-art model, \ie, NoisyBoundary with 7.7 mAP, by a margin of +over 12 points. Code is available at: \url{https://github.com/hujiecpp/PAIS}. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Fine-grained building roof instance segmentation based on domain adapted + pretraining and composite dual-backbone + + +
+ The diversity of building architecture styles of global cities situated on various landforms, the degraded optical imagery affected by clouds and shadows, and the significant inter-class imbalance of roof types pose challenges for designing a robust and accurate building roof instance segmentor. To address these issues, we propose an effective framework to fulfill semantic interpretation of individual buildings with high-resolution optical satellite imagery. Specifically, the leveraged domain adapted pretraining strategy and composite dual-backbone greatly facilitate the discriminative feature learning. Moreover, a new data augmentation pipeline, stochastic weight averaging (SWA) training, and an instance segmentation based model ensemble in testing are utilized to acquire an additional performance boost. Experiment results show that our approach ranks in the first place of the 2023 IEEE GRSS Data Fusion Contest (DFC) Track 1 test phase ($mAP_{50}$:50.6\%). Notably, we have also explored the potential of multimodal data fusion with both optical satellite imagery and SAR data.
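Stochastic weight averaging as mentioned above is available out of the box in PyTorch; the following toy loop (with a stand-in model and data, and an assumed SWA start epoch and learning rate) shows the typical usage pattern rather than the contest pipeline itself.

```python
import torch
from torch import nn, optim
from torch.optim.swa_utils import AveragedModel, SWALR, update_bn

# Toy classifier and loader standing in for the segmentation pipeline.
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4))
loader = [(torch.randn(8, 16), torch.randint(0, 4, (8,))) for _ in range(10)]

optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
swa_model = AveragedModel(model)            # keeps the running average of weights
swa_scheduler = SWALR(optimizer, swa_lr=0.005)
swa_start = 5                                # assumed: start averaging after epoch 5

for epoch in range(10):
    for x, y in loader:
        optimizer.zero_grad()
        nn.functional.cross_entropy(model(x), y).backward()
        optimizer.step()
    if epoch >= swa_start:
        swa_model.update_parameters(model)   # fold current weights into the average
        swa_scheduler.step()

update_bn(loader, swa_model)                 # recompute BatchNorm statistics, if any
```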
+
+
+
+
+ + ☆ TCSloT: Text Guided 3D Context and Slope Aware Triple Network for Dental + Implant Position Prediction + + +
+ In implant prosthesis treatment, the surgical guide of the implant is used to ensure accurate implantation. However, such design heavily relies on the manual location of the implant position. While deep neural networks have been proposed to assist the dentist in locating the implant position, most of them take a single slice as input, which does not fully explore 3D contextual information and ignores the influence of implant slope. In this paper, we design a Text Guided 3D Context and Slope Aware Triple Network (TCSloT) which enables the perception of contextual information from multiple adjacent slices and awareness of variation of implant slopes. A Texture Variation Perception (TVP) module is correspondingly elaborated to process the multiple slices and capture the texture variation among slices, and a Slope-Aware Loss (SAL) is proposed to dynamically assign varying weights for the regression head. Additionally, we design a conditional text guidance (CTG) module to integrate the text condition (i.e., left, middle and right) from CLIP for assisting the implant position prediction. Extensive experiments on a dental implant dataset through five-fold cross-validation demonstrate that the proposed TCSloT achieves superior performance to existing methods.
+
+
+
+
+ + ☆ Towards General and Fast Video Derain via Knowledge Distillation ICME + + +
+ As a common natural weather condition, rain can obscure video frames and thus affect the performance of the visual system, so video deraining receives a lot of attention. In natural environments, rain has a wide variety of streak types, which increases the difficulty of the rain removal task. In this paper, we propose a Rain Review-based General video derain Network via knowledge distillation (named RRGNet) that handles different rain streak types with one pre-training weight. Specifically, we design a frame grouping-based encoder-decoder network that makes full use of the temporal information of the video. Further, we use the old task model to guide the current model in learning new rain streak types while avoiding forgetting. To consolidate the network's ability to derain, we design a rain review module to play back data from old tasks for the current model. The experimental results show that our developed general method achieves the best results in terms of running speed and deraining effect.
+
+ comment: 6 pages; Accepted at IEEE ICME +
+
+
+
+
+ + ☆ Prostate Age Gap (PAG): An MRI surrogate marker of aging for prostate + cancer detection + + +
+ Background: Prostate cancer (PC) MRI-based risk calculators are commonly based on biological markers (e.g. PSA), MRI markers (e.g. volume), and patient age. Whilst patient age measures the number of years an individual has lived, biological age (BA) might better reflect the physiology of an individual. However, surrogates from prostate MRI and linkage with clinically significant PC (csPC) remain to be explored. Purpose: To obtain and evaluate Prostate Age Gap (PAG) as an MRI marker tool for csPC risk. Study type: Retrospective. Population: A total of 7243 prostate MRI slices from 468 participants who had undergone prostate biopsies. A deep learning model was trained on 3223 MRI slices cropped around the gland from 81 low-grade PC (ncsPC, Gleason score <=6) and 131 negative cases and tested on the remaining 256 participants. Assessment: Chronological age was defined as the age of the participant at the time of the visit and used to train the deep learning model to predict the age of the patient. We then obtained PAG, defined as the model predicted age minus the patient's chronological age. Multivariate logistic regression models were used to estimate the association through odds ratio (OR) and predictive value of PAG and compared against PSA levels and PI-RADS>=3. Statistical tests: T-test, Mann-Whitney U test, Permutation test and ROC curve analysis. Results: The multivariate adjusted model showed a significant difference in the odds of clinically significant PC (csPC, Gleason score >=7) (OR =3.78, 95% confidence interval (CI):2.32-6.16, P <.001). PAG showed a better predictive ability when compared to PI-RADS>=3 and adjusted by other risk factors, including PSA levels: AUC =0.981 vs AUC =0.704, p<.001. Conclusion: PAG was significantly associated with the risk of clinically significant PC and outperformed other well-established PC risk factors.
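To make the marker definition and the association analysis concrete, here is a toy sketch with synthetic stand-in values: PAG is simply the model-predicted age minus chronological age, and its association with csPC can be probed with a multivariate logistic regression. All numbers below are simulated placeholders; only the structure of the computation mirrors the description.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
n = 256
chronological = rng.uniform(50, 80, n)                 # patient age at visit (years)
predicted = chronological + rng.normal(0, 4, n)        # stand-in for the deep model's age estimate
pag = predicted - chronological                        # Prostate Age Gap

psa = rng.lognormal(1.5, 0.5, n)                       # synthetic PSA levels
prob = 1 / (1 + np.exp(-(0.3 * pag + 0.1 * (psa - 5))))
cspc = (rng.random(n) < prob).astype(int)              # synthetic csPC outcomes

# Multivariate model: odds of csPC given PAG, adjusted for PSA.
X = np.column_stack([pag, psa])
model = LogisticRegression().fit(X, cspc)
odds_ratios = np.exp(model.coef_[0])
print(dict(zip(["PAG", "PSA"], odds_ratios.round(2))))
```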
+
+ comment: Under review +
+
+
+
+
+ + ☆ Adv-Inpainting: Generating Natural and Transferable Adversarial Patch + via Attention-guided Feature Fusion + + +
+ The rudimentary adversarial attacks utilize additive noise to attack facial +recognition (FR) models. However, because manipulating the total face is +impractical in the physical setting, most real-world FR attacks are based on +adversarial patches, which limit perturbations to a small area. Previous +adversarial patch attacks often resulted in unnatural patterns and clear +boundaries that were easily noticeable. In this paper, we argue that generating +adversarial patches with plausible content can result in stronger +transferability than using additive noise or directly sampling from the latent +space. To generate natural-looking and highly transferable adversarial patches, +we propose an innovative two-stage coarse-to-fine attack framework called +Adv-Inpainting. In the first stage, we propose an attention-guided StyleGAN +(Att-StyleGAN) that adaptively combines texture and identity features based on +the attention map to generate high-transferable and natural adversarial +patches. In the second stage, we design a refinement network with a new +boundary variance loss to further improve the coherence between the patch and +its surrounding area. Experiment results demonstrate that Adv-Inpainting is +stealthy and can produce adversarial patches with stronger transferability and +improved visual quality than previous adversarial patch attacks. + +
+
+
+
+
+ + ☆ RLSAC: Reinforcement Learning enhanced Sample Consensus for End-to-End + Robust Estimation ICCV2023 + + +
+ Robust estimation is a crucial and still challenging task, which involves +estimating model parameters in noisy environments. Although conventional +sampling consensus-based algorithms sample several times to achieve robustness, +these algorithms cannot use data features and historical information +effectively. In this paper, we propose RLSAC, a novel Reinforcement Learning +enhanced SAmple Consensus framework for end-to-end robust estimation. RLSAC +employs a graph neural network to utilize both data and memory features to +guide exploring directions for sampling the next minimum set. The feedback of +downstream tasks serves as the reward for unsupervised training. Therefore, +RLSAC can avoid differentiating to learn the features and the feedback of +downstream tasks for end-to-end robust estimation. In addition, RLSAC +integrates a state transition module that encodes both data and memory +features. Our experimental results demonstrate that RLSAC can learn from +features to gradually explore a better hypothesis. Through analysis, it is +apparent that RLSAC can be easily transferred to other sampling consensus-based +robust estimation tasks. To the best of our knowledge, RLSAC is also the first +method that uses reinforcement learning to sample consensus for end-to-end +robust estimation. We release our codes at https://github.com/IRMVLab/RLSAC. + +
+
+ comment: Accepted by ICCV2023. Codes are released at + https://github.com/IRMVLab/RLSAC +
+
+
+
+
+ + ☆ Deep Semantic Graph Matching for Large-scale Outdoor Point Clouds + Registration + + +
+ The current point cloud registration methods are mainly based on geometric information and usually ignore the semantic information in the point clouds. In this paper, we treat the point cloud registration problem as a semantic instance matching and registration task, and propose a deep semantic graph matching method for large-scale outdoor point cloud registration. Firstly, the semantic category labels of 3D point clouds are obtained by utilizing a large-scale point cloud semantic segmentation network. The adjacent points with the same category labels are then clustered together by using the Euclidean clustering algorithm to obtain the semantic instances. Secondly, the semantic adjacency graph is constructed based on the spatial adjacency relation of semantic instances. Three kinds of high-dimensional features including geometric shape features, semantic categorical features and spatial distribution features are learned through a graph convolutional network, and enhanced based on an attention mechanism. Thirdly, the semantic instance matching problem is modeled as an optimal transport problem, and solved through an optimal matching layer. Finally, according to the matched semantic instances, the geometric transformation matrix between two point clouds is first obtained by the SVD algorithm and then refined by the ICP algorithm. The experiments are conducted on the KITTI Odometry dataset, and the average relative translation error and average relative rotation error of the proposed method are 6.6cm and 0.229{\deg} respectively.
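The SVD step in the final stage is the standard Kabsch solution for a rigid transform from matched correspondences; a compact sketch (with synthetic matched centroids, not KITTI data) is shown below.

```python
import numpy as np

def rigid_transform_svd(src: np.ndarray, dst: np.ndarray):
    """Least-squares rotation R and translation t with R @ src_i + t ~= dst_i (Kabsch/SVD).

    src, dst: (N, 3) matched points, e.g. centroids of matched semantic instances.
    """
    src_c, dst_c = src.mean(0), dst.mean(0)
    H = (src - src_c).T @ (dst - dst_c)        # cross-covariance of centered points
    U, _, Vt = np.linalg.svd(H)
    R = Vt.T @ U.T
    if np.linalg.det(R) < 0:                   # guard against reflections
        Vt[-1] *= -1
        R = Vt.T @ U.T
    t = dst_c - R @ src_c
    return R, t

# Sanity check with a known rotation about z and a translation.
theta = np.deg2rad(30)
R_true = np.array([[np.cos(theta), -np.sin(theta), 0],
                   [np.sin(theta),  np.cos(theta), 0],
                   [0, 0, 1]])
src = np.random.rand(20, 3)
dst = src @ R_true.T + np.array([0.5, -0.2, 1.0])
R, t = rigid_transform_svd(src, dst)
print(np.allclose(R, R_true, atol=1e-6))       # True: the transform is recovered exactly
```

In the pipeline described above, this closed-form estimate would then be refined by ICP on the full point clouds.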
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ DAOT: Domain-Agnostically Aligned Optimal Transport for Domain-Adaptive + Crowd Counting + + +
+ Domain adaptation is commonly employed in crowd counting to bridge the domain gaps between different datasets. However, existing domain adaptation methods tend to focus on inter-dataset differences while overlooking the intra-differences within the same dataset, leading to additional learning ambiguities. These domain-agnostic factors, e.g., density, surveillance perspective, and scale, can cause significant in-domain variations, and the misalignment of these factors across domains can lead to a drop in performance in cross-domain crowd counting. To address this issue, we propose a Domain-agnostically Aligned Optimal Transport (DAOT) strategy that aligns domain-agnostic factors between domains. The DAOT consists of three steps. First, individual-level differences in domain-agnostic factors are measured using structural similarity (SSIM). Second, the optimal transport (OT) strategy is employed to smooth out these differences and find the optimal domain-to-domain misalignment, with outlier individuals removed via a virtual "dustbin" column. Third, knowledge is transferred based on the aligned domain-agnostic factors, and the model is retrained for domain adaptation to bridge the gap across domains. We conduct extensive experiments on five standard crowd-counting benchmarks and demonstrate that the proposed method has strong generalizability across diverse datasets. Our code will be available at: https://github.com/HopooLinZ/DAOT/.
+
+ comment: 11 pages, 12 figures, 5 tables +
+
+
+
+
+ + ☆ From CNN to Transformer: A Review of Medical Image Segmentation Models + + +
+ Medical image segmentation is an important step in medical image analysis, +especially as a crucial prerequisite for efficient disease diagnosis and +treatment. The use of deep learning for image segmentation has become a +prevalent trend. The widely adopted approach currently is U-Net and its +variants. Additionally, with the remarkable success of pre-trained models in +natural language processing tasks, transformer-based models like TransUNet have +achieved desirable performance on multiple medical image segmentation datasets. +In this paper, we conduct a survey of the most representative four medical +image segmentation models in recent years. We theoretically analyze the +characteristics of these models and quantitatively evaluate their performance +on two benchmark datasets (i.e., Tuberculosis Chest X-rays and ovarian tumors). +Finally, we discuss the main challenges and future trends in medical image +segmentation. Our work can assist researchers in the related field to quickly +establish medical segmentation models tailored to specific regions. + +
+
+ comment: 18 pages, 8 figures +
+
+
+
+
+ + ☆ Multi-Visual-Inertial System: Analysis,Calibration and Estimation + + +
+ In this paper, we study state estimation of multi-visual-inertial systems (MVIS) and develop sensor fusion algorithms to optimally fuse an arbitrary number of asynchronous inertial measurement units (IMUs) or gyroscopes and global and(or) rolling shutter cameras. We are especially interested in the full calibration of the associated visual-inertial sensors, including the IMU or camera intrinsics and the IMU-IMU(or camera) spatiotemporal extrinsics as well as the image readout time of rolling-shutter cameras (if used). To this end, we develop a new analytic combined IMU integration with intrinsics, termed ACI3, to preintegrate IMU measurements, which is leveraged to fuse auxiliary IMUs and(or) gyroscopes alongside a base IMU. We model the multi-inertial measurements to include all the necessary inertial intrinsic and IMU-IMU spatiotemporal extrinsic parameters, while leveraging IMU-IMU rigid-body constraints to eliminate the necessity of auxiliary inertial poses and thus reducing computational complexity. By performing observability analysis of MVIS, we prove that the standard four unobservable directions remain, no matter how many inertial sensors are used, and also identify, for the first time, degenerate motions for IMU-IMU spatiotemporal extrinsics and auxiliary inertial intrinsics. In addition to the extensive simulations that validate our analysis and algorithms, we have built our own MVIS sensor rig and collected over 25 real-world datasets to experimentally verify the proposed calibration against state-of-the-art calibration methods such as Kalibr. We show that the proposed MVIS calibration is able to achieve competing accuracy with improved convergence and repeatability, which is open sourced to better benefit the community.
+
+
+
+
+ + ☆ Double-chain Constraints for 3D Human Pose Estimation in Images and + Videos + + +
+ Reconstructing 3D poses from 2D poses lacking depth information is particularly challenging due to the complexity and diversity of human motion. The key is to effectively model the spatial constraints between joints to leverage their inherent dependencies. Thus, we propose a novel model, called Double-chain Graph Convolutional Transformer (DC-GCT), to constrain the pose through a double-chain design consisting of local-to-global and global-to-local chains to obtain a complex representation more suitable for the current human pose. Specifically, we combine the advantages of GCN and Transformer and design a Local Constraint Module (LCM) based on GCN and a Global Constraint Module (GCM) based on the self-attention mechanism, as well as a Feature Interaction Module (FIM). The proposed method fully captures the multi-level dependencies between human body joints to optimize the modeling capability of the model. Moreover, we propose a method to incorporate temporal information into the single-frame model by guiding the video sequence embedding through the joint embedding of the target frame, with a negligible increase in computational cost. Experimental results demonstrate that DC-GCT achieves state-of-the-art performance on two challenging datasets (Human3.6M and MPI-INF-3DHP). Notably, our model achieves state-of-the-art performance on all action categories in the Human3.6M dataset using detected 2D poses from CPN, and our code is available at: https://github.com/KHB1698/DC-GCT.
+
+
+
+
+ + ☆ Informative Scene Graph Generation via Debiasing + + +
+ Scene graph generation aims to detect visual relationship triplets, (subject, +predicate, object). Due to biases in data, current models tend to predict +common predicates, e.g. "on" and "at", instead of informative ones, e.g. +"standing on" and "looking at". This tendency results in the loss of precise +information and overall performance. If a model only uses "stone on road" +rather than "stone blocking road" to describe an image, it may be a grave +misunderstanding. We argue that this phenomenon is caused by two imbalances: +semantic space level imbalance and training sample level imbalance. For this +problem, we propose DB-SGG, an effective framework based on debiasing but not +the conventional distribution fitting. It integrates two components: Semantic +Debiasing (SD) and Balanced Predicate Learning (BPL), for these imbalances. SD +utilizes a confusion matrix and a bipartite graph to construct predicate +relationships. BPL adopts a random undersampling strategy and an ambiguity +removing strategy to focus on informative predicates. Benefiting from the +model-agnostic process, our method can be easily applied to SGG models and +outperforms Transformer by 136.3%, 119.5%, and 122.6% on mR@20 at three SGG +sub-tasks on the SGG-VG dataset. Our method is further verified on another +complex SGG dataset (SGG-GQA) and two downstream tasks (sentence-to-graph +retrieval and image captioning). + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2108.13129 +
+
+
+
+
+ + ☆ Local-Global Information Interaction Debiasing for Dynamic Scene Graph + Generation + + +
+ The task of dynamic scene graph generation (DynSGG) aims to generate scene graphs for given videos, which involves modeling the spatial-temporal information in the video. However, due to the long-tailed distribution of samples in the dataset, previous DynSGG models fail to predict the tail predicates. We argue that this phenomenon arises because previous methods only pay attention to local spatial-temporal information and neglect consistency across multiple frames. To solve this problem, we propose a novel DynSGG model based on multi-task learning, DynSGG-MTL, which introduces local interaction information and global human-action interaction information. The interaction between objects and frame features allows the model to more fully understand the visual context of a single image. Long-term human actions supervise the model to generate multiple scene graphs that conform to the global constraints and prevent the model from failing to learn the tail predicates. Extensive experiments on the Action Genome dataset demonstrate the efficacy of our proposed framework, which not only improves dynamic scene graph generation but also alleviates the long-tail problem.
+
+
+
+
+ + ☆ TrainFors: A Large Benchmark Training Dataset for Image Manipulation + Detection and Localization + + +
+ The evaluation datasets and metrics for image manipulation detection and localization (IMDL) research have been standardized, but the training dataset for such a task is still nonstandard. Previous researchers have used unconventional and deviating datasets to train neural networks for detecting image forgeries and localizing pixel maps of manipulated regions. For a fair comparison, the training set, test set, and evaluation metrics should be consistent. Hence, comparing the existing methods may not be fair, as the results depend heavily on the training datasets as well as the model architecture. Moreover, none of the previous works release the synthetic training dataset used for the IMDL task. We propose a standardized benchmark training dataset for image splicing, copy-move forgery, removal forgery, and image enhancement forgery. Furthermore, we identify the problems with the existing IMDL datasets and propose the required modifications. We also train the state-of-the-art IMDL methods on our proposed TrainFors dataset for a fair evaluation and report the actual performance of these methods under similar conditions.
+
+
+
+
+ + ☆ Aphid Cluster Recognition and Detection in the Wild Using Deep Learning + Models + + +
+ Aphid infestation poses a significant threat to crop production, rural +communities, and global food security. While chemical pest control is crucial +for maximizing yields, applying chemicals across entire fields is both +environmentally unsustainable and costly. Hence, precise localization and +management of aphids are essential for targeted pesticide application. The +paper primarily focuses on using deep learning models for detecting aphid +clusters. We propose a novel approach for estimating infection levels by +detecting aphid clusters. To facilitate this research, we have captured a +large-scale dataset from sorghum fields, manually selected 5,447 images +containing aphids, and annotated each individual aphid cluster within these +images. To facilitate the use of machine learning models, we further process +the images by cropping them into patches, resulting in a labeled dataset +comprising 151,380 image patches. Then, we implemented and compared the +performance of four state-of-the-art object detection models (VFNet, GFLV2, +PAA, and ATSS) on the aphid dataset. Extensive experimental results show that +all models yield stable similar performance in terms of average precision and +recall. We then propose to merge close neighboring clusters and remove tiny +clusters caused by cropping, and the performance is further boosted by around +17%. The study demonstrates the feasibility of automatically detecting and +managing insects using machine learning models. The labeled dataset will be +made openly available to the research community. + +
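The post-processing step described above (merging nearby detections and discarding tiny crop artefacts) can be illustrated with a simple greedy routine; the distance and area thresholds below are arbitrary assumptions, and the paper's exact merging rule may differ.

```python
import numpy as np

def postprocess_boxes(boxes, min_area=400.0, merge_dist=40.0):
    """Merge detections whose centers are close, then drop tiny boxes.

    boxes: (N, 4) array of [x1, y1, x2, y2] aphid-cluster detections.
    """
    boxes = boxes.astype(float).tolist()
    merged = True
    while merged:                                    # greedy pairwise merging
        merged = False
        for i in range(len(boxes)):
            for j in range(i + 1, len(boxes)):
                ci = ((boxes[i][0] + boxes[i][2]) / 2, (boxes[i][1] + boxes[i][3]) / 2)
                cj = ((boxes[j][0] + boxes[j][2]) / 2, (boxes[j][1] + boxes[j][3]) / 2)
                if np.hypot(ci[0] - cj[0], ci[1] - cj[1]) < merge_dist:
                    boxes[i] = [min(boxes[i][0], boxes[j][0]), min(boxes[i][1], boxes[j][1]),
                                max(boxes[i][2], boxes[j][2]), max(boxes[i][3], boxes[j][3])]
                    boxes.pop(j)
                    merged = True
                    break
            if merged:
                break
    keep = [b for b in boxes if (b[2] - b[0]) * (b[3] - b[1]) >= min_area]
    return np.array(keep)

dets = np.array([[10, 10, 40, 40], [45, 12, 80, 50], [200, 200, 205, 206]])
print(postprocess_boxes(dets))   # the first two boxes merge; the tiny third box is removed
```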
+
+
+
+
+ + ☆ Vision Backbone Enhancement via Multi-Stage Cross-Scale Attention + + +
+ Convolutional neural networks (CNNs) and vision transformers (ViTs) have +achieved remarkable success in various vision tasks. However, many +architectures do not consider interactions between feature maps from different +stages and scales, which may limit their performance. In this work, we propose +a simple add-on attention module to overcome these limitations via multi-stage +and cross-scale interactions. Specifically, the proposed Multi-Stage +Cross-Scale Attention (MSCSA) module takes feature maps from different stages +to enable multi-stage interactions and achieves cross-scale interactions by +computing self-attention at different scales based on the multi-stage feature +maps. Our experiments on several downstream tasks show that MSCSA provides a +significant performance boost with modest additional FLOPs and runtime. + +
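+ A minimal PyTorch sketch of the general idea of attending jointly over tokens +pooled from two stages onto a common grid is shown below; the module, its +dimensions, and the pooling size are illustrative assumptions, not the authors' +MSCSA implementation.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class CrossScaleAttention(nn.Module):
+    """Toy add-on: joint self-attention over tokens pooled from two stages."""
+    def __init__(self, c_early, c_late, dim=128, heads=4, pool=8):
+        super().__init__()
+        self.pool = pool
+        self.proj_early = nn.Conv2d(c_early, dim, 1)
+        self.proj_late = nn.Conv2d(c_late, dim, 1)
+        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.out = nn.Conv2d(dim, c_late, 1)
+
+    def forward(self, feat_early, feat_late):
+        b, _, h, w = feat_late.shape
+        # Pool both stages to a common token grid so attention mixes scales.
+        t1 = F.adaptive_avg_pool2d(self.proj_early(feat_early), self.pool)
+        t2 = F.adaptive_avg_pool2d(self.proj_late(feat_late), self.pool)
+        tokens = torch.cat([t1.flatten(2), t2.flatten(2)], dim=2).transpose(1, 2)
+        mixed, _ = self.attn(tokens, tokens, tokens)    # (b, 2*pool*pool, dim)
+        # Keep the late-stage tokens, reshape, and add back as a residual.
+        late_tokens = mixed[:, self.pool * self.pool:, :].transpose(1, 2)
+        late_map = late_tokens.reshape(b, -1, self.pool, self.pool)
+        up = F.interpolate(late_map, size=(h, w), mode="bilinear", align_corners=False)
+        return feat_late + self.out(up)
+
+x_early = torch.randn(2, 256, 56, 56)
+x_late = torch.randn(2, 512, 14, 14)
+print(CrossScaleAttention(256, 512)(x_early, x_late).shape)  # [2, 512, 14, 14]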
+
+
+
+
+ + ☆ The Multi-modality Cell Segmentation Challenge: Towards Universal + Solutions NeurIPS22 + + +
+ Cell segmentation is a critical step for quantitative single-cell analysis in +microscopy images. Existing cell segmentation methods are often tailored to +specific modalities or require manual interventions to specify hyperparameters +in different experimental settings. Here, we present a multi-modality cell +segmentation benchmark, comprising over 1500 labeled images derived from more +than 50 diverse biological experiments. The top participants developed a +Transformer-based deep-learning algorithm that not only exceeds existing +methods, but can also be applied to diverse microscopy images across imaging +platforms and tissue types without manual parameter adjustments. This benchmark +and the improved algorithm offer promising avenues for more accurate and +versatile cell analysis in microscopy imaging. + +
+
+ comment: NeurIPS22 Cell Segmentation Challenge: + https://neurips22-cellseg.grand-challenge.org/ +
+
+
+
+
+ + ☆ Unleashing the Strengths of Unlabeled Data in Pan-cancer Abdominal Organ + Quantification: the FLARE22 Challenge MICCAI + + +
+ Quantitative organ assessment is an essential step in automated abdominal +disease diagnosis and treatment planning. Artificial intelligence (AI) has +shown great potential to automatize this process. However, most existing AI +algorithms rely on many expert annotations and lack a comprehensive evaluation +of accuracy and efficiency in real-world multinational settings. To overcome +these limitations, we organized the FLARE 2022 Challenge, the largest abdominal +organ analysis challenge to date, to benchmark fast, low-resource, accurate, +annotation-efficient, and generalized AI algorithms. We constructed an +intercontinental and multinational dataset from more than 50 medical groups, +including Computed Tomography (CT) scans with different races, diseases, +phases, and manufacturers. We independently validated that a set of AI +algorithms achieved a median Dice Similarity Coefficient (DSC) of 90.0\% by +using 50 labeled scans and 2000 unlabeled scans, which can significantly reduce +annotation requirements. The best-performing algorithms successfully +generalized to holdout external validation sets, achieving a median DSC of +89.5\%, 90.9\%, and 88.3\% on North American, European, and Asian cohorts, +respectively. They also enabled automatic extraction of key organ biology +features, which was labor-intensive with traditional manual measurements. This +opens the potential to use unlabeled data to boost performance and alleviate +annotation shortages for modern AI models. + +
+
+ comment: MICCAI FLARE22: https://flare22.grand-challenge.org/ +
+
+
+
+
+ + ☆ SegDA: Maximum Separable Segment Mask with Pseudo Labels for Domain + Adaptive Semantic Segmentation ICCV + + +
+ Unsupervised Domain Adaptation (UDA) aims to solve the problem of label +scarcity in the target domain by transferring knowledge from the label-rich +source domain. Usually, the source domain consists of synthetic images for +which annotations are easily obtained using well-known computer graphics +techniques. However, obtaining annotations for real-world images (the target +domain) requires a lot of manual effort and is very time-consuming because it +requires per-pixel annotation. To address this problem, we propose the SegDA +module to enhance the transfer performance of UDA methods by learning the +maximum separable segment representation. This resolves the problem of +identifying visually similar classes like pedestrian/rider, sidewalk/road, etc. +We leverage an Equiangular Tight Frame (ETF) classifier, inspired by Neural +Collapse, for maximal separation between segment classes. This causes the +source-domain pixel representations of each class to collapse to a single +vector, forming simplex vertices that are aligned to the maximally separable +ETF classifier. We use this phenomenon to propose a novel architecture for +domain adaptation of the segment representation for the target domain. +Additionally, we propose to estimate the noise in the labelling of the target +domain images and update the decoder for noise correction, which encourages the +discovery of pixels for classes not identified in pseudo labels. We have used +four UDA benchmarks simulating synthetic-to-real, daytime-to-nighttime, and +clear-to-adverse-weather scenarios. Our proposed approach outperforms prior +methods by +2.2 mIoU on GTA -> Cityscapes, +2.0 mIoU on Synthia -> Cityscapes, ++5.9 mIoU on Cityscapes -> DarkZurich, and +2.6 mIoU on Cityscapes -> ACDC. + +
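+ The fixed simplex Equiangular Tight Frame classifier that this approach builds +on can be constructed in closed form; the snippet below is a generic sketch +(class count and feature dimension are placeholders), whose class vectors have +unit norm and pairwise cosine similarity of exactly -1/(K-1).
+import torch
+
+def simplex_etf(num_classes: int, feat_dim: int) -> torch.Tensor:
+    """Return a (feat_dim, num_classes) simplex ETF: unit-norm class vectors
+    whose pairwise cosine similarity is exactly -1/(num_classes - 1)."""
+    assert feat_dim >= num_classes
+    # Orthonormal columns U (feat_dim x num_classes) via QR of a random matrix.
+    u, _ = torch.linalg.qr(torch.randn(feat_dim, num_classes))
+    k = num_classes
+    center = torch.eye(k) - torch.ones(k, k) / k
+    return (k / (k - 1)) ** 0.5 * (u @ center)
+
+W = simplex_etf(num_classes=19, feat_dim=256)   # e.g. 19 Cityscapes classes
+cos = W.T @ W
+print(round(cos[0, 1].item(), 4))               # ~ -1/18 = -0.0556
+# Pixel logits are then simply features @ W, with W kept frozen during training.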
+
+ comment: 11 pages, 4 Tables, 3 Figures, accepted at ICCVW 2023 (ICCV 2023: 4th + Workshop on Visual Perception for Navigation in Human Environments) +
+
+
+
+
+ + ☆ Seed Kernel Counting using Domain Randomization and Object Tracking + Neural Networks + + +
+ High-throughput phenotyping (HTP) of seeds, also known as seed phenotyping, +is the comprehensive assessment of complex seed traits such as growth, +development, tolerance, resistance, ecology, yield, and the measurement of +parameters that form more complex traits. One of the key aspects of seed +phenotyping is cereal yield estimation, which the seed production industry +relies upon to conduct its business. While mechanized seed kernel counters are +currently available in the market, they are often priced beyond the +affordability of small-scale seed production firms. The development of object +tracking neural network models such as You Only Look Once (YOLO) enables +computer scientists to design algorithms that can estimate cereal yield +inexpensively. The key bottleneck with neural network models is that they +require a plethora of labelled training data before they can be put to task. We +demonstrate that the use of synthetic imagery serves as a feasible substitute +to train neural networks for object tracking, which includes the tasks of +object classification and detection. Furthermore, we propose a seed kernel +counter that uses a low-cost mechanical hopper, a trained YOLOv8 neural network +model, and the StrongSORT and ByteTrack object tracking algorithms to estimate +cereal yield from videos. The experiment yields a seed kernel count with an +accuracy of 95.2\% and 93.2\% for Soy and Wheat, respectively, using the +StrongSORT algorithm, and an accuracy of 96.8\% and 92.4\% for Soy and Wheat, +respectively, using the ByteTrack algorithm. + +
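+ The counting step reduces to counting unique track identities produced by the +tracker; the sketch below is tracker-agnostic, and the per-frame detection +format is an assumption for illustration, not the paper's interface.
+from collections import defaultdict
+
+# Each frame yields detections as (track_id, class_name) pairs, e.g. from a
+# YOLOv8 + StrongSORT/ByteTrack pipeline (the detector/tracker API is not shown).
+frames = [
+    [(1, "soy"), (2, "soy")],
+    [(1, "soy"), (2, "soy"), (3, "soy")],
+    [(3, "soy"), (4, "wheat")],
+]
+
+def count_kernels(frames):
+    seen = defaultdict(set)              # class_name -> set of track ids
+    for detections in frames:
+        for track_id, cls in detections:
+            seen[cls].add(track_id)
+    return {cls: len(ids) for cls, ids in seen.items()}
+
+print(count_kernels(frames))             # {'soy': 3, 'wheat': 1}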
+
+
+
+
+ + ☆ Encode-Store-Retrieve: Enhancing Memory Augmentation through + Language-Encoded Egocentric Perception + + +
+ We depend on our own memory to encode, store, and retrieve our experiences. +However, memory lapses can occur. One promising avenue for achieving memory +augmentation is through the use of augmented reality head-mounted displays to +capture and preserve egocentric videos, a practice commonly referred to as life +logging. However, a significant challenge arises from the sheer volume of video +data generated through life logging, as the current technology lacks the +capability to encode and store such large amounts of data efficiently. Further, +retrieving specific information from extensive video archives requires +substantial computational power, further complicating the task of quickly +accessing desired content. To address these challenges, we propose a memory +augmentation system that involves leveraging natural language encoding for +video data and storing them in a vector database. This approach harnesses the +power of large vision language models to perform the language encoding process. +Additionally, we propose using large language models to facilitate natural +language querying. Our system underwent extensive evaluation using the QA-Ego4D +dataset and achieved state-of-the-art results with a BLEU score of 8.3, +outperforming conventional machine learning models that scored between 3.4 and +5.8. Additionally, in a user study, our system received a higher mean response +score of 4.13/5 compared to the human participants' score of 2.46/5 on +real-life episodic memory tasks. + +
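+ A toy sketch of the encode-store-retrieve loop follows: textual encodings of +egocentric clips are embedded, stored as vectors, and queried by cosine +similarity. The hash-based embedding function is only a stand-in for the +vision-language and text encoders used in the paper, so the ranking it produces +is not semantically meaningful; with a real text encoder the relevant memory +would rank first.
+import numpy as np
+
+def embed(text: str, dim: int = 64) -> np.ndarray:
+    """Placeholder embedding: a hash-seeded random unit vector.
+    In practice this would be a sentence/CLIP-style text encoder."""
+    rng = np.random.default_rng(abs(hash(text)) % (2**32))
+    v = rng.standard_normal(dim)
+    return v / np.linalg.norm(v)
+
+class MemoryStore:
+    def __init__(self):
+        self.texts, self.vecs = [], []
+
+    def add(self, caption: str):                  # "encode" + "store"
+        self.texts.append(caption)
+        self.vecs.append(embed(caption))
+
+    def query(self, question: str, k: int = 3):   # "retrieve"
+        q = embed(question)
+        sims = np.stack(self.vecs) @ q            # cosine similarity (unit norms)
+        top = np.argsort(-sims)[:k]
+        return [(self.texts[i], float(sims[i])) for i in top]
+
+store = MemoryStore()
+store.add("09:14 kitchen: put car keys on the counter next to the kettle")
+store.add("12:30 office: reading emails at the standing desk")
+print(store.query("where did I leave my keys?", k=1))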
+
+
+
+
+ + ☆ Recognizing Handwritten Mathematical Expressions of Vertical Addition + and Subtraction + + +
+ Handwritten Mathematical Expression Recognition (HMER) is a challenging task +with many educational applications. Recent methods for HMER have been developed +for complex mathematical expressions in standard horizontal format. However, +solutions for elementary mathematical expressions, such as vertical addition and +subtraction, have not been explored in the literature. This work proposes a new +handwritten elementary mathematical expression dataset composed of addition and +subtraction expressions in a vertical format. We also extended the MNIST +dataset to generate artificial images with this structure. Furthermore, we +propose a solution for offline HMER, able to recognize vertical addition and +subtraction expressions. Our analysis evaluated the object detection algorithms +YOLO v7, YOLO v8, YOLO-NAS, NanoDet and FCOS for identifying the mathematical +symbols. We also propose a transcription method to map the bounding boxes from +the object detection stage to a mathematical expression in LaTeX markup. +Results show that our approach is efficient, achieving a high expression +recognition rate. The code and dataset are available at +https://github.com/Danielgol/HME-VAS + +
+
+ comment: Paper accepted at SIBGRAPI 2023 +
+
+
+
+
+ + ☆ Absorption-Based, Passive Range Imaging from Hyperspectral Thermal + Measurements + + +
+ Passive hyperspectral long-wave infrared measurements are remarkably +informative about the surroundings, such as remote object material composition, +temperature, and range; and air temperature and gas concentrations. Remote +object material and temperature determine the spectrum of thermal radiance, and +range, air temperature, and gas concentrations determine how this spectrum is +modified by propagation to the sensor. We computationally separate these +phenomena, introducing a novel passive range imaging method based on +atmospheric absorption of ambient thermal radiance. Previously demonstrated +passive absorption-based ranging methods assume hot and highly emitting +objects. However, the temperature variation in natural scenes is usually low, +making range imaging challenging. Our method benefits from explicit +consideration of air emission and parametric modeling of atmospheric +absorption. To mitigate noise in low-contrast scenarios, we jointly estimate +range and intrinsic object properties by exploiting a variety of absorption +lines spread over the infrared spectrum. Along with Monte Carlo simulations +that demonstrate the importance of regularization, temperature differentials, +and availability of many spectral bands, we apply this method to long-wave +infrared (8--13 $\mu$m) hyperspectral image data acquired from natural scenes +with no active illumination. Range features from 15m to 150m are recovered, +with good qualitative match to unaligned lidar data. + +
+
+ comment: 15 pages, 14 figures +
+
+
+
+
+ + ☆ Spintronics for image recognition : performance benchmarking via + ultrafast data-driven simulations + + +
+ We present a demonstration of image classification using a hardware-based +echo-state network (ESN) that relies on spintronic nanostructures known as +vortex-based spin-torque oscillators (STVOs). Our network is realized using a +single STVO multiplexed in time. To circumvent the challenges associated with +repeated experimental manipulation of such a nanostructured system, we employ +an ultrafast data-driven simulation framework called the data-driven Thiele +equation approach (DD-TEA) to simulate the STVO dynamics. We use this approach +to efficiently develop, optimize and test an STVO-based ESN for image +classification using the MNIST dataset. We showcase the versatility of our +solution by successfully applying it to solve classification challenges with +the EMNIST-letters and Fashion MNIST datasets. Through our simulations, we +determine that within a large ESN the results obtained using the STVO dynamics +as an activation function are comparable to the ones obtained with other +conventional nonlinear activation functions like the reLU and the sigmoid. +While achieving state-of-the-art accuracy levels on the MNIST dataset, our +model's performance on EMNIST-letters and Fashion MNIST is lower due to the +relative simplicity of the system architecture and the increased complexity of +the tasks. We expect that the DD-TEA framework will enable the exploration of +more specialized neural architectures, ultimately leading to improved +classification accuracy. This approach also holds promise for investigating and +developing dedicated learning rules to further enhance classification +performance. + +
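+ The reservoir-computing setup being simulated can be illustrated with a +generic echo-state network in which the nonlinearity is pluggable; here tanh +stands in for the simulated STVO response, and all sizes and hyperparameters +are arbitrary placeholders.
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+class ESN:
+    def __init__(self, n_in, n_res=500, spectral_radius=0.9, activation=np.tanh):
+        self.act = activation                     # swap in any nonlinearity here
+        self.w_in = rng.uniform(-0.5, 0.5, (n_res, n_in))
+        w = rng.standard_normal((n_res, n_res))
+        w *= spectral_radius / max(abs(np.linalg.eigvals(w)))   # echo-state scaling
+        self.w = w
+        self.w_out = None
+
+    def _states(self, X):
+        s = np.zeros(self.w.shape[0])
+        states = []
+        for u in X:                                # one reservoir update per sample
+            s = self.act(self.w_in @ u + self.w @ s)
+            states.append(s.copy())
+        return np.array(states)
+
+    def fit(self, X, Y, ridge=1e-4):               # ridge-regression readout
+        S = self._states(X)
+        self.w_out = np.linalg.solve(S.T @ S + ridge * np.eye(S.shape[1]), S.T @ Y)
+
+    def predict(self, X):
+        return self._states(X) @ self.w_out
+
+# Tiny smoke test on random data (10 classes, one-hot targets).
+X = rng.standard_normal((200, 64))
+Y = np.eye(10)[rng.integers(0, 10, 200)]
+esn = ESN(n_in=64)
+esn.fit(X, Y)
+print(esn.predict(X[:5]).argmax(axis=1))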
+
+
+
+
+ + ☆ Temporally-Adaptive Models for Efficient Video Understanding + + +
+ Spatial convolutions are extensively used in numerous deep video models. They +fundamentally assume spatio-temporal invariance, i.e., using shared weights +for every location in different frames. This work presents Temporally-Adaptive +Convolutions (TAdaConv) for video understanding, which shows that adaptive +weight calibration along the temporal dimension is an efficient way to +facilitate modeling complex temporal dynamics in videos. Specifically, TAdaConv +empowers spatial convolutions with temporal modeling abilities by calibrating +the convolution weights for each frame according to its local and global +temporal context. Compared to existing operations for temporal modeling, +TAdaConv is more efficient as it operates over the convolution kernels instead +of the features, whose dimension is an order of magnitude smaller than the +spatial resolutions. Further, kernel calibration brings an increased model +capacity. Based on this plug-and-play operation, TAdaConv, and its extension, +TAdaConvV2, we construct TAdaBlocks to empower ConvNeXt and Vision Transformer +to have strong temporal modeling capabilities. Empirical results show that +TAdaConvNeXtV2 and TAdaFormer perform competitively against state-of-the-art +convolutional and Transformer-based models in various video understanding +benchmarks. Our codes and models are released at: +https://github.com/alibaba-mmai-research/TAdaConv. + +
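+ A simplified sketch of calibrating a shared convolution kernel per frame from +temporal context is given below; this is an illustrative re-implementation of +the idea, not the released TAdaConv code.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class TemporallyAdaptiveConv2d(nn.Module):
+    """Per-frame, channel-wise calibration of a shared 2D kernel."""
+    def __init__(self, c_in, c_out, k=3):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(c_out, c_in, k, k) * 0.01)
+        self.bias = nn.Parameter(torch.zeros(c_out))
+        # Calibration head: temporal 1D conv over per-frame descriptors.
+        self.calib = nn.Sequential(
+            nn.Conv1d(c_in, c_in // 4, 3, padding=1),
+            nn.ReLU(),
+            nn.Conv1d(c_in // 4, c_out, 3, padding=1),
+        )
+        self.k = k
+
+    def forward(self, x):                          # x: (B, C_in, T, H, W)
+        b, c, t, h, w = x.shape
+        desc = x.mean(dim=(3, 4))                  # (B, C_in, T) frame descriptors
+        alpha = 1.0 + torch.tanh(self.calib(desc)) # (B, C_out, T), around 1
+        outs = []
+        for ti in range(t):
+            # Scale the shared kernel per batch element and per frame.
+            w_t = self.weight.unsqueeze(0) * alpha[:, :, ti, None, None, None]
+            w_t = w_t.reshape(b * w_t.shape[1], c, self.k, self.k)
+            frame = x[:, :, ti].reshape(1, b * c, h, w)
+            y = F.conv2d(frame, w_t, padding=self.k // 2, groups=b)
+            outs.append(y.reshape(b, -1, h, w))
+        out = torch.stack(outs, dim=2)             # (B, C_out, T, H, W)
+        return out + self.bias[None, :, None, None, None]
+
+x = torch.randn(2, 16, 8, 32, 32)
+print(TemporallyAdaptiveConv2d(16, 32)(x).shape)   # [2, 32, 8, 32, 32]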
+
+ comment: arXiv admin note: text overlap with arXiv:2110.06178 +
+
+
+
+
+ + ☆ Leverage Weakly Annotation to Pixel-wise Annotation via Zero-shot + Segment Anything Model for Molecular-empowered Learning + + +
+ Precise identification of multiple cell classes in high-resolution Giga-pixel +whole slide imaging (WSI) is critical for various clinical scenarios. Building +an AI model for this purpose typically requires pixel-level annotations, which +are often unscalable and must be done by skilled domain experts (e.g., +pathologists). However, these annotations can be prone to errors, especially +when distinguishing between intricate cell types (e.g., podocytes and mesangial +cells) using only visual inspection. Interestingly, a recent study showed that +lay annotators, when using extra immunofluorescence (IF) images for reference +(referred to as molecular-empowered learning), can sometimes outperform domain +experts in labeling. Despite this, the resource-intensive task of manual +delineation remains a necessity during the annotation process. In this paper, +we explore the potential of bypassing pixel-level delineation by employing the +recent segment anything model (SAM) on weak box annotation in a zero-shot +learning approach. Specifically, we harness SAM's ability to produce +pixel-level annotations from box annotations and utilize these SAM-generated +labels to train a segmentation model. Our findings show that the proposed +SAM-assisted molecular-empowered learning (SAM-L) can diminish the labeling +efforts for lay annotators by only requiring weak box annotations. This is +achieved without compromising annotation accuracy or the performance of the +deep learning-based segmentation. This research represents a significant +advancement in democratizing the annotation process for training pathological +image segmentation, relying solely on non-expert annotators. + +
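+ Turning a weak box annotation into a pixel-level pseudo label with SAM can be +sketched as follows; the checkpoint path and the all-zero stand-in image are +placeholders, and this shows only the box-to-mask step, not the full SAM-L +training pipeline.
+import numpy as np
+from segment_anything import sam_model_registry, SamPredictor  # Meta's SAM package
+
+sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
+predictor = SamPredictor(sam)
+
+image = np.zeros((512, 512, 3), dtype=np.uint8)     # stand-in for a WSI patch (RGB)
+predictor.set_image(image)
+
+box = np.array([120, 80, 260, 210])                  # lay annotator's box: x0, y0, x1, y1
+masks, scores, _ = predictor.predict(box=box, multimask_output=False)
+pseudo_label = masks[0]                               # boolean HxW pixel-level label
+print(pseudo_label.shape, float(scores[0]))
+# pseudo_label can now serve as a training target for a segmentation model.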
+
+
+
+
+ + ☆ High-performance Data Management for Whole Slide Image Analysis in + Digital Pathology + + +
+ When dealing with giga-pixel digital pathology in whole-slide imaging, a +notable proportion of data records holds relevance during each analysis +operation. For instance, when deploying an image analysis algorithm on +whole-slide images (WSI), the computational bottleneck often lies in the +input-output (I/O) system. This is particularly notable as patch-level +processing introduces a considerable I/O load onto the computer system. +However, this data management process can potentially be further parallelized, +given the typical independence of patch-level image processes across different +patches. This paper details our endeavors in tackling this data access +challenge through the implementation of the Adaptable IO System version 2 +(ADIOS2). Our focus has been on constructing and releasing a digital +pathology-centric pipeline using ADIOS2, which facilitates streamlined data +management across WSIs. Additionally, we've developed strategies aimed at +curtailing data retrieval times. The performance evaluation encompasses two key +scenarios: (1) a pure CPU-based image analysis scenario (termed the "CPU +scenario"), and (2) a GPU-based deep learning framework scenario (referred to +as the "GPU scenario"). Our findings reveal noteworthy outcomes. Under the CPU +scenario, ADIOS2 showcases an impressive two-fold speed-up in comparison to the +brute-force approach. In the GPU scenario, its performance stands on par with +the cutting-edge GPU I/O acceleration framework, NVIDIA Magnum IO GPU Direct +Storage (GDS). To the best of our knowledge, this is one of the first uses of +ADIOS2 in the field of digital pathology. The source code has been made +publicly available at https://github.com/hrlblab/adios. + +
+
+
+
+
+ + ☆ Multi-scale Multi-site Renal Microvascular Structures Segmentation for + Whole Slide Imaging in Renal Pathology + + +
+ Segmentation of microvascular structures, such as arterioles, venules, and +capillaries, from human kidney whole slide images (WSI) has become a focal +point in renal pathology. Current manual segmentation techniques are +time-consuming and not feasible for large-scale digital pathology images. While +deep learning-based methods offer a solution for automatic segmentation, most +suffer from a limitation: they are designed for and restricted to training on +single-site, single-scale data. In this paper, we present Omni-Seg, a novel +single dynamic network method that capitalizes on multi-site, multi-scale +training data. Unique to our approach, we utilize partially labeled images, +where only one tissue type is labeled per training image, to segment +microvascular structures. We train a singular deep network using images from +two datasets, HuBMAP and NEPTUNE, across different magnifications (40x, 20x, +10x, and 5x). Experimental results indicate that Omni-Seg outperforms competing +methods in terms of both the Dice Similarity Coefficient (DSC) and Intersection +over Union (IoU). Our proposed method provides renal pathologists with a +powerful computational tool for the quantitative analysis of renal +microvascular structures. + +
+
+
+
+
+ + ♻ ☆ InFusion: Inject and Attention Fusion for Multi Concept Zero-Shot + Text-based Video Editing ICCV + + +
+ Large text-to-image diffusion models have achieved remarkable success in +generating diverse, high-quality images. Additionally, these models have been +successfully leveraged to edit input images by just changing the text prompt. +But when these models are applied to videos, the main challenge is to ensure +temporal consistency and coherence across frames. In this paper, we propose +InFusion, a framework for zero-shot text-based video editing leveraging large +pre-trained image diffusion models. Our framework specifically supports editing +of multiple concepts with pixel-level control over diverse concepts mentioned +in the editing prompt. Specifically, we inject the difference in features +obtained with source and edit prompts from U-Net residual blocks of decoder +layers. When these are combined with injected attention features, it becomes +feasible to query the source contents and scale edited concepts along with the +injection of unedited parts. The editing is further controlled in a +fine-grained manner with mask extraction and attention fusion, which cut the +edited part from the source and paste it into the denoising pipeline for the +editing prompt. Our framework is a low-cost alternative to one-shot tuned +models for editing since it does not require training. We demonstrated complex +concept editing with a generalised image model (Stable Diffusion v1.5) using +LoRA. The adaptation is compatible with all the existing image diffusion +techniques. Extensive experimental results demonstrate the effectiveness of our +framework in rendering high-quality and temporally consistent videos. + +
+
+ comment: 10 pages, 8 figures, 1 Table, accepted at ICCVW 2023 (ICCV 2023 + Workshop on AI for Creative Video Editing and Understanding) +
+
+
+
+
+ + ♻ ☆ Diffusion Denoised Smoothing for Certified and Adversarial Robust + Out-Of-Distribution Detection + + +
+ As the use of machine learning continues to expand, the importance of +ensuring its safety cannot be overstated. A key concern in this regard is the +ability to identify whether a given sample is from the training distribution, +or is an "Out-Of-Distribution" (OOD) sample. In addition, adversaries can +manipulate OOD samples in ways that lead a classifier to make a confident +prediction. In this study, we present a novel approach for certifying the +robustness of OOD detection within a $\ell_2$-norm around the input, regardless +of network architecture and without the need for specific components or +additional training. Further, we improve current techniques for detecting +adversarial attacks on OOD samples, while providing high levels of certified +and adversarial robustness on in-distribution samples. The average of all OOD +detection metrics on CIFAR10/100 shows an increase of $\sim 13 \% / 5\%$ +relative to previous approaches. + +
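+ The underlying randomized-smoothing idea, averaging a detection score over +Gaussian perturbations of the input so the decision is stable within a small +l2 ball, can be sketched generically; the toy max-softmax detector and the +noise level below are illustrative assumptions, not the paper's construction.
+import torch
+
+def max_softmax_score(logits):
+    """Toy OOD score: higher = more in-distribution (MSP baseline)."""
+    return torch.softmax(logits, dim=-1).max(dim=-1).values
+
+@torch.no_grad()
+def smoothed_ood_score(model, x, sigma=0.25, n=64):
+    """Average the score over n Gaussian perturbations of scale sigma."""
+    noisy = x.unsqueeze(0) + sigma * torch.randn(n, *x.shape)
+    return max_softmax_score(model(noisy)).mean().item()
+
+model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
+x = torch.randn(3, 32, 32)
+print("smoothed in-distribution score:", round(smoothed_ood_score(model, x), 3))
+# Thresholding this smoothed score yields a detector whose decision changes far
+# less under small l2 perturbations of x than the unsmoothed score would.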
+
+
+
+
+ + ♻ ☆ STHG: Spatial-Temporal Heterogeneous Graph Learning for Advanced + Audio-Visual Diarization CVPR 2023 + + +
+ This report introduces our novel method named STHG for the Audio-Visual +Diarization task of the Ego4D Challenge 2023. Our key innovation is that we +model all the speakers in a video using a single, unified heterogeneous graph +learning framework. Unlike previous approaches that require a separate +component solely for the camera wearer, STHG can jointly detect the speech +activities of all people including the camera wearer. Our final method obtains +61.1% DER on the test set of Ego4D, which significantly outperforms all the +baselines as well as last year's winner. Our submission achieved 1st place in +the Ego4D Challenge 2023. We additionally demonstrate that applying the +off-the-shelf speech recognition system to the diarized speech segments by STHG +produces a competitive performance on the Speech Transcription task of this +challenge. + +
+
+ comment: Validation report for the Ego4D challenge at CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Intel Labs at Ego4D Challenge 2022: A Better Baseline for Audio-Visual + Diarization ECCV 2022 + + +
+ This report describes our approach for the Audio-Visual Diarization (AVD) +task of the Ego4D Challenge 2022. Specifically, we present multiple technical +improvements over the official baselines. First, we improve the detection +performance of the camera wearer's voice activity by modifying the training +scheme of its model. Second, we discover that an off-the-shelf voice activity +detection model can effectively remove false positives when it is applied +solely to the camera wearer's voice activities. Lastly, we show that better +active speaker detection leads to a better AVD outcome. Our final method +obtains 65.9% DER on the test set of Ego4D, which significantly outperforms all +the baselines. Our submission achieved 1st place in the Ego4D Challenge 2022. + +
+
+ comment: Validation report for the Ego4D challenge at ECCV 2022 +
+
+
+
+
+ + ♻ ☆ Scaling may be all you need for achieving human-level object recognition + capacity with human-like visual experience + + +
+ This paper asks whether current self-supervised learning methods, if +sufficiently scaled up, would be able to reach human-level visual object +recognition capabilities with the same type and amount of visual experience +humans learn from. Previous work on this question only considered the scaling +of data size. Here, we consider the simultaneous scaling of data size, model +size, and image resolution. We perform a scaling experiment with vision +transformers up to 633M parameters in size (ViT-H/14) trained with up to 5K +hours of human-like video data (long, continuous, mostly egocentric videos) +with image resolutions of up to 476x476 pixels. The efficiency of masked +autoencoders (MAEs) as a self-supervised learning algorithm makes it possible +to run this scaling experiment on an unassuming academic budget. We find that +it is feasible to reach human-level object recognition capacity at sub-human +scales of model size, data size, and image size, if these factors are scaled up +simultaneously. To give a concrete example, we estimate that a 2.5B parameter +ViT model trained with 20K hours (2.3 years) of human-like video data with a +spatial resolution of 952x952 pixels should be able to reach roughly +human-level accuracy on ImageNet. Human-level competence is thus achievable for +a fundamental perceptual capability from human-like perceptual experience +(human-like in both amount and type) with extremely generic learning algorithms +and architectures and without any substantive inductive biases. + +
+
+ comment: v2 adds an Appendix containing results with alternative scaling + functions; code & models available from + https://github.com/eminorhan/humanlike-vits +
+
+
+
+
+ + ♻ ☆ Open-vocabulary Object Segmentation with Diffusion Models + + +
+ The goal of this paper is to extract the visual-language correspondence from +a pre-trained text-to-image diffusion model, in the form of segmentation map, +i.e., simultaneously generating images and segmentation masks for the +corresponding visual entities described in the text prompt. We make the +following contributions: (i) we pair the existing Stable Diffusion model with a +novel grounding module, that can be trained to align the visual and textual +embedding space of the diffusion model with only a small number of object +categories; (ii) we establish an automatic pipeline for constructing a dataset, +that consists of {image, segmentation mask, text prompt} triplets, to train the +proposed grounding module; (iii) we evaluate the performance of open-vocabulary +grounding on images generated from the text-to-image diffusion model and show +that the module can well segment the objects of categories beyond seen ones at +training time; (iv) we adopt the augmented diffusion model to build a synthetic +semantic segmentation dataset, and show that, training a standard segmentation +model on such dataset demonstrates competitive performance on the zero-shot +segmentation(ZS3) benchmark, which opens up new opportunities for adopting the +powerful diffusion model for discriminative tasks. + +
+
+
+
+
+ + ♻ ☆ GUNNEL: Guided Mixup Augmentation and Multi-View Fusion for Aquatic + Animal Segmentation + + +
+ Recent years have witnessed great advances in object segmentation research. +In addition to generic objects, aquatic animals have attracted research +attention. Deep learning-based methods are widely used for aquatic animal +segmentation and have achieved promising performance. However, there is a lack +of challenging datasets for benchmarking. In this work, we build a new dataset +dubbed Aquatic Animal Species. We also devise a novel GUided mixup augmeNtatioN +and multi-modEl fusion for aquatic animaL segmentation (GUNNEL) that leverages +the advantages of multiple segmentation models to effectively segment aquatic +animals and improves the training performance by synthesizing hard samples. +Extensive experiments demonstrated the superiority of our proposed framework +over existing state-of-the-art instance segmentation methods. The code is +available at https://github.com/lmquan2000/mask-mixup. The dataset is available +at https://doi.org/10.5281/zenodo.8208877 . + +
+
+ comment: The code is available at https://github.com/lmquan2000/mask-mixup . + The dataset is available at https://doi.org/10.5281/zenodo.8208877 +
+
+
+
+
+ + ♻ ☆ VT-CLIP: Enhancing Vision-Language Models with Visual-guided Texts + + +
+ Contrastive Language-Image Pre-training (CLIP) has drawn increasing attention +recently for its transferable visual representation learning. However, due to +the semantic gap within datasets, CLIP's pre-trained image-text alignment +becomes sub-optimal on downstream tasks, which severely harms its transferring +performance. To better adapt the cross-modality embedding space, we propose to +enhance CLIP via Visual-guided Texts, named VT-CLIP. Specifically, we guide +textual features of different categories to adaptively explore informative +regions on the image and aggregate visual features by attention mechanisms. In +this way, the texts become visual-guided, namely, more semantically correlated +with downstream images, which greatly benefits the category-wise matching +process. In few-shot settings, we evaluate our VT-CLIP on 11 well-known +classification datasets to demonstrate its effectiveness. + +
+
+
+
+
+ + ♻ ☆ Deep learning-based Crop Row Detection for Infield Navigation of + Agri-Robots + + +
+ Autonomous navigation in agricultural environments is challenged by varying +field conditions that arise in arable fields. State-of-the-art solutions for +autonomous navigation in such environments require expensive hardware such as +RTK-GNSS. This paper presents a robust crop row detection algorithm that +withstands such field variations using inexpensive cameras. Existing datasets +for crop row detection do not represent all the possible field variations. A +dataset of sugar beet images was created representing 11 field variations +comprising multiple growth stages, light levels, varying weed densities, curved +crop rows and discontinuous crop rows. The proposed pipeline segments the crop +rows using a deep learning-based method and employs the predicted segmentation +mask for extraction of the central crop row using a novel central crop row +selection algorithm. The novel crop row detection algorithm was tested for crop +row detection performance and the capability of visual servoing along a crop +row. The visual servoing-based navigation was tested in a realistic simulation +scenario with real ground and plant textures. Our algorithm demonstrated +robust vision-based crop row detection in challenging field conditions, +outperforming the baseline. + +
+
+ comment: Published in Journal of Field Robotics: + https://onlinelibrary.wiley.com/doi/epdf/10.1002/rob.22238 +
+
+
+
+
+ + ♻ ☆ Deep Multiview Clustering by Contrasting Cluster Assignments + + +
+ Multiview clustering (MVC) aims to reveal the underlying structure of +multiview data by categorizing data samples into clusters. Deep learning-based +methods exhibit strong feature learning capabilities on large-scale datasets. +For most existing deep MVC methods, exploring the invariant representations of +multiple views is still an intractable problem. In this paper, we propose a +cross-view contrastive learning (CVCL) method that learns view-invariant +representations and produces clustering results by contrasting the cluster +assignments among multiple views. Specifically, we first employ deep +autoencoders to extract view-dependent features in the pretraining stage. Then, +a cluster-level CVCL strategy is presented to explore consistent semantic label +information among the multiple views in the fine-tuning stage. Thus, the +proposed CVCL method is able to produce more discriminative cluster assignments +by virtue of this learning strategy. Moreover, we provide a theoretical +analysis of soft cluster assignment alignment. Extensive experimental results +obtained on several datasets demonstrate that the proposed CVCL method +outperforms several state-of-the-art approaches. + +
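+ The cluster-level contrastive step can be sketched as follows: the columns of +the soft assignment matrices from two views act as cluster representations, and +matching clusters across views form the positive pairs. This is a generic +InfoNCE-style formulation, not necessarily the authors' exact loss.
+import torch
+import torch.nn.functional as F
+
+def cluster_contrastive_loss(p1, p2, temperature=0.5):
+    """p1, p2: (N, K) soft cluster assignments of the same N samples in two views."""
+    n, k = p1.shape
+    # Each cluster is represented by its assignment column, l2-normalized.
+    c1 = F.normalize(p1.t(), dim=1)          # (K, N)
+    c2 = F.normalize(p2.t(), dim=1)          # (K, N)
+    reps = torch.cat([c1, c2], dim=0)        # (2K, N)
+    sim = reps @ reps.t() / temperature      # (2K, 2K) cluster-to-cluster similarity
+    sim.fill_diagonal_(float("-inf"))        # a cluster is not its own positive
+    # Positive pair: cluster j in view 1 <-> cluster j in view 2.
+    targets = torch.cat([torch.arange(k) + k, torch.arange(k)])
+    return F.cross_entropy(sim, targets)
+
+p1 = torch.softmax(torch.randn(256, 10), dim=1)
+p2 = torch.softmax(torch.randn(256, 10), dim=1)
+print(cluster_contrastive_loss(p1, p2).item())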
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Symmetry Defense Against CNN Adversarial Perturbation Attacks + + +
+ This paper uses symmetry to make Convolutional Neural Network classifiers +(CNNs) robust against adversarial perturbation attacks. Such attacks add +perturbation to original images to generate adversarial images that fool +classifiers such as road sign classifiers of autonomous vehicles. Although +symmetry is a pervasive aspect of the natural world, CNNs are unable to handle +symmetry well. For example, a CNN can classify an image differently from its +mirror image. For an adversarial image that misclassifies with a wrong label +$l_w$, CNN inability to handle symmetry means that a symmetric adversarial +image can classify differently from the wrong label $l_w$. Further than that, +we find that the classification of a symmetric adversarial image reverts to the +correct label. To classify an image when adversaries are unaware of the +defense, we apply symmetry to the image and use the classification label of the +symmetric image. To classify an image when adversaries are aware of the +defense, we use mirror symmetry and pixel inversion symmetry to form a symmetry +group. We apply all the group symmetries to the image and decide on the output +label based on the agreement of any two of the classification labels of the +symmetry images. Adaptive attacks fail because they need to rely on loss +functions that use conflicting CNN output values for symmetric images. Without +attack knowledge, the proposed symmetry defense succeeds against both +gradient-based and random-search attacks, with up to near-default accuracies +for ImageNet. The defense even improves the classification accuracy of original +images. + +
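+ The agreement rule described above is straightforward to sketch: classify a +few symmetric versions of the input and accept a label only when two of them +agree. The transform group below (horizontal flip and pixel inversion) follows +the text; the toy model and the abstain behaviour are illustrative.
+import torch
+
+def symmetry_defense_predict(model, x):
+    """x: (C, H, W) image in [0, 1]. Returns the agreed label or None (abstain)."""
+    variants = {
+        "flip": torch.flip(x, dims=[-1]),               # mirror symmetry
+        "invert": 1.0 - x,                               # pixel inversion symmetry
+        "flip+invert": 1.0 - torch.flip(x, dims=[-1]),
+    }
+    labels = []
+    with torch.no_grad():
+        for v in variants.values():
+            labels.append(model(v.unsqueeze(0)).argmax(dim=-1).item())
+    for i in range(len(labels)):
+        for j in range(i + 1, len(labels)):
+            if labels[i] == labels[j]:
+                return labels[i]                         # any two labels agree
+    return None                                          # no agreement: abstain
+
+model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
+print(symmetry_defense_predict(model, torch.rand(3, 32, 32)))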
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Decoupled Diffusion Models with Explicit Transition Probability + + +
+ Recent diffusion probabilistic models (DPMs) have shown a remarkable ability +to generate content; however, they often suffer from complex forward +processes, resulting in inefficient solutions for the reverse process and +prolonged sampling times. In this paper, we aim to address the aforementioned +challenges by focusing on the diffusion process itself: we propose to +decouple the intricate diffusion process into two comparatively simpler +processes to improve the generative efficacy and speed. In particular, we +present a novel diffusion paradigm named DDM (Decoupled Diffusion Models) based +on the Ito diffusion process, in which the image distribution is approximated +by an explicit transition probability while the noise path is controlled by the +standard Wiener process. We find that decoupling the diffusion process reduces +the learning difficulty and the explicit transition probability improves the +generative speed significantly. We prove a new training objective for DPMs, +which enables the model to learn to predict the noise and image components +separately. Moreover, given the novel forward diffusion equation, we derive the +reverse denoising formula of DDM that naturally supports fewer steps of +generation without ordinary differential equation (ODE) based accelerators. Our +experiments demonstrate that DDM outperforms previous DPMs by a large margin in +the few-function-evaluations setting and achieves comparable performance in the +long-function-evaluations setting. We also show that our framework can be +applied to image-conditioned generation and high-resolution image synthesis, +and that it can generate high-quality images with only 10 function evaluations. + +
+
+
+
+
+ + ♻ ☆ BoxDiff: Text-to-Image Synthesis with Training-Free Box-Constrained + Diffusion ICCV 2023 + + +
+ Recent text-to-image diffusion models have demonstrated an astonishing +capacity to generate high-quality images. However, researchers mainly studied +the way of synthesizing images with only text prompts. While some works have +explored using other modalities as conditions, considerable paired data, e.g., +box/mask-image pairs, and fine-tuning time are required for nurturing models. +As such paired data is time-consuming and labor-intensive to acquire and +restricted to a closed set, this potentially becomes the bottleneck for +applications in an open world. This paper focuses on the simplest form of +user-provided conditions, e.g., box or scribble. To mitigate the aforementioned +problem, we propose a training-free method to control objects and contexts in +the synthesized images adhering to the given spatial conditions. Specifically, +three spatial constraints, i.e., Inner-Box, Outer-Box, and Corner Constraints, +are designed and seamlessly integrated into the denoising step of diffusion +models, requiring no additional training and massive annotated layout data. +Extensive results show that the proposed constraints can control what and where +to present in the images while retaining the ability of the Stable Diffusion +model to synthesize with high fidelity and diverse concept coverage. The code +is publicly available at https://github.com/Sierkinhane/BoxDiff. + +
+
+ comment: Accepted by ICCV 2023. Code is available at: + https://github.com/Sierkinhane/BoxDiff +
+
+
+
+
+ + ♻ ☆ InstantAvatar: Efficient 3D Head Reconstruction via Surface Rendering + + +
+ Recent advances in full-head reconstruction have been obtained by optimizing +a neural field through differentiable surface or volume rendering to represent +a single scene. While these techniques achieve an unprecedented accuracy, they +take several minutes, or even hours, due to the expensive optimization process +required. In this work, we introduce InstantAvatar, a method that recovers +full-head avatars from few images (down to just one) in a few seconds on +commodity hardware. In order to speed up the reconstruction process, we propose +a system that combines, for the first time, a voxel-grid neural field +representation with a surface renderer. Notably, a naive combination of these +two techniques leads to unstable optimizations that do not converge to valid +solutions. In order to overcome this limitation, we present a novel statistical +model that learns a prior distribution over 3D head signed distance functions +using a voxel-grid based architecture. The use of this prior model, in +combination with other design choices, results in a system that achieves 3D +head reconstructions with accuracy comparable to the state of the art with a +100x speed-up. + +
+
+
+
+
+ + ♻ ☆ Revisiting Domain-Adaptive 3D Object Detection by Reliable, Diverse and + Class-balanced Pseudo-Labeling ICCV 2023 + + +
+ Unsupervised domain adaptation (DA) with the aid of pseudo labeling +techniques has emerged as a crucial approach for domain-adaptive 3D object +detection. While effective, existing DA methods suffer from a substantial drop +in performance when applied to a multi-class training setting, due to the +co-existence of low-quality pseudo labels and class imbalance issues. In this +paper, we address this challenge by proposing a novel ReDB framework tailored +for learning to detect all classes at once. Our approach produces Reliable, +Diverse, and class-Balanced pseudo 3D boxes to iteratively guide the +self-training on a distributionally different target domain. To alleviate +disruptions caused by the environmental discrepancy (e.g., beam numbers), the +proposed cross-domain examination (CDE) assesses the correctness of pseudo +labels by copy-pasting target instances into a source environment and measuring +the prediction consistency. To reduce computational overhead and mitigate the +object shift (e.g., scales and point densities), we design an overlapped boxes +counting (OBC) metric that allows to uniformly downsample pseudo-labeled +objects across different geometric characteristics. To confront the issue of +inter-class imbalance, we progressively augment the target point clouds with a +class-balanced set of pseudo-labeled target instances and source objects, which +boosts recognition accuracies on both frequently appearing and rare classes. +Experimental results on three benchmark datasets using both voxel-based (i.e., +SECOND) and point-based 3D detectors (i.e., PointRCNN) demonstrate that our +proposed ReDB approach outperforms existing 3D domain adaptation methods by a +large margin, improving 23.15% mAP on the nuScenes $\rightarrow$ KITTI task. +The code is available at https://github.com/zhuoxiao-chen/ReDB-DA-3Ddet. + +
+
+ comment: Accepted by ICCV 2023, camera-ready +
+
+
+
+
+ + ♻ ☆ Context Autoencoder for Self-Supervised Representation Learning + + +
+ We present a novel masked image modeling (MIM) approach, context autoencoder +(CAE), for self-supervised representation pretraining. We pretrain an encoder +by making predictions in the encoded representation space. The pretraining +includes two tasks: masked representation prediction - predict the +representations for the masked patches, and masked patch reconstruction - +reconstruct the masked patches. The network is an encoder-regressor-decoder +architecture: the encoder takes the visible patches as input; the regressor +predicts the representations of the masked patches, which are expected to be +aligned with the representations computed from the encoder, using the +representations of visible patches and the positions of visible and masked +patches; the decoder reconstructs the masked patches from the predicted encoded +representations. The CAE design encourages the separation of learning the +encoder (representation) from completing the pretraining tasks of masked +representation prediction and masked patch reconstruction, and making +predictions in the encoded representation space empirically benefits +representation learning. We demonstrate the effectiveness of our CAE through +superior transfer performance in downstream tasks: semantic segmentation, +object detection and instance segmentation, and classification. The code will +be available at https://github.com/Atten4Vis/CAE. + +
+
+ comment: Accepted by International Journal of Computer Vision (IJCV) +
+
+
+
+
+ + ♻ ☆ IDiff-Face: Synthetic-based Face Recognition through Fizzy + Identity-Conditioned Diffusion Models ICCV2023 + + +
+ The availability of large-scale authentic face databases has been crucial to +the significant advances made in face recognition research over the past +decade. However, legal and ethical concerns led to the recent retraction of +many of these databases by their creators, raising questions about the +continuity of future face recognition research without one of its key +resources. Synthetic datasets have emerged as a promising alternative to +privacy-sensitive authentic data for face recognition development. However, +recent synthetic datasets that are used to train face recognition models suffer +either from limitations in intra-class diversity or cross-class (identity) +discrimination, leading to less optimal accuracies, far away from the +accuracies achieved by models trained on authentic data. This paper targets +this issue by proposing IDiff-Face, a novel approach based on conditional +latent diffusion models for synthetic identity generation with realistic +identity variations for face recognition training. Through extensive +evaluations, our proposed synthetic-based face recognition approach pushed the +limits of state-of-the-art performances, achieving, for example, 98.00% +accuracy on the Labeled Faces in the Wild (LFW) benchmark, far ahead from the +recent synthetic-based face recognition solutions with 95.40% and bridging the +gap to authentic-based face recognition with 99.82% accuracy. + +
+
+ comment: Accepted at ICCV2023 +
+
+
+
+
+ + ♻ ☆ Neural Video Depth Stabilizer ICCV2023 + + +
+ Video depth estimation aims to infer temporally consistent depth. Some +methods achieve temporal consistency by finetuning a single-image depth model +during test time using geometry and re-projection constraints, which is +inefficient and not robust. An alternative approach is to learn how to enforce +temporal consistency from data, but this requires well-designed models and +sufficient video depth data. To address these challenges, we propose a +plug-and-play framework called Neural Video Depth Stabilizer (NVDS) that +stabilizes inconsistent depth estimations and can be applied to different +single-image depth models without extra effort. We also introduce a large-scale +dataset, Video Depth in the Wild (VDW), which consists of 14,203 videos with +over two million frames, making it the largest natural-scene video depth +dataset to our knowledge. We evaluate our method on the VDW dataset as well as +two public benchmarks and demonstrate significant improvements in consistency, +accuracy, and efficiency compared to previous approaches. Our work serves as a +solid baseline and provides a data foundation for learning-based video depth +models. We will release our dataset and code for future research. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ TORE: Token Reduction for Efficient Human Mesh Recovery with Transformer ICCV 2023 + + +
+ In this paper, we introduce a set of simple yet effective TOken REduction +(TORE) strategies for Transformer-based Human Mesh Recovery from monocular +images. Current SOTA performance is achieved by Transformer-based structures. +However, they suffer from high model complexity and computation cost caused by +redundant tokens. We propose token reduction strategies based on two important +aspects, i.e., the 3D geometry structure and 2D image feature, where we +hierarchically recover the mesh geometry with priors from body structure and +conduct token clustering to pass fewer but more discriminative image feature +tokens to the Transformer. Our method massively reduces the number of tokens +involved in high-complexity interactions in the Transformer. This leads to a +significantly reduced computational cost while still achieving competitive or +even higher accuracy in shape recovery. Extensive experiments across a wide +range of benchmarks validate the superior effectiveness of the proposed method. +We further demonstrate the generalizability of our method on hand mesh +recovery. Visit our project page at +https://frank-zy-dou.github.io/projects/Tore/index.html. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ MVDiffusion: Enabling Holistic Multi-view Image Generation with + Correspondence-Aware Diffusion + + +
+ This paper introduces MVDiffusion, a simple yet effective method for +generating consistent multi-view images from text prompts given pixel-to-pixel +correspondences (e.g., perspective crops from a panorama or multi-view images +given depth maps and poses). Unlike prior methods that rely on iterative image +warping and inpainting, MVDiffusion simultaneously generates all images with a +global awareness, effectively addressing the prevalent error accumulation +issue. At its core, MVDiffusion processes perspective images in parallel with a +pre-trained text-to-image diffusion model, while integrating novel +correspondence-aware attention layers to facilitate cross-view interactions. +For panorama generation, while only trained with 10k panoramas, MVDiffusion is +able to generate high-resolution photorealistic images for arbitrary texts or +extrapolate one perspective image to a 360-degree view. For multi-view +depth-to-image generation, MVDiffusion demonstrates state-of-the-art +performance for texturing a scene mesh. The project page is at +https://mvdiffusion.github.io/. + +
+
+ comment: Project page, https://mvdiffusion.github.io, new functionality, + improved results, better writing +
+
+
+
+
+ + ♻ ☆ Learning Dense UV Completion for Human Mesh Recovery + + +
+ Human mesh reconstruction from a single image is challenging in the presence +of occlusion, which can be caused by self-occlusion, objects, or other humans. +Existing methods either fail to separate human features accurately or lack +proper supervision for feature completion. In this paper, we propose Dense +Inpainting Human Mesh Recovery (DIMR), a two-stage method that leverages dense +correspondence maps to handle occlusion. Our method utilizes a dense +correspondence map to separate visible human features and completes human +features on a structured UV map with an attention-based feature completion +module. We also design a feature inpainting training procedure that guides the +network to learn from unoccluded features. We evaluate our method on several +datasets and demonstrate its superior performance under heavily occluded +scenarios compared to other methods. Extensive experiments show that our method +clearly outperforms prior SOTA methods on heavily occluded images and achieves +comparable results on the standard benchmarks (3DPW). + +
+
+
+
+
+ + ♻ ☆ Learning Music-Dance Representations through Explicit-Implicit Rhythm + Synchronization + + +
+ Although audio-visual representation has been proved to be applicable in many +downstream tasks, the representation of dancing videos, which is more specific +and always accompanied by music with complex auditory contents, remains +challenging and uninvestigated. Considering the intrinsic alignment between the +cadent movement of dancer and music rhythm, we introduce MuDaR, a novel +Music-Dance Representation learning framework to perform the synchronization of +music and dance rhythms both in explicit and implicit ways. Specifically, we +derive the dance rhythms based on visual appearance and motion cues inspired by +the music rhythm analysis. Then the visual rhythms are temporally aligned with +the music counterparts, which are extracted by the amplitude of sound +intensity. Meanwhile, we exploit the implicit coherence of rhythms implied in +audio and visual streams by contrastive learning. The model learns the joint +embedding by predicting the temporal consistency between audio-visual pairs. +The music-dance representation, together with the capability of detecting audio +and visual rhythms, can further be applied to three downstream tasks: (a) dance +classification, (b) music-dance retrieval, and (c) music-dance retargeting. +Extensive experiments demonstrate that our proposed framework outperforms other +self-supervised methods by a large margin. + +
+
+ comment: Accepted for publication in IEEE Transactions on Multimedia +
+
+
+
+
+ + ♻ ☆ Enhancing Nucleus Segmentation with HARU-Net: A Hybrid Attention Based + Residual U-Blocks Network + + +
+ Nucleus image segmentation is a crucial step in the analysis, pathological +diagnosis, and classification, which heavily relies on the quality of nucleus +segmentation. However, the complexity of issues such as variations in nucleus +size, blurred nucleus contours, uneven staining, cell clustering, and +overlapping cells poses significant challenges. Current methods for nucleus +segmentation primarily rely on nuclear morphology or contour-based approaches. +Nuclear morphology-based methods exhibit limited generalization ability and +struggle to effectively predict irregular-shaped nuclei, while contour-based +extraction methods face challenges in accurately segmenting overlapping nuclei. +To address the aforementioned issues, we propose a dual-branch network using +hybrid attention based residual U-blocks for nucleus instance segmentation. The +network simultaneously predicts target information and target contours. +Additionally, we introduce a post-processing method that combines the target +information and target contours to distinguish overlapping nuclei and generate +an instance segmentation image. Within the network, we propose a context fusion +block (CF-block) that effectively extracts and merges contextual information +from the network. Extensive quantitative evaluations are conducted to assess +the performance of our method. Experimental results demonstrate the superior +performance of the proposed method compared to state-of-the-art approaches on +the BNS, MoNuSeg, CoNSeg, and CPM-17 datasets. + +
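+ A post-processing step that combines a foreground map and a contour map into +instances can be sketched with standard tools: subtract the contours to split +touching nuclei, label the interiors, and grow the labels back with a +watershed. The thresholds and the toy example are illustrative, and this is not +necessarily the paper's exact procedure.
+import numpy as np
+from scipy import ndimage as ndi
+from skimage.segmentation import watershed
+
+def masks_to_instances(prob_fg, prob_contour, t_fg=0.5, t_ct=0.5):
+    """prob_fg, prob_contour: (H, W) probability maps from the two branches."""
+    foreground = prob_fg > t_fg
+    interior = foreground & ~(prob_contour > t_ct)   # split touching nuclei apart
+    markers, _ = ndi.label(interior)                 # one seed per nucleus interior
+    distance = ndi.distance_transform_edt(foreground)
+    # Grow the seeds back over the contour pixels, constrained to the foreground.
+    return watershed(-distance, markers, mask=foreground)
+
+# Toy example: two touching blobs separated by a predicted contour line.
+fg = np.zeros((64, 64)); fg[10:50, 10:54] = 1.0
+ct = np.zeros((64, 64)); ct[10:50, 31:33] = 1.0
+instances = masks_to_instances(fg, ct)
+print(int(instances.max()))                          # 2 instances recovered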
+
+ comment: Nucleus segmentation, Deep learning, Instance segmentation, Medical + imaging, Dual-Branch network +
+
+
+
+
+ + ♻ ☆ Contrastive Model Adaptation for Cross-Condition Robustness in Semantic + Segmentation ICCV + + +
+ Standard unsupervised domain adaptation methods adapt models from a source to +a target domain using labeled source data and unlabeled target data jointly. In +model adaptation, on the other hand, access to the labeled source data is +prohibited, i.e., only the source-trained model and unlabeled target data are +available. We investigate normal-to-adverse condition model adaptation for +semantic segmentation, whereby image-level correspondences are available in the +target domain. The target set consists of unlabeled pairs of adverse- and +normal-condition street images taken at GPS-matched locations. Our method -- +CMA -- leverages such image pairs to learn condition-invariant features via +contrastive learning. In particular, CMA encourages features in the embedding +space to be grouped according to their condition-invariant semantic content and +not according to the condition under which respective inputs are captured. To +obtain accurate cross-domain semantic correspondences, we warp the normal image +to the viewpoint of the adverse image and leverage warp-confidence scores to +create robust, aggregated features. With this approach, we achieve +state-of-the-art semantic segmentation performance for model adaptation on +several normal-to-adverse adaptation benchmarks, such as ACDC and Dark Zurich. +We also evaluate CMA on a newly procured adverse-condition generalization +benchmark and report favorable results compared to standard unsupervised domain +adaptation methods, despite the comparative handicap of CMA due to source data +inaccessibility. Code is available at https://github.com/brdav/cma. + +
+
+ comment: International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ♻ ☆ Empowering Vision-Language Models to Follow Interleaved Vision-Language + Instructions + + +
+ Multimodal Large Language Models (MLLMs) have recently sparked significant +interest, which demonstrates emergent capabilities to serve as a +general-purpose model for various vision-language tasks. However, existing +methods mainly focus on limited types of instructions with a single image as +visual context, which hinders the widespread availability of MLLMs. In this +paper, we introduce the I4 benchmark to comprehensively evaluate the +instruction following ability on complicated interleaved vision-language +instructions, which involve intricate image-text sequential context, covering a +diverse range of scenarios (e.g., visually-rich webpages/textbooks, lecture +slides, embodied dialogue). Systematic evaluation on our I4 benchmark reveals a +common defect of existing methods: the Visual Prompt Generator (VPG) trained on +image-captioning alignment objective tends to attend to common foreground +information for captioning but struggles to extract specific information +required by particular tasks. To address this issue, we propose a generic and +lightweight controllable knowledge re-injection module, which utilizes the +sophisticated reasoning ability of LLMs to control the VPG to conditionally +extract instruction-specific visual information and re-inject it into the LLM. +Further, we introduce an annotation-free cross-attention guided counterfactual +image training strategy to methodically learn the proposed module by +collaborating a cascade of foundation models. Enhanced by the proposed module +and training strategy, we present Cheetor, a Transformer-based MLLM that can +effectively handle a wide variety of interleaved vision-language instructions +and achieves state-of-the-art zero-shot performance across all tasks of I4, +without high-quality multimodal instruction tuning data. Cheetor also exhibits +competitive performance compared with state-of-the-art instruction tuned models +on MME benchmark. + +
+
+
+
+
+ + ♻ ☆ Multi-metrics adaptively identifies backdoors in Federated learning ICCV + + +
+ The decentralized and privacy-preserving nature of federated learning (FL) +makes it vulnerable to backdoor attacks aiming to manipulate the behavior of +the resulting model on specific adversary-chosen inputs. However, most existing +defenses based on statistical differences take effect only against specific +attacks, especially when the malicious gradients are similar to benign ones or +the data are highly non-independent and identically distributed (non-IID). In +this paper, we revisit the distance-based defense methods and discover that i) +Euclidean distance becomes meaningless in high dimensions and ii) malicious +gradients with diverse characteristics cannot be identified by a single metric. +To this end, we present a simple yet effective defense strategy with +multi-metrics and dynamic weighting to identify backdoors adaptively. +Furthermore, our novel defense has no reliance on predefined assumptions over +attack settings or data distributions and little impact on benign performance. +To evaluate the effectiveness of our approach, we conduct comprehensive +experiments on different datasets under various attack settings, where our +method achieves the best defensive performance. For instance, we achieve the +lowest backdoor accuracy of 3.06% under the difficult Edge-case PGD, showing +significant superiority over previous defenses. The results also demonstrate +that our method can be well-adapted to a wide range of non-IID degrees without +sacrificing the benign performance. + +
+
+ comment: 14 pages, 8 figures and 7 tables; 2023 IEEE/CVF International + Conference on Computer Vision (ICCV) +
+
+
+
+
+ + ♻ ☆ Geometric Learning-Based Transformer Network for Estimation of + Segmentation Errors MICCAI + + +
+ Many segmentation networks have been proposed for 3D volumetric segmentation
+ of tumors and organs at risk. Hospitals and clinical institutions seek to
+ accelerate and minimize the efforts of specialists in image segmentation.
+ Still, in the case of errors generated by these networks, clinicians would
+ have to manually edit the generated segmentation maps. Given a 3D volume and
+ its putative segmentation map, we propose an approach to identify and measure
+ erroneous regions in the segmentation map. Our method can estimate error at
+ any point or node in a 3D mesh generated from a possibly erroneous volumetric
+ segmentation map, serving as a Quality Assurance tool. We propose a graph
+ neural network-based transformer based on the Nodeformer architecture to
+ measure and classify the segmentation errors at any point. We have evaluated
+ our network on a high-resolution micro-CT dataset of the human inner-ear bony
+ labyrinth structure by simulating erroneous 3D segmentation maps. Our network
+ incorporates a convolutional encoder to compute node-centric features from the
+ input micro-CT data, the Nodeformer to learn the latent graph embeddings, and
+ a Multi-Layer Perceptron (MLP) to compute and classify the node-wise errors.
+ Our network achieves a mean absolute error of ~0.042 and an accuracy of 79.53%
+ in estimating and classifying the node-wise errors, respectively,
+ outperforming other Graph Neural Networks (GNNs). We also put forth
+ vertex-normal prediction as a custom pretext task for pre-training the CNN
+ encoder to improve the network's overall performance. Qualitative analysis
+ shows the efficiency of our network in correctly classifying errors and
+ reducing misclassifications.
+
+
+ comment: Accepted in MICCAI workshop on ShapeMI, 2023 +
+
+
+
+
+ + ♻ ☆ A Closer Look at Audio-Visual Semantic Segmentation + + +
+ Audio-visual segmentation (AVS) is a complex task that involves accurately +segmenting the corresponding sounding object based on audio-visual queries. +Successful audio-visual learning requires two essential components: 1) an +unbiased dataset with high-quality pixel-level multi-class labels, and 2) a +model capable of effectively linking audio information with its corresponding +visual object. However, these two requirements are only partially addressed by +current methods, with training sets containing biased audio-visual data, and +models that generalise poorly beyond this biased training set. In this work, we +propose a new strategy to build cost-effective and relatively unbiased +audio-visual semantic segmentation benchmarks. Our strategy, called Visual +Post-production (VPO), explores the observation that it is not necessary to +have explicit audio-visual pairs extracted from single video sources to build +such benchmarks. We also refine the previously proposed AVSBench to transform +it into the audio-visual semantic segmentation benchmark AVSBench-Single+. +Furthermore, this paper introduces a new pixel-wise audio-visual contrastive +learning method to enable a better generalisation of the model beyond the +training set. We verify the validity of the VPO strategy by showing that +state-of-the-art (SOTA) models trained with datasets built by matching audio +and visual data from different sources or with datasets containing audio and +visual data from the same video source produce almost the same accuracy. Then, +using the proposed VPO benchmarks and AVSBench-Single+, we show that our method +produces more accurate audio-visual semantic segmentation than SOTA models. +Code and dataset will be available. + +
+
+
+
+
+ + ♻ ☆ CLIP-Count: Towards Text-Guided Zero-Shot Object Counting ACM MM 2023 + + +
+ Recent advances in visual-language models have shown remarkable zero-shot +text-image matching ability that is transferable to downstream tasks such as +object detection and segmentation. Adapting these models for object counting, +however, remains a formidable challenge. In this study, we first investigate +transferring vision-language models (VLMs) for class-agnostic object counting. +Specifically, we propose CLIP-Count, the first end-to-end pipeline that +estimates density maps for open-vocabulary objects with text guidance in a +zero-shot manner. To align the text embedding with dense visual features, we +introduce a patch-text contrastive loss that guides the model to learn +informative patch-level visual representations for dense prediction. Moreover, +we design a hierarchical patch-text interaction module to propagate semantic +information across different resolution levels of visual features. Benefiting +from the full exploitation of the rich image-text alignment knowledge of +pretrained VLMs, our method effectively generates high-quality density maps for +objects-of-interest. Extensive experiments on FSC-147, CARPK, and ShanghaiTech +crowd counting datasets demonstrate state-of-the-art accuracy and +generalizability of the proposed method. Code is available: +https://github.com/songrise/CLIP-Count. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ StableVQA: A Deep No-Reference Quality Assessment Model for Video + Stability + + +
+ Video shakiness is an unpleasant distortion of User Generated Content (UGC) +videos, which is usually caused by the unstable hold of cameras. In recent +years, many video stabilization algorithms have been proposed, yet no specific +and accurate metric enables comprehensively evaluating the stability of videos. +Indeed, most existing quality assessment models evaluate video quality as a +whole without specifically taking the subjective experience of video stability +into consideration. Therefore, these models cannot measure the video stability +explicitly and precisely when severe shakes are present. In addition, there is +no large-scale video database in public that includes various degrees of shaky +videos with the corresponding subjective scores available, which hinders the +development of Video Quality Assessment for Stability (VQA-S). To this end, we +build a new database named StableDB that contains 1,952 diversely-shaky UGC +videos, where each video has a Mean Opinion Score (MOS) on the degree of video +stability rated by 34 subjects. Moreover, we elaborately design a novel VQA-S +model named StableVQA, which consists of three feature extractors to acquire +the optical flow, semantic, and blur features respectively, and a regression +layer to predict the final stability score. Extensive experiments demonstrate +that the StableVQA achieves a higher correlation with subjective opinions than +the existing VQA-S models and generic VQA models. The database and codes are +available at https://github.com/QMME/StableVQA. + +
+
+
+
+
+ + ♻ ☆ Generative Semantic Segmentation CVPR2023 + + +
+ We present Generative Semantic Segmentation (GSS), a generative learning
+ approach for semantic segmentation. Uniquely, we cast semantic segmentation as
+ an image-conditioned mask generation problem. This is achieved by replacing
+ the conventional per-pixel discriminative learning with a latent prior
+ learning process. Specifically, we model the variational posterior
+ distribution of latent variables given the segmentation mask. To that end, the
+ segmentation mask is expressed with a special type of image (dubbed maskige).
+ This posterior distribution allows us to generate segmentation masks
+ unconditionally. To achieve semantic segmentation on a given image, we further
+ introduce a conditioning network. It is optimized by minimizing the divergence
+ between the posterior distribution of maskige (i.e., segmentation masks) and
+ the latent prior distribution of input training images. Extensive experiments
+ on standard benchmarks show that our GSS performs competitively with prior-art
+ alternatives in the standard semantic segmentation setting, whilst achieving a
+ new state of the art in the more challenging cross-domain setting.
+
+
+ comment: To appear at CVPR2023, code at http://github.com/fudan-zvg/GSS +
+
+
+
+
+ + ♻ ☆ Benchmarking and Analyzing Robust Point Cloud Recognition: Bag of Tricks + for Defending Adversarial Examples + + +
+ Deep Neural Networks (DNNs) for 3D point cloud recognition are vulnerable to
+ adversarial examples, threatening their practical deployment. Although many
+ research endeavors have been made to tackle this issue in recent years, the
+ diversity of adversarial examples on 3D point clouds makes them more
+ challenging to defend against than those on 2D images. For example, attackers
+ can generate adversarial examples by adding, shifting, or removing points.
+ Consequently, existing defense strategies struggle to counter unseen point
+ cloud adversarial examples. In this paper, we first establish a comprehensive
+ and rigorous point cloud adversarial robustness benchmark to evaluate
+ adversarial robustness, which can provide a detailed understanding of the
+ effects of the defense and attack methods. We then collect existing defense
+ tricks in point cloud adversarial defenses and perform extensive and
+ systematic experiments to identify an effective combination of these tricks.
+ Furthermore, we propose a hybrid training augmentation method that
+ incorporates various types of point cloud adversarial examples into
+ adversarial training, significantly improving the adversarial robustness. By
+ combining these tricks, we construct a more robust defense framework achieving
+ an average accuracy of 83.45\% against various attacks, demonstrating its
+ capability to enable robust learners. Our codebase is open-sourced at:
+ \url{https://github.com/qiufan319/benchmark_pc_attack.git}.
+
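+ For context, the "shifting points" attack family mentioned above can be
+ illustrated with a generic L_inf PGD perturbation of point coordinates; this is
+ a textbook sketch for intuition only, not one of the benchmark's actual attack
+ or defense implementations:
+
+ import torch
+ import torch.nn.functional as F
+
+ def pgd_point_shift(model, points, labels, eps=0.05, alpha=0.01, steps=10):
+     """points: (B, N, 3) clean point clouds; returns point-shifted adversarial clouds."""
+     adv = points.clone().detach()
+     for _ in range(steps):
+         adv.requires_grad_(True)
+         loss = F.cross_entropy(model(adv), labels)
+         grad = torch.autograd.grad(loss, adv)[0]
+         with torch.no_grad():
+             adv = adv + alpha * grad.sign()                 # ascend the loss
+             adv = points + (adv - points).clamp(-eps, eps)  # project back to the L_inf ball
+         adv = adv.detach()
+     return adv
+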
+
+ comment: 8 pages 6 figures +
+
+
+
+
+ + ♻ ☆ RegFormer: An Efficient Projection-Aware Transformer Network for + Large-Scale Point Cloud Registration ICCV2023 + + +
+ Although point cloud registration has achieved remarkable advances in +object-level and indoor scenes, large-scale registration methods are rarely +explored. Challenges mainly arise from the huge point number, complex +distribution, and outliers of outdoor LiDAR scans. In addition, most existing +registration works generally adopt a two-stage paradigm: They first find +correspondences by extracting discriminative local features and then leverage +estimators (eg. RANSAC) to filter outliers, which are highly dependent on +well-designed descriptors and post-processing choices. To address these +problems, we propose an end-to-end transformer network (RegFormer) for +large-scale point cloud alignment without any further post-processing. +Specifically, a projection-aware hierarchical transformer is proposed to +capture long-range dependencies and filter outliers by extracting point +features globally. Our transformer has linear complexity, which guarantees high +efficiency even for large-scale scenes. Furthermore, to effectively reduce +mismatches, a bijective association transformer is designed for regressing the +initial transformation. Extensive experiments on KITTI and NuScenes datasets +demonstrate that our RegFormer achieves competitive performance in terms of +both accuracy and efficiency. + +
+
+ comment: Accepted by ICCV2023. Codes are released at + https://github.com/IRMVLab/RegFormer +
+
+
+
+
+ + ♻ ☆ DiffSynth: Latent In-Iteration Deflickering for Realistic Video + Synthesis + + +
+ In recent years, diffusion models have emerged as the most powerful approach +in image synthesis. However, applying these models directly to video synthesis +presents challenges, as it often leads to noticeable flickering contents. +Although recently proposed zero-shot methods can alleviate flicker to some +extent, we still struggle to generate coherent videos. In this paper, we +propose DiffSynth, a novel approach that aims to convert image synthesis +pipelines to video synthesis pipelines. DiffSynth consists of two key +components: a latent in-iteration deflickering framework and a video +deflickering algorithm. The latent in-iteration deflickering framework applies +video deflickering to the latent space of diffusion models, effectively +preventing flicker accumulation in intermediate steps. Additionally, we propose +a video deflickering algorithm, named patch blending algorithm, that remaps +objects in different frames and blends them together to enhance video +consistency. One of the notable advantages of DiffSynth is its general +applicability to various video synthesis tasks, including text-guided video +stylization, fashion video synthesis, image-guided video stylization, video +restoring, and 3D rendering. In the task of text-guided video stylization, we +make it possible to synthesize high-quality videos without cherry-picking. The +experimental results demonstrate the effectiveness of DiffSynth. All videos can +be viewed on our project page. Source codes will also be released. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ RemoteCLIP: A Vision Language Foundation Model for Remote Sensing + + +
+ General-purpose foundation models have become increasingly important in the
+ field of artificial intelligence. While self-supervised learning (SSL) and
+ Masked Image Modeling (MIM) have led to promising results in building such
+ foundation models for remote sensing, these models primarily learn low-level
+ features, require annotated data for fine-tuning, and are not applicable to
+ retrieval and zero-shot applications due to the lack of language
+ understanding. In response to these limitations, we propose RemoteCLIP, the
+ first vision-language foundation model for remote sensing that aims to learn
+ robust visual features with rich semantics, as well as aligned text embeddings
+ for seamless downstream application. To address the scarcity of pre-training
+ data, we leverage data scaling, converting heterogeneous annotations based on
+ Box-to-Caption (B2C) and Mask-to-Box (M2B) conversion, and further
+ incorporating UAV imagery, resulting in a 12x larger pretraining dataset.
+ RemoteCLIP can be applied to a variety of downstream tasks, including
+ zero-shot image classification, linear probing, k-NN classification, few-shot
+ classification, image-text retrieval, and object counting. Evaluations on 16
+ datasets, including a newly introduced RemoteCount benchmark to test the
+ object counting ability, show that RemoteCLIP consistently outperforms
+ baseline foundation models across different model scales. Impressively,
+ RemoteCLIP outperforms the previous SoTA by 9.14% mean recall on RSICD dataset
+ and by 8.92% on RSICD dataset. For zero-shot classification, our RemoteCLIP
+ outperforms the CLIP baseline by up to 6.39% average accuracy on 12 downstream
+ datasets. Pretrained models are available at
+ https://github.com/ChenDelong1999/RemoteCLIP .
+
+
+
+
+
+ + ♻ ☆ Prototypical Kernel Learning and Open-set Foreground Perception for + Generalized Few-shot Semantic Segmentation ICCV2023 + + +
+ Generalized Few-shot Semantic Segmentation (GFSS) extends Few-shot Semantic
+ Segmentation (FSS) to simultaneously segment unseen classes and seen classes
+ during evaluation. Previous works leverage an additional branch or
+ prototypical aggregation to eliminate the constrained setting of FSS. However,
+ representation division and embedding prejudice, which heavily degrade the
+ performance of GFSS, have not been considered jointly. We address the
+ aforementioned problems by combining prototypical kernel learning with
+ open-set foreground perception. Specifically, a group of learnable kernels is
+ proposed to perform segmentation, with each kernel in charge of a stuff class.
+ Then, we explore merging prototypical learning into the update of the
+ base-class kernels, which is consistent with the prototype knowledge
+ aggregation of few-shot novel classes. In addition, a foreground contextual
+ perception module cooperating with conditional-bias-based inference is adopted
+ to perform class-agnostic as well as open-set foreground detection, thus
+ mitigating the embedding prejudice and preventing novel targets from being
+ misclassified as background. Moreover, we also adapt our method to Class
+ Incremental Few-shot Semantic Segmentation (CIFSS), which takes in the
+ knowledge of novel classes in an incremental stream. Extensive experiments on
+ the PASCAL-5i and COCO-20i datasets demonstrate that our method performs
+ better than the previous state-of-the-art.
+
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+
+ ♻ ☆ TextPainter: Multimodal Text Image Generation with Visual-harmony and
+ Text-comprehension for Poster Design ACM MM 2023
+
+
+ Text design is one of the most critical procedures in poster design, as it
+ relies heavily on the creativity and expertise of humans to design text images
+ considering both visual harmony and text semantics. This study introduces
+ TextPainter, a novel multimodal approach that leverages contextual visual
+ information and corresponding text semantics to generate text images.
+ Specifically, TextPainter takes the global-local background image as a hint of
+ style and guides the text image generation with visual harmony. Furthermore,
+ we leverage a language model and introduce a text comprehension module to
+ achieve both sentence-level and word-level style variations. Besides, we
+ construct the PosterT80K dataset, consisting of about 80K posters annotated
+ with sentence-level bounding boxes and text contents. We hope this dataset
+ will pave the way for further research on multimodal text image generation.
+ Extensive quantitative and qualitative experiments demonstrate that
+ TextPainter can generate visually and semantically harmonious text images for
+ posters.
+
+
+ comment: Accepted to ACM MM 2023. Dataset Link: + https://tianchi.aliyun.com/dataset/160034 +
+
+
+
+
+ + ♻ ☆ Open Problems in Computer Vision for Wilderness SAR and The Search for + Patricia Wu-Murad + + +
+ This paper details the challenges in applying two computer vision systems, an +EfficientDET supervised learning model and the unsupervised RX spectral +classifier, to 98.9 GB of drone imagery from the Wu-Murad wilderness search and +rescue (WSAR) effort in Japan and identifies 3 directions for future research. +There have been at least 19 proposed approaches and 3 datasets aimed at +locating missing persons in drone imagery, but only 3 approaches (2 +unsupervised and 1 of an unknown structure) are referenced in the literature as +having been used in an actual WSAR operation. Of these proposed approaches, the +EfficientDET architecture and the unsupervised spectral RX classifier were +selected as the most appropriate for this setting. The EfficientDET model was +applied to the HERIDAL dataset and despite achieving performance that is +statistically equivalent to the state-of-the-art, the model fails to translate +to the real world in terms of false positives (e.g., identifying tree limbs and +rocks as people), and false negatives (e.g., failing to identify members of the +search team). The poor results in practice for algorithms that showed good +results on datasets suggest 3 areas of future research: more realistic datasets +for wilderness SAR, computer vision models that are capable of seamlessly +handling the variety of imagery that can be collected during actual WSAR +operations, and better alignment on performance measures. + +
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Will Large-scale Generative Models Corrupt Future Datasets? ICCV 2023 + + +
+ Recently proposed large-scale text-to-image generative models such as +DALL$\cdot$E 2, Midjourney, and StableDiffusion can generate high-quality and +realistic images from users' prompts. Not limited to the research community, +ordinary Internet users enjoy these generative models, and consequently, a +tremendous amount of generated images have been shared on the Internet. +Meanwhile, today's success of deep learning in the computer vision field owes a +lot to images collected from the Internet. These trends lead us to a research +question: "\textbf{will such generated images impact the quality of future +datasets and the performance of computer vision models positively or +negatively?}" This paper empirically answers this question by simulating +contamination. Namely, we generate ImageNet-scale and COCO-scale datasets using +a state-of-the-art generative model and evaluate models trained with +"contaminated" datasets on various tasks, including image classification and +image generation. Throughout experiments, we conclude that generated images +negatively affect downstream performance, while the significance depends on +tasks and the amount of generated images. The generated datasets and the codes +for experiments will be publicly released for future research. Generated +datasets and source codes are available from +\url{https://github.com/moskomule/dataset-contamination}. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Dual-level Interaction for Domain Adaptive Semantic Segmentation ICCV + + +
+ The self-training approach has recently secured its position in domain
+ adaptive semantic segmentation, where a model is trained with target-domain
+ pseudo-labels. Current advances have mitigated noisy pseudo-labels resulting
+ from the domain gap. However, they still struggle with erroneous pseudo-labels
+ near the boundaries of the semantic classifier. In this paper, we tackle this
+ issue by proposing a dual-level interaction for domain adaptation (DIDA) in
+ semantic segmentation. Explicitly, we encourage different augmented views of
+ the same pixel to have not only similar class predictions (semantic-level) but
+ also similar relationships with respect to other pixels (instance-level). As
+ it is impossible to keep features of all pixel instances for a dataset, we
+ maintain a labeled instance bank with dynamic updating strategies to
+ selectively store the informative features of instances. Further, DIDA
+ performs cross-level interaction with scattering and gathering techniques to
+ regenerate more reliable pseudo-labels. Our method outperforms the
+ state-of-the-art by a notable margin, especially on confusing and long-tailed
+ classes. Code is available at https://github.com/RainJamesY/DIDA
+
+
+ comment: Accepted to ICCVW on Uncertainty Quantification for Computer Vision + (UnCV), 2023 +
+
+
+
+
+ + ♻ ☆ Non-linear Neurons with Human-like Apical Dendrite Activations + + +
+ In order to classify linearly non-separable data, neurons are typically +organized into multi-layer neural networks that are equipped with at least one +hidden layer. Inspired by some recent discoveries in neuroscience, we propose a +new model of artificial neuron along with a novel activation function enabling +the learning of nonlinear decision boundaries using a single neuron. We show +that a standard neuron followed by our novel apical dendrite activation (ADA) +can learn the XOR logical function with 100% accuracy. Furthermore, we conduct +experiments on six benchmark data sets from computer vision, signal processing +and natural language processing, i.e. MOROCO, UTKFace, CREMA-D, Fashion-MNIST, +Tiny ImageNet and ImageNet, showing that the ADA and the leaky ADA functions +provide superior results to Rectified Linear Units (ReLU), leaky ReLU, RBF and +Swish, for various neural network architectures, e.g. one-hidden-layer or +two-hidden-layer multi-layer perceptrons (MLPs) and convolutional neural +networks (CNNs) such as LeNet, VGG, ResNet and Character-level CNN. We obtain +further performance improvements when we change the standard model of the +neuron with our pyramidal neuron with apical dendrite activations (PyNADA). Our +code is available at: https://github.com/raduionescu/pynada. + +
+
+ comment: Accepted for publication in Applied Intelligence +
+
+
+
+
+ + ♻ ☆ Expeditious Saliency-guided Mix-up through Random Gradient Thresholding AAAI 2023 + + +
+ Mix-up training approaches have proven to be effective in improving the +generalization ability of Deep Neural Networks. Over the years, the research +community expands mix-up methods into two directions, with extensive efforts to +improve saliency-guided procedures but minimal focus on the arbitrary path, +leaving the randomization domain unexplored. In this paper, inspired by the +superior qualities of each direction over one another, we introduce a novel +method that lies at the junction of the two routes. By combining the best +elements of randomness and saliency utilization, our method balances speed, +simplicity, and accuracy. We name our method R-Mix following the concept of +"Random Mix-up". We demonstrate its effectiveness in generalization, weakly +supervised object localization, calibration, and robustness to adversarial +attacks. Finally, in order to address the question of whether there exists a +better decision protocol, we train a Reinforcement Learning agent that decides +the mix-up policies based on the classifier's performance, reducing dependency +on human-designed objectives and hyperparameter tuning. Extensive experiments +further show that the agent is capable of performing at the cutting-edge level, +laying the foundation for a fully automatic mix-up. Our code is released at +[https://github.com/minhlong94/Random-Mixup]. + +
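+ For reference, the "arbitrary path" the paper starts from is plain input
+ mix-up; the sketch below (standard mix-up, assuming PyTorch) shows that
+ randomized baseline only and does not reproduce R-Mix's saliency-guided
+ gradient thresholding:
+
+ import torch
+
+ def mixup_batch(x, y, alpha=1.0):
+     lam = torch.distributions.Beta(alpha, alpha).sample().item()
+     perm = torch.randperm(x.size(0))
+     x_mix = lam * x + (1.0 - lam) * x[perm]
+     # Train with the convex combination of losses:
+     # lam * CE(pred, y) + (1 - lam) * CE(pred, y[perm])
+     return x_mix, y, y[perm], lam
+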
+
+ comment: Accepted Long paper at 2nd Practical-DL Workshop at AAAI 2023 +
+
+
+
+
+ + ♻ ☆ The Imaginative Generative Adversarial Network: Automatic Data + Augmentation for Dynamic Skeleton-Based Hand Gesture and Human Action + Recognition + + +
+ Deep learning approaches deliver state-of-the-art performance in recognition +of spatiotemporal human motion data. However, one of the main challenges in +these recognition tasks is limited available training data. Insufficient +training data results in over-fitting and data augmentation is one approach to +address this challenge. Existing data augmentation strategies based on scaling, +shifting and interpolating offer limited generalizability and typically require +detailed inspection of the dataset as well as hundreds of GPU hours for +hyperparameter optimization. In this paper, we present a novel automatic data +augmentation model, the Imaginative Generative Adversarial Network (GAN), that +approximates the distribution of the input data and samples new data from this +distribution. It is automatic in that it requires no data inspection and little +hyperparameter tuning and therefore it is a low-cost and low-effort approach to +generate synthetic data. We demonstrate our approach on small-scale +skeleton-based datasets with a comprehensive experimental analysis. Our results +show that the augmentation strategy is fast to train and can improve +classification accuracy for both conventional neural networks and +state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Vision-Based UAV Self-Positioning in Low-Altitude Urban Environments + + +
+ Unmanned Aerial Vehicles (UAVs) rely on satellite systems for stable +positioning. However, due to limited satellite coverage or communication +disruptions, UAVs may lose signals from satellite-based positioning systems. In +such situations, vision-based techniques can serve as an alternative, ensuring +the self-positioning capability of UAVs. However, most of the existing datasets +are developed for the geo-localization tasks of the objects identified by UAVs, +rather than the self-positioning task of UAVs. Furthermore, the current UAV +datasets use discrete sampling on synthetic data, such as Google Maps, thereby +neglecting the crucial aspects of dense sampling and the uncertainties commonly +experienced in real-world scenarios. To address these issues, this paper +presents a new dataset, DenseUAV, which is the first publicly available dataset +designed for the UAV self-positioning task. DenseUAV adopts dense sampling on +UAV images obtained in low-altitude urban settings. In total, over 27K UAV-view +and satellite-view images of 14 university campuses are collected and +annotated, establishing a new benchmark. In terms of model development, we +first verify the superiority of Transformers over CNNs in this task. Then, we +incorporate metric learning into representation learning to enhance the +discriminative capacity of the model and to lessen the modality discrepancy. +Besides, to facilitate joint learning from both perspectives, we propose a +mutually supervised learning approach. Last, we enhance the Recall@K metric and +introduce a new measurement, SDM@K, to evaluate the performance of a trained +model from both the retrieval and localization perspectives simultaneously. As +a result, the proposed baseline method achieves a remarkable Recall@1 score of +83.05% and an SDM@1 score of 86.24% on DenseUAV. The dataset and code will be +made publicly available on https://github.com/Dmmm1997/DenseUAV. + +
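+ As a point of reference, Recall@K for this cross-view retrieval setting can be
+ computed as below (generic definition, assuming NumPy); the paper's SDM@K
+ metric adds a localization-aware component that is not reproduced here:
+
+ import numpy as np
+
+ def recall_at_k(sim, gt_index, k=1):
+     """sim: (Q, G) similarities of UAV-view queries to the satellite-view gallery;
+     gt_index[i] is the gallery index of the true match for query i."""
+     topk = np.argsort(-sim, axis=1)[:, :k]          # top-k gallery ids per query
+     hits = (topk == np.asarray(gt_index)[:, None]).any(axis=1)
+     return float(hits.mean())
+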
+
+ comment: 13 pages,8 figures +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ SSLRec: A Self-Supervised Learning Library for Recommendation + + +
+ Self-supervised learning (SSL) has gained significant interest in recent +years as a solution to address the challenges posed by sparse and noisy data in +recommender systems. Despite the growing number of SSL algorithms designed to +provide state-of-the-art performance in various recommendation scenarios (e.g., +graph collaborative filtering, sequential recommendation, social +recommendation, KG-enhanced recommendation), there is still a lack of unified +frameworks that integrate recommendation algorithms across different domains. +Such a framework could serve as the cornerstone for self-supervised +recommendation algorithms, unifying the validation of existing methods and +driving the design of new ones. To address this gap, we introduce SSLRec, a +novel benchmark platform that provides a standardized, flexible, and +comprehensive framework for evaluating various SSL-enhanced recommenders. The +SSLRec library features a modular architecture that allows users to easily +evaluate state-of-the-art models and a complete set of data augmentation and +self-supervised toolkits to help create SSL recommendation models with specific +needs. Furthermore, SSLRec simplifies the process of training and evaluating +different recommendation models with consistent and fair settings. Our SSLRec +platform covers a comprehensive set of state-of-the-art SSL-enhanced +recommendation models across different scenarios, enabling researchers to +evaluate these cutting-edge models and drive further innovation in the field. +Our implemented SSLRec framework is available at the source code repository +https://github.com/HKUDS/SSLRec. + +
+
+
+
+
+ + ☆ Finding Already Debunked Narratives via Multistage Retrieval: Enabling + Cross-Lingual, Cross-Dataset and Zero-Shot Learning + + +
+ The task of retrieving already debunked narratives aims to detect stories +that have already been fact-checked. The successful detection of claims that +have already been debunked not only reduces the manual efforts of professional +fact-checkers but can also contribute to slowing the spread of misinformation. +Mainly due to the lack of readily available data, this is an understudied +problem, particularly when considering the cross-lingual task, i.e. the +retrieval of fact-checking articles in a language different from the language +of the online post being checked. This paper fills this gap by (i) creating a +novel dataset to enable research on cross-lingual retrieval of already debunked +narratives, using tweets as queries to a database of fact-checking articles; +(ii) presenting an extensive experiment to benchmark fine-tuned and +off-the-shelf multilingual pre-trained Transformer models for this task; and +(iii) proposing a novel multistage framework that divides this cross-lingual +debunk retrieval task into refinement and re-ranking stages. Results show that +the task of cross-lingual retrieval of already debunked narratives is +challenging and off-the-shelf Transformer models fail to outperform a strong +lexical-based baseline (BM25). Nevertheless, our multistage retrieval framework +is robust, outperforming BM25 in most scenarios and enabling cross-domain and +zero-shot learning, without significantly harming the model's performance. + +
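+ For orientation, the strong lexical baseline (BM25) referred to above can be
+ reproduced in a few lines, for example with the third-party rank_bm25 package;
+ this toy snippet omits the paper's multilingual setup and multistage
+ re-ranking:
+
+ from rank_bm25 import BM25Okapi
+
+ fact_checks = ["old flood photo reused out of context ...",
+                "claim about vaccine ingredients already debunked ..."]
+ bm25 = BM25Okapi([doc.lower().split() for doc in fact_checks])
+
+ tweet = "photo shows yesterday's flood in the city center"
+ top_articles = bm25.get_top_n(tweet.lower().split(), fact_checks, n=2)
+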
+
+
+
+
+ + ☆ LASIGE and UNICAGE solution to the NASA LitCoin NLP Competition + + +
+ Biomedical Natural Language Processing (NLP) tends to become cumbersome for +most researchers, frequently due to the amount and heterogeneity of text to be +processed. To address this challenge, the industry is continuously developing +highly efficient tools and creating more flexible engineering solutions. This +work presents the integration between industry data engineering solutions for +efficient data processing and academic systems developed for Named Entity +Recognition (LasigeUnicage\_NER) and Relation Extraction (BiOnt). Our design +reflects an integration of those components with external knowledge in the form +of additional training data from other datasets and biomedical ontologies. We +used this pipeline in the 2022 LitCoin NLP Challenge, where our team +LasigeUnicage was awarded the 7th Prize out of approximately 200 participating +teams, reflecting a successful collaboration between the academia (LASIGE) and +the industry (Unicage). The software supporting this work is available at +\url{https://github.com/lasigeBioTM/Litcoin-Lasige_Unicage}. + +
+
+
+
+
+ + ☆ Multi-domain Recommendation with Embedding Disentangling and Domain + Alignment CIKM'23 + + +
+ Multi-domain recommendation (MDR) aims to provide recommendations for +different domains (e.g., types of products) with overlapping users/items and is +common for platforms such as Amazon, Facebook, and LinkedIn that host multiple +services. Existing MDR models face two challenges: First, it is difficult to +disentangle knowledge that generalizes across domains (e.g., a user likes cheap +items) and knowledge specific to a single domain (e.g., a user likes blue +clothing but not blue cars). Second, they have limited ability to transfer +knowledge across domains with small overlaps. We propose a new MDR method named +EDDA with two key components, i.e., embedding disentangling recommender and +domain alignment, to tackle the two challenges respectively. In particular, the +embedding disentangling recommender separates both the model and embedding for +the inter-domain part and the intra-domain part, while most existing MDR +methods only focus on model-level disentangling. The domain alignment leverages +random walks from graph processing to identify similar user/item pairs from +different domains and encourages similar user/item pairs to have similar +embeddings, enhancing knowledge transfer. We compare EDDA with 12 +state-of-the-art baselines on 3 real datasets. The results show that EDDA +consistently outperforms the baselines on all datasets and domains. All +datasets and codes are available at https://github.com/Stevenn9981/EDDA. + +
+
+ comment: Accepted by CIKM'23 +
+
+
+
+
+ + ☆ Bringing order into the realm of Transformer-based language models for + artificial intelligence and law + + +
+ Transformer-based language models (TLMs) have widely been recognized to be a +cutting-edge technology for the successful development of deep-learning-based +solutions to problems and applications that require natural language processing +and understanding. Like for other textual domains, TLMs have indeed pushed the +state-of-the-art of AI approaches for many tasks of interest in the legal +domain. Despite the first Transformer model being proposed about six years ago, +there has been a rapid progress of this technology at an unprecedented rate, +whereby BERT and related models represent a major reference, also in the legal +domain. This article provides the first systematic overview of TLM-based +methods for AI-driven problems and tasks in the legal sphere. A major goal is +to highlight research advances in this field so as to understand, on the one +hand, how the Transformers have contributed to the success of AI in supporting +legal processes, and on the other hand, what are the current limitations and +opportunities for further research development. + +
+
+ comment: Accepted for publication with Artificial Intelligence and Law, + Springer Nature +
+
+
+
+
+ + ☆ Product Review Image Ranking for Fashion E-commerce SIGIR + + +
+ In a fashion e-commerce platform where customers can't physically examine the
+ products on their own, being able to see other customers' text and image
+ reviews of the product is critical while making purchase decisions. Given the
+ high reliance on these reviews, over the years we have observed customers
+ proactively sharing their reviews. With an increase in the coverage of User
+ Generated Content (UGC), there has been a corresponding increase in the number
+ of customer images. It is thus imperative to display the most relevant images
+ on top as it may influence users' online shopping choices and behavior. In
+ this paper, we propose a simple yet effective training procedure for ranking
+ customer images. We created a dataset consisting of Myntra (a major Indian
+ fashion e-commerce company) studio posts and highly engaged
+ (upvotes/downvotes) UGC images as our starting point and used selected
+ distortion techniques on the images of the above dataset to bring their
+ quality on par with that of bad UGC images. We train our network to rank
+ bad-quality images lower than high-quality ones. Our proposed method
+ outperforms the baseline models on two metrics, namely correlation coefficient
+ and accuracy, by substantial margins.
+
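+ The "rank bad-quality images lower" objective described above is naturally
+ expressed as a pairwise margin ranking loss; the sketch below (PyTorch) is an
+ illustrative stand-in, with score_model a hypothetical image-quality scorer,
+ not the paper's exact training procedure:
+
+ import torch
+ import torch.nn as nn
+
+ rank_loss = nn.MarginRankingLoss(margin=0.2)
+
+ def ranking_step(score_model, good_imgs, distorted_imgs):
+     s_good = score_model(good_imgs).squeeze(-1)      # higher score = better image
+     s_bad = score_model(distorted_imgs).squeeze(-1)
+     target = torch.ones_like(s_good)                 # require s_good > s_bad by the margin
+     return rank_loss(s_good, s_bad, target)
+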
+
+ comment: Accepted in Proceedings of ACM SIGIR Workshop on eCommerce (SIGIR + eCom'22) +
+
+
+
+
+ + ☆ Beyond Semantics: Learning a Behavior Augmented Relevance Model with + Self-supervised Learning CIKM2023 + + +
+ Relevance modeling aims to locate desirable items for corresponding queries, +which is crucial for search engines to ensure user experience. Although most +conventional approaches address this problem by assessing the semantic +similarity between the query and item, pure semantic matching is not +everything. In reality, auxiliary query-item interactions extracted from user +historical behavior data of the search log could provide hints to reveal users' +search intents further. Drawing inspiration from this, we devise a novel +Behavior Augmented Relevance Learning model for Alipay Search (BARL-ASe) that +leverages neighbor queries of target item and neighbor items of target query to +complement target query-item semantic matching. Specifically, our model builds +multi-level co-attention for distilling coarse-grained and fine-grained +semantic representations from both neighbor and target views. The model +subsequently employs neighbor-target self-supervised learning to improve the +accuracy and robustness of BARL-ASe by strengthening representation and logit +learning. Furthermore, we discuss how to deal with the long-tail query-item +matching of the mini apps search scenario of Alipay practically. Experiments on +real-world industry data and online A/B testing demonstrate our proposal +achieves promising performance with low latency. + +
+
+ comment: CIKM2023 +
+
+
+
+
+ + ☆ Investigating disaster response through social media data and the + Susceptible-Infected-Recovered (SIR) model: A case study of 2020 Western U.S. + wildfire season + + +
+ Effective disaster response is critical for affected communities. Responders
+ and decision-makers would benefit from reliable, timely measures of the issues
+ impacting their communities during a disaster, and social media offers a
+ potentially rich data source. Social media can reflect public concerns and
+ demands during a disaster, offering valuable insights for decision-makers to
+ understand evolving situations and optimize resource allocation. We used
+ Bidirectional Encoder Representations from Transformers (BERT) topic modeling
+ to cluster topics from Twitter data. Then, we conducted a temporal-spatial
+ analysis to examine the distribution of these topics across different regions
+ during the 2020 western U.S. wildfire season. Our results show that Twitter
+ users mainly focused on three topics: "health impact," "damage," and
+ "evacuation." We used the Susceptible-Infected-Recovered (SIR) theory to
+ explore the magnitude and velocity of topic diffusion on Twitter. The results
+ displayed a clear relationship between topic trends and wildfire propagation
+ patterns. The estimated parameters obtained from the SIR model in selected
+ cities revealed that residents exhibited high levels of concern about several
+ topics during the wildfire. Our study details how the SIR model and topic
+ modeling using social media data can provide decision-makers with a
+ quantitative approach to measure disaster response and support their
+ decision-making processes.
+
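+ For readers unfamiliar with the SIR formulation used here, the standard
+ compartmental equations can be integrated as below; the parameter values are
+ illustrative placeholders, not the estimates reported in the study:
+
+ import numpy as np
+ from scipy.integrate import odeint
+
+ def sir(y, t, beta, gamma):
+     S, I, R = y
+     N = S + I + R
+     dS = -beta * S * I / N              # users newly picking up the topic
+     dI = beta * S * I / N - gamma * I
+     dR = gamma * I                      # users who stop posting about the topic
+     return [dS, dI, dR]
+
+ t = np.linspace(0, 60, 61)                                    # days
+ curve = odeint(sir, y0=[99000, 1000, 0], t=t, args=(0.4, 0.1))
+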
+
+
+
+
+ + ☆ CSPM: A Contrastive Spatiotemporal Preference Model for CTR Prediction + in On-Demand Food Delivery Services + + +
+ Click-through rate (CTR) prediction is a crucial task in the context of an
+ online on-demand food delivery (OFD) platform for precisely estimating the
+ probability of a user clicking on food items. Unlike universal e-commerce
+ platforms such as Taobao and Amazon, user behaviors and interests on the OFD
+ platform are more location- and time-sensitive due to limited delivery ranges
+ and regional commodity supplies. However, existing CTR prediction algorithms
+ in OFD scenarios concentrate on capturing interest from historical behavior
+ sequences, which fails to effectively model the complex spatiotemporal
+ information within features, leading to poor performance. To address this
+ challenge, this paper introduces the Contrastive Spatiotemporal Preference
+ Model (CSPM), which models user preferences under different search states
+ using three modules: contrastive spatiotemporal representation learning
+ (CSRL), a spatiotemporal preference extractor (StPE), and a spatiotemporal
+ information filter (StIF). CSRL utilizes a contrastive learning framework to
+ generate a spatiotemporal activation representation (SAR) for the search
+ action. StPE employs SAR to activate users' diverse preferences related to
+ location and time from the historical behavior sequence field, using a
+ multi-head attention mechanism. StIF incorporates SAR into a gating network to
+ automatically capture important features with latent spatiotemporal effects.
+ Extensive experiments conducted on two large-scale industrial datasets
+ demonstrate the state-of-the-art performance of CSPM. Notably, CSPM has been
+ successfully deployed in Alibaba's online OFD platform Ele.me, resulting in a
+ significant 0.88% lift in CTR, which has substantial business implications.
+
+
+
+
+
+ + ♻ ☆ From Retrieval to Generation: Efficient and Effective Entity Set + Expansion + + +
+ Entity Set Expansion (ESE) is a critical task aiming to expand entities of +the target semantic class described by a small seed entity set. Most existing +ESE methods are retrieval-based frameworks that need to extract the contextual +features of entities and calculate the similarity between seed entities and +candidate entities. To achieve the two purposes, they should iteratively +traverse the corpus and the entity vocabulary provided in the datasets, +resulting in poor efficiency and scalability. The experimental results indicate +that the time consumed by the retrieval-based ESE methods increases linearly +with entity vocabulary and corpus size. In this paper, we firstly propose a +generative ESE framework, Generative Entity Set Expansion (GenExpan), which +utilizes a generative pre-trained language model to accomplish ESE task. +Specifically, a prefix tree is employed to guarantee the validity of entity +generation, and automatically generated class names are adopted to guide the +model to generate target entities. Moreover, we propose Knowledge Calibration +and Generative Ranking to further bridge the gap between generic knowledge of +the language model and the goal of ESE task. Experiments on publicly available +datasets show that GenExpan is efficient and effective. For efficiency, +expansion time consumed by GenExpan is independent of entity vocabulary and +corpus size, and GenExpan achieves an average 600% speedup compared to strong +baselines. For expansion performance, our framework outperforms previous +state-of-the-art ESE methods. + +
+
+
+
+
+ + ♻ ☆ Metric Search for Rank List Compatibility Matching with Applications + + +
+ As online dating has become more popular in the past few years, an efficient +and effective algorithm to match users is needed. In this project, we proposed +a new dating matching algorithm that uses Kendall-Tau distance to measure the +similarity between users based on their ranking for items in a list. (e.g., +their favourite sports, music, etc.) To increase the performance of the search +process, we applied a tree-based searching structure, Cascading Metric Tree +(CMT), on this metric. The tree is built on ranked lists from all the users; +when a query target and a radius are provided, our algorithm can return users +within the radius of the target. We tested the scaling of this searching method +on a synthetic dataset by varying list length, population size, and query +radius. We observed that the algorithm is able to query the best matching +people for the user in a practical time, given reasonable parameters. We also +provided potential future improvements that can be made to this algorithm based +on the limitations. Finally, we offered more use cases of this search structure +on Kendall-Tau distance and new insight into real-world applications of +distance search structures. + +
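+ The core metric is easy to state: the Kendall-Tau distance between two ranked
+ lists is the number of discordant pairs. The sketch below shows that distance
+ and a brute-force radius query standing in for the Cascading Metric Tree
+ (CMT); it is illustrative only, not the project's actual search structure:
+
+ def kendall_tau_distance(rank_a, rank_b):
+     """Number of item pairs ordered differently by the two rankings."""
+     pos_b = {item: i for i, item in enumerate(rank_b)}
+     seq = [pos_b[item] for item in rank_a]
+     return sum(1 for i in range(len(seq))
+                  for j in range(i + 1, len(seq)) if seq[i] > seq[j])
+
+ def radius_query(target, users, radius):
+     """Brute-force stand-in for the CMT: return users within `radius` of `target`."""
+     return [u for u in users if kendall_tau_distance(target, u) <= radius]
+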
+
+ comment: Paper for 2023 Multidisciplinary Undergraduate Research Conference + (MURC) +
+
+
+
+
+ + ♻ ☆ Dual Intents Graph Modeling for User-centric Group Discovery CIKM'23 + + +
+ Online groups have become increasingly prevalent, providing users with space +to share experiences and explore interests. Therefore, user-centric group +discovery task, i.e., recommending groups to users can help both users' online +experiences and platforms' long-term developments. Existing recommender methods +can not deal with this task as modeling user-group participation into a +bipartite graph overlooks their item-side interests. Although there exist a few +works attempting to address this task, they still fall short in fully +preserving the social context and ensuring effective interest representation +learning. + In this paper, we focus on exploring the intents that motivate users to +participate in groups, which can be categorized into different types, like the +social-intent and the personal interest-intent. The former refers to users +joining a group affected by their social links, while the latter relates to +users joining groups with like-minded people for self-enjoyment. To comprehend +different intents, we propose a novel model, DiRec, that first models each +intent separately and then fuses them together for predictions. Specifically, +for social-intent, we introduce the hypergraph structure to model the +relationship between groups and members, leading to a richer understanding of +the social context. As for interest-intent, we employ novel structural +refinement on the interactive graph to uncover more intricate user behaviors +and group interests, realizing better representation learning of interests. +Furthermore, we also observe the intent overlapping in real-world scenarios and +devise a novel self-supervised learning loss that encourages such alignment for +final recommendations. Extensive experiments on three public datasets show the +significant improvement of DiRec over the state-of-the-art methods. + +
+
+ comment: Accepted by CIKM'23 as Long Paper +
+
+
+
+
+ + ♻ ☆ Collaborative filtering to capture AI user's preferences as norms + + +
+ Customising AI technologies to each user's preferences is fundamental to them +functioning well. Unfortunately, current methods require too much user +involvement and fail to capture their true preferences. In fact, to avoid the +nuisance of manually setting preferences, users usually accept the default +settings even if these do not conform to their true preferences. Norms can be +useful to regulate behaviour and ensure it adheres to user preferences but, +while the literature has thoroughly studied norms, most proposals take a formal +perspective. Indeed, while there has been some research on constructing norms +to capture a user's privacy preferences, these methods rely on domain knowledge +which, in the case of AI technologies, is difficult to obtain and maintain. We +argue that a new perspective is required when constructing norms, which is to +exploit the large amount of preference information readily available from whole +systems of users. Inspired by recommender systems, we believe that +collaborative filtering can offer a suitable approach to identifying a user's +norm preferences without excessive user involvement. + +
+
+ comment: Accepted manuscript at the 24th International Conference on + Principles and Practice of Multi-Agent Systems (PRIMA 2022) +
+
+
+
+
+
+
+
+ + Machine Learning 102 + +
+
+
+ + ☆ Neural Progressive Meshes SIGGRAPH 2023 + + +
+ The recent proliferation of 3D content that can be consumed on hand-held +devices necessitates efficient tools for transmitting large geometric data, +e.g., 3D meshes, over the Internet. Detailed high-resolution assets can pose a +challenge to storage as well as transmission bandwidth, and level-of-detail +techniques are often used to transmit an asset using an appropriate bandwidth +budget. It is especially desirable for these methods to transmit data +progressively, improving the quality of the geometry with more data. Our key +insight is that the geometric details of 3D meshes often exhibit similar local +patterns even across different shapes, and thus can be effectively represented +with a shared learned generative space. We learn this space using a +subdivision-based encoder-decoder architecture trained in advance on a large +collection of surfaces. We further observe that additional residual features +can be transmitted progressively between intermediate levels of subdivision +that enable the client to control the tradeoff between bandwidth cost and +quality of reconstruction, providing a neural progressive mesh representation. +We evaluate our method on a diverse set of complex 3D shapes and demonstrate +that it outperforms baselines in terms of compression ratio and reconstruction +quality. + +
+
+ comment: SIGGRAPH 2023 +
+
+
+
+
+ + ☆ Zero Grads Ever Given: Learning Local Surrogate Losses for + Non-Differentiable Graphics + + +
+ Gradient-based optimization is now ubiquitous across graphics, but +unfortunately can not be applied to problems with undefined or zero gradients. +To circumvent this issue, the loss function can be manually replaced by a +"surrogate" that has similar minima but is differentiable. Our proposed +framework, ZeroGrads, automates this process by learning a neural approximation +of the objective function, the surrogate, which in turn can be used to +differentiate through arbitrary black-box graphics pipelines. We train the +surrogate on an actively smoothed version of the objective and encourage +locality, focusing the surrogate's capacity on what matters at the current +training episode. The fitting is performed online, alongside the parameter +optimization, and self-supervised, without pre-computed data or pre-trained +models. As sampling the objective is expensive (it requires a full rendering or +simulator run), we devise an efficient sampling scheme that allows for +tractable run-times and competitive performance at little overhead. We +demonstrate optimizing diverse non-convex, non-differentiable black-box +problems in graphics, such as visibility in rendering, discrete parameter +spaces in procedural modelling or optimal control in physics-driven animation. +In contrast to more traditional algorithms, our approach scales well to higher +dimensions, which we demonstrate on problems with up to 35k interlinked +variables. + +
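+ A highly condensed sketch of the surrogate idea (assuming PyTorch) is given
+ below: sample the black-box objective around the current parameters, fit a
+ small neural surrogate on those samples, then step the parameters through the
+ differentiable surrogate. The active smoothing and locality scheduling of
+ ZeroGrads are omitted, and all names are placeholders:
+
+ import torch
+ import torch.nn.functional as F
+
+ def surrogate_step(theta, black_box_loss, surrogate, opt_surr, opt_theta,
+                    sigma=0.05, n_samples=16):
+     # 1) Query the non-differentiable objective around the current parameters.
+     samples = theta.detach() + sigma * torch.randn(n_samples, theta.numel())
+     values = torch.tensor([black_box_loss(s) for s in samples]).unsqueeze(1)
+     # 2) Fit the local neural surrogate on these (parameters, loss) pairs.
+     opt_surr.zero_grad()
+     F.mse_loss(surrogate(samples), values).backward()
+     opt_surr.step()
+     # 3) Descend the surrogate, which is differentiable, to update theta.
+     opt_theta.zero_grad()
+     surrogate(theta.unsqueeze(0)).mean().backward()
+     opt_theta.step()
+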
+
+
+
+
+ + ☆ Follow Anything: Open-set detection, tracking, and following in + real-time + + +
+ Tracking and following objects of interest is critical to several robotics
+ use cases, ranging from industrial automation to logistics and warehousing, to
+ healthcare and security. In this paper, we present a robotic system to detect,
+ track, and follow any object in real-time. Our approach, dubbed ``follow
+ anything'' (FAn), is an open-vocabulary and multimodal model -- it is not
+ restricted to concepts seen at training time and can be applied to novel
+ classes at inference time using text, images, or click queries. Leveraging rich
+ visual descriptors from large-scale pre-trained models (foundation models), FAn
+ can detect and segment objects by matching multimodal queries (text, images,
+ clicks) against an input image sequence. These detected and segmented objects
+ are tracked across image frames, all while accounting for occlusion and object
+ re-emergence. We demonstrate FAn on a real-world robotic system (a micro aerial
+ vehicle) and report its ability to seamlessly follow the objects of interest in
+ a real-time control loop. FAn can be deployed on a laptop with a lightweight
+ (6-8 GB) graphics card, achieving a throughput of 6-20 frames per second. To
+ enable rapid adoption, deployment, and extensibility, we open-source all our
+ code on our project webpage at https://github.com/alaamaalouf/FollowAnything .
+ We also encourage the reader to watch our 5-minute explainer video at
+ https://www.youtube.com/watch?v=6Mgt3EPytrw .
+
+
+ comment: Project webpage: https://github.com/alaamaalouf/FollowAnything + Explainer video: https://www.youtube.com/watch?v=6Mgt3EPytrw +
+
+
+
+
+ + ☆ PDE-Refiner: Achieving Accurate Long Rollouts with Neural PDE Solvers + + +
+ Time-dependent partial differential equations (PDEs) are ubiquitous in +science and engineering. Recently, mostly due to the high computational cost of +traditional solution techniques, deep neural network based surrogates have +gained increased interest. The practical utility of such neural PDE solvers +relies on their ability to provide accurate, stable predictions over long time +horizons, which is a notoriously hard problem. In this work, we present a +large-scale analysis of common temporal rollout strategies, identifying the +neglect of non-dominant spatial frequency information, often associated with +high frequencies in PDE solutions, as the primary pitfall limiting stable, +accurate rollout performance. Based on these insights, we draw inspiration from +recent advances in diffusion models to introduce PDE-Refiner; a novel model +class that enables more accurate modeling of all frequency components via a +multistep refinement process. We validate PDE-Refiner on challenging benchmarks +of complex fluid dynamics, demonstrating stable and accurate rollouts that +consistently outperform state-of-the-art models, including neural, numerical, +and hybrid neural-numerical architectures. We further demonstrate that +PDE-Refiner greatly enhances data efficiency, since the denoising objective +implicitly induces a novel form of spectral data augmentation. Finally, +PDE-Refiner's connection to diffusion models enables an accurate and efficient +assessment of the model's predictive uncertainty, allowing us to estimate when +the surrogate becomes inaccurate. + +
+
+ comment: Project website: https://phlippe.github.io/PDERefiner/ +
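The multistep refinement idea can be caricatured as follows: predict the next state coarsely, then repeatedly refine it conditioned on the previous solution and a decreasing noise level. The network, shapes, and schedule below are toy assumptions for illustration, not the PDE-Refiner architecture.

```python
import torch
import torch.nn as nn

class ToyRefiner(nn.Module):
    """Toy stand-in for a neural operator mapping (u_prev, u_est, level) -> correction."""
    def __init__(self, n: int, steps: int):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(2 * n + 1, 128), nn.GELU(), nn.Linear(128, n))
        self.steps = steps

    def rollout_step(self, u_prev: torch.Tensor) -> torch.Tensor:
        u = torch.zeros_like(u_prev)                    # initial estimate of u(t + dt)
        for k in range(self.steps):                     # multistep refinement
            level = torch.full((u.shape[0], 1), float(self.steps - k))
            inp = torch.cat([u_prev, u, level], dim=-1)
            u = u + self.net(inp)                       # refine the estimate
        return u

model = ToyRefiner(n=64, steps=4)
u0 = torch.randn(8, 64)          # batch of 1-D spatial states
u1 = model.rollout_step(u0)      # apply repeatedly for long rollouts
```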
+
+
+
+
+ + ☆ Rethinking Integration of Prediction and Planning in Deep Learning-Based + Automated Driving Systems: A Review + + +
+ Automated driving has the potential to revolutionize personal, public, and +freight mobility. Besides the enormous challenge of perception, i.e. accurately +perceiving the environment using available sensor data, automated driving +comprises planning a safe, comfortable, and efficient motion trajectory. To +promote safety and progress, many works rely on modules that predict the future +motion of surrounding traffic. Modular automated driving systems commonly +handle prediction and planning as sequential separate tasks. While this +accounts for the influence of surrounding traffic on the ego-vehicle, it fails +to anticipate the reactions of traffic participants to the ego-vehicle's +behavior. Recent works suggest that integrating prediction and planning in an +interdependent joint step is necessary to achieve safe, efficient, and +comfortable driving. While various models implement such integrated systems, a +comprehensive overview and theoretical understanding of different principles +are lacking. We systematically review state-of-the-art deep learning-based +prediction, planning, and integrated prediction and planning models. Different +facets of the integration ranging from model architecture and model design to +behavioral aspects are considered and related to each other. Moreover, we +discuss the implications, strengths, and limitations of different integration +methods. By pointing out research gaps, describing relevant future challenges, +and highlighting trends in the research field, we identify promising directions +for future research. + +
+
+
+
+
+ + ☆ EXPRESSO: A Benchmark and Analysis of Discrete Expressive Speech + Resynthesis + + +
+ Recent work has shown that it is possible to resynthesize high-quality speech
+based, not on text, but on low bitrate discrete units that have been learned in
+a self-supervised fashion and can therefore capture expressive aspects of
+speech that are hard to transcribe (prosody, voice styles, non-verbal
+vocalization). The adoption of these methods is still limited by the fact that
+most speech synthesis datasets are read, severely limiting spontaneity and
+expressivity. Here, we introduce Expresso, a high-quality expressive speech
+dataset for textless speech synthesis that includes both read speech and
+improvised dialogues rendered in 26 spontaneous expressive styles. We
+illustrate the challenges and potential of this dataset with an expressive
+resynthesis benchmark where the task is to encode the input in low-bitrate
+units and resynthesize it in a target voice while preserving content and style.
+We evaluate resynthesis quality with automatic metrics for different
+self-supervised discrete encoders, and explore tradeoffs between quality,
+bitrate and invariance to speaker and style. The dataset, evaluation metrics,
+and baseline models are all open source.
+ 
+
+
+
+
+ + ☆ Optimizing Performance of Feedforward and Convolutional Neural Networks + through Dynamic Activation Functions + + +
+ Deep learning training algorithms have achieved great success in recent years
+in many fields, including speech, text, image, and video. Deeper and deeper
+architectures have been proposed with great success, with ResNet structures
+having around 152 layers. Shallow convolutional neural networks (CNNs) remain
+an active research area, where some phenomena are still unexplained. Activation
+functions used in the network are of utmost importance, as they provide
+non-linearity to the networks. ReLU is the most commonly used activation
+function. We propose a complex piece-wise linear (PWL) activation in the hidden
+layer. We show that these PWL activations work much better than ReLU
+activations in our networks, for both convolutional neural networks and
+multilayer perceptrons. Result comparisons in PyTorch for shallow and deep CNNs
+are given to further strengthen our case.
+ 
+
+ comment: Under submission in Neurocomputing +
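A piece-wise linear activation of the kind discussed above can be built from a sum of shifted ReLU hinges. The parameterization below is a hedged sketch (the paper's exact PWL form is not reproduced here), shown as a drop-in replacement for nn.ReLU in PyTorch.

```python
import torch
import torch.nn as nn

class PiecewiseLinear(nn.Module):
    """Learnable piece-wise linear activation built from shifted ReLU hinges.
    Illustrative only; the paper's exact PWL parameterization may differ."""
    def __init__(self, pieces: int = 5):
        super().__init__()
        self.breakpoints = nn.Parameter(torch.linspace(-2.0, 2.0, pieces))
        self.slopes = nn.Parameter(torch.zeros(pieces))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.relu(x)                      # base ReLU term
        for b, s in zip(self.breakpoints, self.slopes):
            out = out + s * torch.relu(x - b)    # each hinge adds a slope change at b
        return out

# Drop-in replacement for nn.ReLU in an MLP or CNN head:
mlp = nn.Sequential(nn.Linear(784, 256), PiecewiseLinear(), nn.Linear(256, 10))
y = mlp(torch.randn(4, 784))
```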
+
+
+
+
+ + ☆ A Comparison of Classical and Deep Reinforcement Learning Methods for + HVAC Control + + +
+ Reinforcement learning (RL) is a promising approach for optimizing HVAC +control. RL offers a framework for improving system performance, reducing +energy consumption, and enhancing cost efficiency. We benchmark two popular +classical and deep RL methods (Q-Learning and Deep-Q-Networks) across multiple +HVAC environments and explore the practical consideration of model +hyper-parameter selection and reward tuning. The findings provide insight for +configuring RL agents in HVAC systems, promoting energy-efficient and +cost-effective operation. + +
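For readers unfamiliar with the classical side of this comparison, tabular Q-learning reduces to a single update rule, sketched below. The environment is a stand-in with made-up dynamics and reward; a real benchmark would step an actual HVAC simulator.

```python
import numpy as np

# Minimal tabular Q-learning loop (illustrative; the state space, action set,
# and reward here are placeholders, not an HVAC benchmark environment).
n_states, n_actions = 50, 3          # e.g. discretized temperature x {cool, off, heat}
Q = np.zeros((n_states, n_actions))
alpha, gamma, eps = 0.1, 0.99, 0.1   # learning rate, discount, exploration

def step(state, action):
    # Placeholder transition and reward: comfort penalty minus an energy cost.
    next_state = int(np.clip(state + np.random.randint(-2, 3), 0, n_states - 1))
    reward = -abs(next_state - 25) - 0.1 * (action != 1)
    return next_state, reward

state = 10
for t in range(10000):
    action = np.random.randint(n_actions) if np.random.rand() < eps else int(Q[state].argmax())
    next_state, reward = step(state, action)
    Q[state, action] += alpha * (reward + gamma * Q[next_state].max() - Q[state, action])
    state = next_state
```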
+
+
+
+
+ + ☆ Shadow Datasets, New challenging datasets for Causal Representation + Learning + + +
+ Discovering causal relations among semantic factors is an emergent topic in
+representation learning. Most causal representation learning (CRL) methods are
+fully supervised, which is impractical due to costly labeling. To resolve this
+restriction, weakly supervised CRL methods were introduced. To evaluate CRL
+performance, four existing datasets, Pendulum, Flow, CelebA(BEARD) and
+CelebA(SMILE), are utilized. However, existing CRL datasets are limited to
+simple graphs with few generative factors. Thus, we propose two new datasets
+with a larger number of diverse generative factors and more sophisticated
+causal graphs. In addition, for the existing real datasets, CelebA(BEARD) and
+CelebA(SMILE), the originally proposed causal graphs are not aligned with the
+dataset distributions; thus, we propose modifications to them.
+ 
+
+
+
+
+ + ☆ Hard No-Box Adversarial Attack on Skeleton-Based Human Action + Recognition with Skeleton-Motion-Informed Gradient + + +
+ Recently, methods for skeleton-based human activity recognition have been +shown to be vulnerable to adversarial attacks. However, these attack methods +require either the full knowledge of the victim (i.e. white-box attacks), +access to training data (i.e. transfer-based attacks) or frequent model queries +(i.e. black-box attacks). All their requirements are highly restrictive, +raising the question of how detrimental the vulnerability is. In this paper, we +show that the vulnerability indeed exists. To this end, we consider a new +attack task: the attacker has no access to the victim model or the training +data or labels, where we coin the term hard no-box attack. Specifically, we +first learn a motion manifold where we define an adversarial loss to compute a +new gradient for the attack, named skeleton-motion-informed (SMI) gradient. Our +gradient contains information of the motion dynamics, which is different from +existing gradient-based attack methods that compute the loss gradient assuming +each dimension in the data is independent. The SMI gradient can augment many +gradient-based attack methods, leading to a new family of no-box attack +methods. Extensive evaluation and comparison show that our method imposes a +real threat to existing classifiers. They also show that the SMI gradient +improves the transferability and imperceptibility of adversarial samples in +both no-box and transfer-based black-box settings. + +
+
+
+
+
+ + ☆ Finding Already Debunked Narratives via Multistage Retrieval: Enabling + Cross-Lingual, Cross-Dataset and Zero-Shot Learning + + +
+ The task of retrieving already debunked narratives aims to detect stories +that have already been fact-checked. The successful detection of claims that +have already been debunked not only reduces the manual efforts of professional +fact-checkers but can also contribute to slowing the spread of misinformation. +Mainly due to the lack of readily available data, this is an understudied +problem, particularly when considering the cross-lingual task, i.e. the +retrieval of fact-checking articles in a language different from the language +of the online post being checked. This paper fills this gap by (i) creating a +novel dataset to enable research on cross-lingual retrieval of already debunked +narratives, using tweets as queries to a database of fact-checking articles; +(ii) presenting an extensive experiment to benchmark fine-tuned and +off-the-shelf multilingual pre-trained Transformer models for this task; and +(iii) proposing a novel multistage framework that divides this cross-lingual +debunk retrieval task into refinement and re-ranking stages. Results show that +the task of cross-lingual retrieval of already debunked narratives is +challenging and off-the-shelf Transformer models fail to outperform a strong +lexical-based baseline (BM25). Nevertheless, our multistage retrieval framework +is robust, outperforming BM25 in most scenarios and enabling cross-domain and +zero-shot learning, without significantly harming the model's performance. + +
+
+
+
+
+ + ☆ AST-MHSA : Code Summarization using Multi-Head Self-Attention + + +
+ Code summarization aims to generate concise natural language descriptions for
+source code. The prevailing approaches adopt transformer-based encoder-decoder
+architectures, where the Abstract Syntax Tree (AST) of the source code is
+utilized for encoding structural information. However, ASTs are much longer
+than the corresponding source code, and existing methods ignore this size
+constraint by directly feeding the entire linearized AST into the encoders.
+This simplistic approach makes it challenging to extract truly valuable
+dependency relations from the overlong input sequence and leads to significant
+computational overhead due to self-attention applied to all nodes in the AST.
+ To address this issue effectively and efficiently, we present a model,
+AST-MHSA, that uses multi-head self-attention to extract the important semantic
+information from the AST. The model consists of two main components: an encoder
+and a decoder. The encoder takes as input the abstract syntax tree (AST) of the
+code and generates a sequence of hidden states. The decoder then takes these
+hidden states as input and generates a natural language summary of the code.
+ The multi-head attention mechanism allows the model to learn different
+representations of the input code, which can be combined to generate a more
+comprehensive summary. The model is trained on a dataset of code and summaries,
+and the parameters of the model are optimized to minimize the loss between the
+generated summaries and the ground-truth summaries.
+ 
+
+
+
+
+ + ☆ IIHT: Medical Report Generation with Image-to-Indicator Hierarchical + Transformer + + +
+ Automated medical report generation has become increasingly important in +medical analysis. It can produce computer-aided diagnosis descriptions and thus +significantly alleviate the doctors' work. Inspired by the huge success of +neural machine translation and image captioning, various deep learning methods +have been proposed for medical report generation. However, due to the inherent +properties of medical data, including data imbalance and the length and +correlation between report sequences, the generated reports by existing methods +may exhibit linguistic fluency but lack adequate clinical accuracy. In this +work, we propose an image-to-indicator hierarchical transformer (IIHT) +framework for medical report generation. It consists of three modules, i.e., a +classifier module, an indicator expansion module and a generator module. The +classifier module first extracts image features from the input medical images +and produces disease-related indicators with their corresponding states. The +disease-related indicators are subsequently utilised as input for the indicator +expansion module, incorporating the "data-text-data" strategy. The +transformer-based generator then leverages these extracted features along with +image features as auxiliary information to generate final reports. Furthermore, +the proposed IIHT method is feasible for radiologists to modify disease +indicators in real-world scenarios and integrate the operations into the +indicator expansion module for fluent and accurate medical report generation. +Extensive experiments and comparisons with state-of-the-art methods under +various evaluation metrics demonstrate the great performance of the proposed +method. + +
+
+
+
+
+ + ☆ ReLU and Addition-based Gated RNN + + +
+ We replace the multiplication and sigmoid function of the conventional +recurrent gate with addition and ReLU activation. This mechanism is designed to +maintain long-term memory for sequence processing but at a reduced +computational cost, thereby opening up for more efficient execution or larger +models on restricted hardware. Recurrent Neural Networks (RNNs) with gating +mechanisms such as LSTM and GRU have been widely successful in learning from +sequential data due to their ability to capture long-term dependencies. +Conventionally, the update based on current inputs and the previous state +history is each multiplied with dynamic weights and combined to compute the +next state. However, multiplication can be computationally expensive, +especially for certain hardware architectures or alternative arithmetic systems +such as homomorphic encryption. It is demonstrated that the novel gating +mechanism can capture long-term dependencies for a standard synthetic sequence +learning task while significantly reducing computational costs such that +execution time is reduced by half on CPU and by one-third under encryption. +Experimental results on handwritten text recognition tasks furthermore show +that the proposed architecture can be trained to achieve comparable accuracy to +conventional GRU and LSTM baselines. The gating mechanism introduced in this +paper may enable privacy-preserving AI applications operating under homomorphic +encryption by avoiding the multiplication of encrypted variables. It can also +support quantization in (unencrypted) plaintext applications, with the +potential for substantial performance gains since the addition-based +formulation can avoid the expansion to double precision often required for +multiplication. + +
+
+ comment: 12 pages, 4 tables +
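One plausible reading of the gate described above is sketched below: the sigmoid-weighted multiplicative blend of a conventional gated cell is replaced by purely additive ReLU terms, so no two activations are ever multiplied together. This is an illustrative guess at the mechanism, not the paper's exact equations.

```python
import torch
import torch.nn as nn

class AddReLUGateCell(nn.Module):
    """Recurrent cell in which the usual sigmoid gate and element-wise
    multiplication are replaced by addition and ReLU (illustrative sketch)."""
    def __init__(self, input_size: int, hidden_size: int):
        super().__init__()
        self.wx = nn.Linear(input_size, hidden_size)
        self.wh = nn.Linear(hidden_size, hidden_size)
        self.gx = nn.Linear(input_size, hidden_size)
        self.gh = nn.Linear(hidden_size, hidden_size)

    def forward(self, x: torch.Tensor, h: torch.Tensor) -> torch.Tensor:
        candidate = torch.relu(self.wx(x) + self.wh(h))  # proposed update
        gate = torch.relu(self.gx(x) + self.gh(h))       # additive "gate" signal
        return h + candidate - gate                      # no activation-activation products

cell = AddReLUGateCell(16, 32)
h = torch.zeros(4, 32)
for x_t in torch.randn(10, 4, 16):   # sequence length 10, batch of 4
    h = cell(x_t, h)
```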
+
+
+
+
+ + ☆ Normalized Gradients for All + + +
+ In this short note, I show how to adapt to H\"{o}lder smoothness using +normalized gradients in a black-box way. Moreover, the bound will depend on a +novel notion of local H\"{o}lder smoothness. The main idea directly comes from +Levy [2017]. + +
+
+
+
+
+ + ☆ Updating Clinical Risk Stratification Models Using Rank-Based + Compatibility: Approaches for Evaluating and Optimizing Clinician-Model Team + Performance + + +
+ As data shift or new data become available, updating clinical machine +learning models may be necessary to maintain or improve performance over time. +However, updating a model can introduce compatibility issues when the behavior +of the updated model does not align with user expectations, resulting in poor +user-model team performance. Existing compatibility measures depend on model +decision thresholds, limiting their applicability in settings where models are +used to generate rankings based on estimated risk. To address this limitation, +we propose a novel rank-based compatibility measure, $C^R$, and a new loss +function that aims to optimize discriminative performance while encouraging +good compatibility. Applied to a case study in mortality risk stratification +leveraging data from MIMIC, our approach yields more compatible models while +maintaining discriminative performance compared to existing model selection +techniques, with an increase in $C^R$ of $0.019$ ($95\%$ confidence interval: +$0.005$, $0.035$). This work provides new tools to analyze and update risk +stratification models used in clinical care. + +
+
+ comment: Conference paper accepted at the 2023 Machine Learning for Healthcare + Conference Includes supplemental: 32 pages, 17 figures +
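One natural way to instantiate a rank-based compatibility score is the fraction of (case, control) pairs ranked correctly by the original model that remain correctly ranked after the update. The function below implements that pairwise definition as an illustration; the paper's exact $C^R$ and its compatibility-encouraging loss may differ.

```python
import numpy as np

def rank_compatibility(y, risk_old, risk_new):
    """Fraction of correctly ranked (case, control) pairs under the old model
    that are still correctly ranked by the updated model (illustrative only)."""
    pos, neg = np.where(y == 1)[0], np.where(y == 0)[0]
    kept, total = 0, 0
    for i in pos:
        for j in neg:
            if risk_old[i] > risk_old[j]:           # pair ranked correctly by old model
                total += 1
                kept += risk_new[i] > risk_new[j]   # still ranked correctly by new model
    return kept / total if total else float("nan")

y = np.array([1, 0, 1, 0, 0])
old = np.array([0.9, 0.2, 0.7, 0.4, 0.1])
new = np.array([0.8, 0.3, 0.5, 0.6, 0.2])
print(rank_compatibility(y, old, new))  # 5 of 6 correctly ranked pairs preserved
```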
+
+
+
+
+ + ☆ Multi-graph Spatio-temporal Graph Convolutional Network for Traffic Flow + Prediction + + +
+ Inter-city highway transportation is significant for urban life. As one of
+the key functions in intelligent transportation systems (ITS), traffic
+evaluation plays a significant role nowadays, and daily traffic flow prediction
+still faces challenges at network-wide toll stations. On the one hand, the data
+imbalance in practice among various locations deteriorates prediction
+performance. On the other hand, complex correlative spatio-temporal factors
+cannot be comprehensively employed over long-term durations. In this paper, a
+prediction method is proposed for daily traffic flow in the highway domain
+through spatio-temporal deep learning. In our method, a data normalization
+strategy is used to deal with data imbalance, due to the long-tail distribution
+of traffic flow at network-wide toll stations. Then, based on graph
+convolutional networks, we construct networks with distinct semantics to
+capture spatio-temporal features. Besides that, meteorology and calendar
+features are used by our model in the fully connected stage to extract external
+characteristics of traffic flow. Through extensive experiments and case studies
+on one Chinese provincial highway, our method shows clear improvements in
+predictive accuracy over baselines and practical benefits in business.
+ 
+
+
+
+
+ + ☆ NUPES : Non-Uniform Post-Training Quantization via Power Exponent Search + + +
+ Deep neural network (DNN) deployment has been confined to larger hardware
+devices due to their expensive computational requirements. This challenge has
+recently reached another scale with the emergence of large language models
+(LLMs). In order to reduce both their memory footprint and latency, a promising
+technique is quantization. It consists of converting floating point
+representations to low bit-width fixed point representations, usually by
+assuming a uniform mapping onto a regular grid. This process, referred to in
+the literature as uniform quantization, may however be ill-suited as most DNN
+weights and activations follow a bell-shaped distribution. This is even worse
+on LLMs whose weight distributions are known to exhibit large, high-impact
+outlier values. In this work, we propose an improvement over the most commonly
+adopted way to tackle this limitation in deep learning model quantization,
+namely, non-uniform quantization. NUPES leverages automorphisms to preserve the
+scalar multiplications. Such transformations are derived from power functions.
+However, the optimization of the exponent parameter and weight values remains a
+challenging and novel problem which could not be solved with previous
+post-training optimization techniques, which only learn to round weight values
+up or down in order to preserve the predictive function. We circumvent this
+limitation with a new paradigm: learning new quantized weights over the entire
+quantized space. Similarly, we enable the optimization of the power exponent,
+i.e. the optimization of the quantization operator itself during training, by
+alleviating all the numerical instabilities. The resulting predictive function
+is compatible with integer-only low-bit inference. We show the ability of the
+method to achieve state-of-the-art compression rates in both data-free and
+data-driven configurations.
+ 
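The role of the power exponent can be illustrated with a simple warp-quantize-unwarp round trip: magnitudes are warped by a power function, quantized on a uniform grid, and mapped back, which concentrates quantization levels where bell-shaped weight distributions have most of their mass. This is only a sketch of the idea; NUPES additionally learns the exponent and re-learns the quantized weights.

```python
import torch

def power_quantize(w: torch.Tensor, alpha: float = 0.5, bits: int = 4) -> torch.Tensor:
    """Quantize on a non-uniform grid induced by a power warp.
    Illustrative sketch only, not the NUPES optimization itself."""
    s = w.abs().max()
    z = (w.abs() / s).pow(alpha)                  # warp magnitudes into [0, 1]
    levels = 2 ** (bits - 1) - 1
    z_q = torch.round(z * levels) / levels        # uniform quantization in warped space
    return w.sign() * s * z_q.pow(1.0 / alpha)    # unwarp back to weight space

w = torch.randn(1000) * 0.1
w_q = power_quantize(w)
print((w - w_q).abs().mean())                     # mean reconstruction error
```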
+
+
+
+
+ + ☆ Symmetry Defense Against XGBoost Adversarial Perturbation Attacks + + +
+ We examine whether symmetry can be used to defend tree-based ensemble +classifiers such as gradient-boosting decision trees (GBDTs) against +adversarial perturbation attacks. The idea is based on a recent symmetry +defense for convolutional neural network classifiers (CNNs) that utilizes CNNs' +lack of invariance with respect to symmetries. CNNs lack invariance because +they can classify a symmetric sample, such as a horizontally flipped image, +differently from the original sample. CNNs' lack of invariance also means that +CNNs can classify symmetric adversarial samples differently from the incorrect +classification of adversarial samples. Using CNNs' lack of invariance, the +recent CNN symmetry defense has shown that the classification of symmetric +adversarial samples reverts to the correct sample classification. In order to +apply the same symmetry defense to GBDTs, we examine GBDT invariance and are +the first to show that GBDTs also lack invariance with respect to symmetries. +We apply and evaluate the GBDT symmetry defense for nine datasets against six +perturbation attacks with a threat model that ranges from zero-knowledge to +perfect-knowledge adversaries. Using the feature inversion symmetry against +zero-knowledge adversaries, we achieve up to 100% accuracy on adversarial +samples even when default and robust classifiers have 0% accuracy. Using the +feature inversion and horizontal flip symmetries against perfect-knowledge +adversaries, we achieve up to over 95% accuracy on adversarial samples for the +GBDT classifier of the F-MNIST dataset even when default and robust classifiers +have 0% accuracy. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ AutoGluon-TimeSeries: AutoML for Probabilistic Time Series Forecasting + + +
+ We introduce AutoGluon-TimeSeries - an open-source AutoML library for
+probabilistic time series forecasting. Focused on ease of use and robustness,
+AutoGluon-TimeSeries enables users to generate accurate point and quantile
+forecasts with just 3 lines of Python code. Built on the design philosophy of
+AutoGluon, AutoGluon-TimeSeries leverages ensembles of diverse forecasting
+models to deliver high accuracy within a short training time.
+AutoGluon-TimeSeries combines conventional statistical models,
+machine-learning-based forecasting approaches, and ensembling techniques. In
+our evaluation on 29 benchmark datasets, AutoGluon-TimeSeries demonstrates
+strong empirical performance, outperforming a range of forecasting methods in
+terms of both point and quantile forecast accuracy, and often even improving
+upon the best-in-hindsight combination of prior methods.
+ 
+
+ comment: Published at AutoML Conference 2023 +
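The advertised three-line workflow looks roughly like the snippet below, based on the public AutoGluon-TimeSeries API; the CSV file, column names, and prediction length are assumptions for illustration.

```python
import pandas as pd
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

# Long-format data with one row per (item_id, timestamp); the file and column
# names are hypothetical.
df = pd.read_csv("demand.csv")  # columns: item_id, timestamp, target
train_data = TimeSeriesDataFrame.from_data_frame(
    df, id_column="item_id", timestamp_column="timestamp")

predictor = TimeSeriesPredictor(prediction_length=48, target="target").fit(train_data)
forecasts = predictor.predict(train_data)  # mean and quantile forecasts per item
```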
+
+
+
+
+ + ☆ Efficient Variational Inference for Large Skew-t Copulas with + Application to Intraday Equity Returns + + +
+ Large skew-t factor copula models are attractive for the modeling of +financial data because they allow for asymmetric and extreme tail dependence. +We show that the copula implicit in the skew-t distribution of Azzalini and +Capitanio (2003) allows for a higher level of pairwise asymmetric dependence +than two popular alternative skew-t copulas. Estimation of this copula in high +dimensions is challenging, and we propose a fast and accurate Bayesian +variational inference (VI) approach to do so. The method uses a conditionally +Gaussian generative representation of the skew-t distribution to define an +augmented posterior that can be approximated accurately. A fast stochastic +gradient ascent algorithm is used to solve the variational optimization. The +new methodology is used to estimate copula models for intraday returns from +2017 to 2021 on 93 U.S. equities. The copula captures substantial heterogeneity +in asymmetric dependence over equity pairs, in addition to the variability in +pairwise correlations. We show that intraday predictive densities from the +skew-t copula are more accurate than from some other copula models, while +portfolio selection strategies based on the estimated pairwise tail +dependencies improve performance relative to the benchmark index. + +
+
+
+
+
+ + ☆ Critical Points ++: An Agile Point Cloud Importance Measure for Robust + Classification, Adversarial Defense and Explainable AI + + +
+ The ability to cope accurately and quickly with Out-Of-Distribution (OOD)
+samples is crucial in real-world safety-demanding applications. In this work,
+we first study the interplay between critical points of 3D point clouds and OOD
+samples. Our findings are that common corruptions and outliers are often
+interpreted as critical points. We generalize the notion of critical points
+into importance measures. We show that training a classification network based
+only on less important points dramatically improves robustness, at a cost of
+minor performance loss on the clean set. We observe that normalized entropy is
+highly informative for corruption analysis. An adaptive threshold based on
+normalized entropy is suggested for selecting the set of uncritical points. Our
+proposed importance measure is extremely fast to compute. We show it can be
+used for a variety of applications, such as Explainable AI (XAI), Outlier
+Removal, Uncertainty Estimation, Robust Classification and Adversarial Defense.
+We reach SOTA results on the latter two tasks.
+ 
+
+
+
+
+ + ☆ Models Matter: The Impact of Single-Step Retrosynthesis on Synthesis + Planning + + +
+ Retrosynthesis consists of breaking down a chemical compound recursively +step-by-step into molecular precursors until a set of commercially available +molecules is found with the goal to provide a synthesis route. Its two primary +research directions, single-step retrosynthesis prediction, which models the +chemical reaction logic, and multi-step synthesis planning, which tries to find +the correct sequence of reactions, are inherently intertwined. Still, this +connection is not reflected in contemporary research. In this work, we combine +these two major research directions by applying multiple single-step +retrosynthesis models within multi-step synthesis planning and analyzing their +impact using public and proprietary reaction data. We find a disconnection +between high single-step performance and potential route-finding success, +suggesting that single-step models must be evaluated within synthesis planning +in the future. Furthermore, we show that the commonly used single-step +retrosynthesis benchmark dataset USPTO-50k is insufficient as this evaluation +task does not represent model performance and scalability on larger and more +diverse datasets. For multi-step synthesis planning, we show that the choice of +the single-step model can improve the overall success rate of synthesis +planning by up to +28% compared to the commonly used baseline model. Finally, +we show that each single-step model finds unique synthesis routes, and differs +in aspects such as route-finding success, the number of found synthesis routes, +and chemical validity, making the combination of single-step retrosynthesis +prediction and multi-step synthesis planning a crucial aspect when developing +future methods. + +
+
+ comment: The following authors contributed equally: Paula Torren-Peraire, Alan + Kai Hassen +
+
+
+
+
+ + ☆ On the Optimal Expressive Power of ReLU DNNs and Its Application in + Approximation with Kolmogorov Superposition Theorem + + +
+ This paper is devoted to studying the optimal expressive power of ReLU deep +neural networks (DNNs) and its application in approximation via the Kolmogorov +Superposition Theorem. We first constructively prove that any continuous +piecewise linear functions on $[0,1]$, comprising $O(N^2L)$ segments, can be +represented by ReLU DNNs with $L$ hidden layers and $N$ neurons per layer. +Subsequently, we demonstrate that this construction is optimal regarding the +parameter count of the DNNs, achieved through investigating the shattering +capacity of ReLU DNNs. Moreover, by invoking the Kolmogorov Superposition +Theorem, we achieve an enhanced approximation rate for ReLU DNNs of arbitrary +width and depth when dealing with continuous functions in high-dimensional +spaces. + +
+
+
+
+
+ + ☆ Quality Diversity under Sparse Reward and Sparse Interaction: + Application to Grasping in Robotics + + +
+ Quality-Diversity (QD) methods are algorithms that aim to generate a set of
+diverse and high-performing solutions to a given problem. Originally developed
+for evolutionary robotics, most QD studies are conducted on a limited set of
+domains - mainly applied to locomotion, where the fitness and the behavior
+signal are dense. Grasping is a crucial task for manipulation in robotics.
+Despite the efforts of many research communities, this task is yet to be
+solved. Grasping combines unprecedented challenges in the QD literature: it
+suffers from reward sparsity, behavioral sparsity, and behavior space
+misalignment. The present work studies how QD can address grasping. Experiments
+have been conducted on 15 different methods on 10 grasping domains,
+corresponding to 2 different robot-gripper setups and 5 standard objects. An
+evaluation framework that distinguishes the evaluation of an algorithm from its
+internal components has also been proposed for a fair comparison. The obtained
+results show that MAP-Elites variants that prioritize successful solutions
+outperform all the compared methods on the studied metrics by a large margin.
+We also found experimental evidence that sparse interaction can lead to
+deceptive novelty. To our knowledge, the ability to efficiently produce
+examples of grasping trajectories demonstrated in this work has no precedent in
+the literature.
+ 
+
+ comment: 37 pages, 17 figures. Draft version +
+
+
+
+
+ + ☆ LLM As DBA + + +
+ Database administrators (DBAs) play a crucial role in managing, maintaining
+and optimizing a database system to ensure data availability, performance, and
+reliability. However, it is hard and tedious for DBAs to manage a large number
+of database instances (e.g., millions of instances on cloud databases).
+Recently, large language models (LLMs) have shown great potential to understand
+valuable documents and accordingly generate reasonable answers. Thus, we
+propose D-Bot, an LLM-based database administrator that can continuously acquire
+database maintenance experience from textual sources, and provide reasonable,
+well-founded, in-time diagnosis and optimization advice for target databases.
+This paper presents a revolutionary LLM-centric framework for database
+maintenance, including (i) database maintenance knowledge detection from
+documents and tools, (ii) tree of thought reasoning for root cause analysis,
+and (iii) collaborative diagnosis among multiple LLMs. Our preliminary
+experimental results show that D-Bot can efficiently and effectively diagnose
+the root causes, and our code is available at
+github.com/TsinghuaDatabaseGroup/DB-GPT.
+ 
+
+
+
+
+ + ☆ Exploring Machine Learning and Transformer-based Approaches for + Deceptive Text Classification: A Comparative Analysis + + +
+ Deceptive text classification is a critical task in natural language
+processing that aims to identify deceptive or fraudulent content. This study
+presents a comparative analysis of machine learning and transformer-based
+approaches for deceptive text classification. We investigate the effectiveness
+of traditional machine learning algorithms and state-of-the-art transformer
+models, such as BERT, XLNet, DistilBERT, and RoBERTa, in detecting deceptive
+text. A labeled dataset consisting of deceptive and non-deceptive texts is used
+for training and evaluation purposes. Through extensive experimentation, we
+compare the performance metrics, including accuracy, precision, recall, and F1
+score, of the different approaches. The results of this study shed light on the
+strengths and limitations of machine learning and transformer-based methods for
+deceptive text classification, enabling researchers and practitioners to make
+informed decisions when dealing with deceptive content.
+ 
+
+ comment: 12 pages, 8 figures +
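As a concrete reference point for the "traditional machine learning" side of such a comparison, a TF-IDF plus logistic-regression baseline fits in a few lines of scikit-learn; the dataset file and column names below are hypothetical. The transformer side would typically fine-tune BERT or RoBERTa with the Hugging Face transformers library and report the same metrics.

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

df = pd.read_csv("deceptive_texts.csv")  # hypothetical columns: text, label
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, stratify=df["label"], random_state=0)

# Classical baseline: word/bigram TF-IDF features + logistic regression.
clf = make_pipeline(TfidfVectorizer(ngram_range=(1, 2), min_df=2),
                    LogisticRegression(max_iter=1000))
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))  # precision/recall/F1
```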
+
+
+
+
+ + ☆ Provably Efficient Algorithm for Nonstationary Low-Rank MDPs + + +
+ Reinforcement learning (RL) under changing environment models many real-world +applications via nonstationary Markov Decision Processes (MDPs), and hence +gains considerable interest. However, theoretical studies on nonstationary MDPs +in the literature have mainly focused on tabular and linear (mixture) MDPs, +which do not capture the nature of unknown representation in deep RL. In this +paper, we make the first effort to investigate nonstationary RL under episodic +low-rank MDPs, where both transition kernels and rewards may vary over time, +and the low-rank model contains unknown representation in addition to the +linear state embedding function. We first propose a parameter-dependent policy +optimization algorithm called PORTAL, and further improve PORTAL to its +parameter-free version of Ada-PORTAL, which is able to tune its +hyper-parameters adaptively without any prior knowledge of nonstationarity. For +both algorithms, we provide upper bounds on the average dynamic suboptimality +gap, which show that as long as the nonstationarity is not significantly large, +PORTAL and Ada-PORTAL are sample-efficient and can achieve arbitrarily small +average dynamic suboptimality gap with polynomial sample complexity. + +
+
+
+
+
+ + ☆ $\mathcal{G}^2Pxy$: Generative Open-Set Node Classification on Graphs + with Proxy Unknowns + + +
+ Node classification is the task of predicting the labels of unlabeled nodes
+in a graph. State-of-the-art methods based on graph neural networks achieve
+excellent performance when all labels are available during training. But in
+real life, models are often applied to data with new classes, which can lead to
+massive misclassification and thus significantly degrade performance. Hence,
+developing open-set classification methods is crucial to determine if a given
+sample belongs to a known class. Existing methods for open-set node
+classification generally use transductive learning with part or all of the
+features of real unseen class nodes to help with open-set classification. In
+this paper, we propose a novel generative open-set node classification method,
+i.e. $\mathcal{G}^2Pxy$, which follows a stricter inductive learning setting
+where no information about unknown classes is available during training and
+validation. Two kinds of proxy unknown nodes, inter-class unknown proxies and
+external unknown proxies, are generated via mixup to efficiently anticipate the
+distribution of novel classes. Using the generated proxies, a closed-set
+classifier can be transformed into an open-set one, by augmenting it with an
+extra proxy classifier. Under the constraints of both cross entropy loss and
+complement entropy loss, $\mathcal{G}^2Pxy$ achieves superior effectiveness for
+unknown class detection and known class classification, which is validated by
+experiments on benchmark graph datasets. Moreover, $\mathcal{G}^2Pxy$ does not
+have specific requirements on the GNN architecture and generalizes well.
+ 
+
+ comment: 8 pages, 1 figure +
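The proxy-generation step can be pictured as mixup between nodes of different known classes, as in the hedged sketch below; how the method actually chooses interpolation sites and weights for inter-class versus external proxies is more involved.

```python
import torch

def make_proxy_unknowns(x: torch.Tensor, y: torch.Tensor, beta: float = 1.0) -> torch.Tensor:
    """Generate proxy 'unknown-class' node features by mixing pairs of nodes
    from different known classes (illustrative sketch of the mixup idea)."""
    perm = torch.randperm(x.size(0))
    diff = y != y[perm]                          # only mix nodes with different labels
    n = int(diff.sum())
    lam = torch.distributions.Beta(beta, beta).sample((n, 1))
    return lam * x[diff] + (1 - lam) * x[perm][diff]

x = torch.randn(100, 16)                         # node features
y = torch.randint(0, 4, (100,))                  # known-class labels
proxies = make_proxy_unknowns(x, y)              # treated as an extra "unknown" class
```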
+
+
+
+
+ + ☆ A Forecaster's Review of Judea Pearl's Causality: Models, Reasoning and + Inference, Second Edition, 2009 + + +
+ With the great popularity and success of Judea Pearl's original causality
+book, this review covers the main topics updated in the second edition in 2009
+and illustrates an easy-to-follow causal inference strategy in a forecast
+scenario. It further discusses some potential benefits and challenges for
+causal inference with time series forecasting when modeling the
+counterfactuals, estimating the uncertainty and incorporating prior knowledge
+to estimate causal effects in different forecasting scenarios.
+ 
+
+
+
+
+ + ☆ Explainable AI applications in the Medical Domain: a systematic review + + +
+ Artificial Intelligence in Medicine has made significant progress with
+emerging applications in medical imaging, patient care, and other areas. While
+these applications have proven successful in retrospective studies, very few of
+them have been applied in practice. The field of medical AI faces various
+challenges in terms of building user trust, complying with regulations, and
+using data ethically. Explainable AI (XAI) aims to enable humans to understand
+AI and trust its results. This paper presents a literature review on the recent
+developments of XAI solutions for medical decision support, based on a
+representative sample of 198 articles published in recent years. The systematic
+synthesis of the relevant articles resulted in several findings: (1)
+model-agnostic XAI techniques were mostly employed in these solutions, (2) deep
+learning models are utilized more than other types of machine learning models,
+(3) explainability was applied to promote trust, but very few works reported
+the physicians' participation in the loop, (4) visual and interactive user
+interfaces are more useful in understanding the explanation and the
+recommendation of the system. More research is needed on collaboration between
+medical and AI experts, which could guide the development of suitable
+frameworks for the design, implementation, and evaluation of XAI solutions in
+medicine.
+ 
+
+
+
+
+ + ☆ A Comparative Assessment of Multi-view fusion learning for Crop + Classification + + +
+ With a rapidly increasing amount and diversity of remote sensing (RS) data +sources, there is a strong need for multi-view learning modeling. This is a +complex task when considering the differences in resolution, magnitude, and +noise of RS data. The typical approach for merging multiple RS sources has been +input-level fusion, but other - more advanced - fusion strategies may +outperform this traditional approach. This work assesses different fusion +strategies for crop classification in the CropHarvest dataset. The fusion +methods proposed in this work outperform models based on individual views and +previous fusion methods. We do not find one single fusion method that +consistently outperforms all other approaches. Instead, we present a comparison +of multi-view fusion methods for three different datasets and show that, +depending on the test region, different methods obtain the best performance. +Despite this, we suggest a preliminary criterion for the selection of fusion +methods. + +
+
+ comment: Accepted at IEEE International Geoscience and Remote Sensing + Symposium 2023 +
+
+
+
+
+ + ☆ Product Review Image Ranking for Fashion E-commerce SIGIR + + +
+ In a fashion e-commerce platform where customers can't physically examine the
+products on their own, being able to see other customers' text and image
+reviews of the product is critical while making purchase decisions. Given the
+high reliance on these reviews, over the years we have observed customers
+proactively sharing their reviews. With an increase in the coverage of User
+Generated Content (UGC), there has been a corresponding increase in the number
+of customer images. It is thus imperative to display the most relevant images
+on top as it may influence users' online shopping choices and behavior. In this
+paper, we propose a simple yet effective training procedure for ranking
+customer images. We created a dataset consisting of Myntra (a major Indian
+fashion e-commerce company) studio posts and highly engaged (upvotes/downvotes)
+UGC images as our starting point and used selected distortion techniques on the
+images of the above dataset to bring their quality on par with that of bad UGC
+images. We train our network to rank bad-quality images lower than high-quality
+ones. Our proposed method outperforms the baseline models on two metrics,
+namely correlation coefficient and accuracy, by substantial margins.
+ 
+
+ comment: Accepted in Proceedings of ACM SIGIR Workshop on eCommerce (SIGIR + eCom'22) +
+
+
+
+
+ + ☆ Trustworthy LLMs: a Survey and Guideline for Evaluating Large Language + Models' Alignment + + +
+ Ensuring alignment, which refers to making models behave in accordance with +human intentions [1,2], has become a critical task before deploying large +language models (LLMs) in real-world applications. For instance, OpenAI devoted +six months to iteratively aligning GPT-4 before its release [3]. However, a +major challenge faced by practitioners is the lack of clear guidance on +evaluating whether LLM outputs align with social norms, values, and +regulations. This obstacle hinders systematic iteration and deployment of LLMs. +To address this issue, this paper presents a comprehensive survey of key +dimensions that are crucial to consider when assessing LLM trustworthiness. The +survey covers seven major categories of LLM trustworthiness: reliability, +safety, fairness, resistance to misuse, explainability and reasoning, adherence +to social norms, and robustness. Each major category is further divided into +several sub-categories, resulting in a total of 29 sub-categories. +Additionally, a subset of 8 sub-categories is selected for further +investigation, where corresponding measurement studies are designed and +conducted on several widely-used LLMs. The measurement results indicate that, +in general, more aligned models tend to perform better in terms of overall +trustworthiness. However, the effectiveness of alignment varies across the +different trustworthiness categories considered. This highlights the importance +of conducting more fine-grained analyses, testing, and making continuous +improvements on LLM alignment. By shedding light on these key dimensions of LLM +trustworthiness, this paper aims to provide valuable insights and guidance to +practitioners in the field. Understanding and addressing these concerns will be +crucial in achieving reliable and ethically sound deployment of LLMs in various +applications. + +
+
+
+
+
+ + ☆ Flexible Isosurface Extraction for Gradient-Based Mesh Optimization SIGGRAPH 2023 + + +
+ This work considers gradient-based mesh optimization, where we iteratively +optimize for a 3D surface mesh by representing it as the isosurface of a scalar +field, an increasingly common paradigm in applications including +photogrammetry, generative modeling, and inverse physics. Existing +implementations adapt classic isosurface extraction algorithms like Marching +Cubes or Dual Contouring; these techniques were designed to extract meshes from +fixed, known fields, and in the optimization setting they lack the degrees of +freedom to represent high-quality feature-preserving meshes, or suffer from +numerical instabilities. We introduce FlexiCubes, an isosurface representation +specifically designed for optimizing an unknown mesh with respect to geometric, +visual, or even physical objectives. Our main insight is to introduce +additional carefully-chosen parameters into the representation, which allow +local flexible adjustments to the extracted mesh geometry and connectivity. +These parameters are updated along with the underlying scalar field via +automatic differentiation when optimizing for a downstream task. We base our +extraction scheme on Dual Marching Cubes for improved topological properties, +and present extensions to optionally generate tetrahedral and +hierarchically-adaptive meshes. Extensive experiments validate FlexiCubes on +both synthetic benchmarks and real-world applications, showing that it offers +significant improvements in mesh quality and geometric fidelity. + +
+
+ comment: SIGGRAPH 2023. Project page: + https://research.nvidia.com/labs/toronto-ai/flexicubes/ +
+
+
+
+
+ + ☆ Machine Learning aided Computer Architecture Design for CNN Inferencing + Systems + + +
+ Efficient and timely calculations of Machine Learning (ML) algorithms are +essential for emerging technologies like autonomous driving, the Internet of +Things (IoT), and edge computing. One of the primary ML algorithms used in such +systems is Convolutional Neural Networks (CNNs), which demand high +computational resources. This requirement has led to the use of ML accelerators +like GPGPUs to meet design constraints. However, selecting the most suitable +accelerator involves Design Space Exploration (DSE), a process that is usually +time-consuming and requires significant manual effort. Our work presents +approaches to expedite the DSE process by identifying the most appropriate +GPGPU for CNN inferencing systems. We have developed a quick and precise +technique for forecasting the power and performance of CNNs during inference, +with a MAPE of 5.03% and 5.94%, respectively. Our approach empowers computer +architects to estimate power and performance in the early stages of +development, reducing the necessity for numerous prototypes. This saves time +and money while also improving the time-to-market period. + +
+
+
+
+
+ + ☆ FINER: Enhancing State-of-the-art Classifiers with Feature Attribution + to Facilitate Security Analysis + + +
+ Deep learning classifiers achieve state-of-the-art performance in various +risk detection applications. They explore rich semantic representations and are +supposed to automatically discover risk behaviors. However, due to the lack of +transparency, the behavioral semantics cannot be conveyed to downstream +security experts to reduce their heavy workload in security analysis. Although +feature attribution (FA) methods can be used to explain deep learning, the +underlying classifier is still blind to what behavior is suspicious, and the +generated explanation cannot adapt to downstream tasks, incurring poor +explanation fidelity and intelligibility. In this paper, we propose FINER, the +first framework for risk detection classifiers to generate high-fidelity and +high-intelligibility explanations. The high-level idea is to gather explanation +efforts from model developer, FA designer, and security experts. To improve +fidelity, we fine-tune the classifier with an explanation-guided multi-task +learning strategy. To improve intelligibility, we engage task knowledge to +adjust and ensemble FA methods. Extensive evaluations show that FINER improves +explanation quality for risk detection. Moreover, we demonstrate that FINER +outperforms a state-of-the-art tool in facilitating malware analysis. + +
+
+
+
+
+ + ☆ Preemptive Detection of Fake Accounts on Social Networks via Multi-Class + Preferential Attachment Classifiers + + +
+ In this paper, we describe a new algorithm called Preferential Attachment +k-class Classifier (PreAttacK) for detecting fake accounts in a social network. +Recently, several algorithms have obtained high accuracy on this problem. +However, they have done so by relying on information about fake accounts' +friendships or the content they share with others--the very things we seek to +prevent. + PreAttacK represents a significant departure from these approaches. We +provide some of the first detailed distributional analyses of how new fake (and +real) accounts first attempt to request friends after joining a major network +(Facebook). We show that even before a new account has made friends or shared +content, these initial friend request behaviors evoke a natural multi-class +extension of the canonical Preferential Attachment model of social network +growth. + We use this model to derive a new algorithm, PreAttacK. We prove that in +relevant problem instances, PreAttacK near-optimally approximates the posterior +probability that a new account is fake under this multi-class Preferential +Attachment model of new accounts' (not-yet-answered) friend requests. These are +the first provable guarantees for fake account detection that apply to new +users, and that do not require strong homophily assumptions. + This principled approach also makes PreAttacK the only algorithm with +provable guarantees that obtains state-of-the-art performance on new users on +the global Facebook network, where it converges to AUC=0.9 after new users send ++ receive a total of just 20 not-yet-answered friend requests. For comparison, +state-of-the-art benchmarks do not obtain this AUC even after observing +additional data on new users' first 100 friend requests. Thus, unlike +mainstream algorithms, PreAttacK converges before the median new fake account +has made a single friendship (accepted friend request) with a human. + +
+
+
+
+
+ + ☆ RTLLM: An Open-Source Benchmark for Design RTL Generation with Large + Language Model + + +
+ Inspired by the recent success of large language models (LLMs) like ChatGPT,
+researchers have started to explore the adoption of LLMs for agile hardware
+design, such as generating design RTL based on natural-language instructions.
+However, in existing works, the target designs are all relatively simple and
+small in scale, and proposed by the authors themselves, making a fair
+comparison among different LLM solutions challenging. In addition, many prior
+works only focus on design correctness, without evaluating the design quality
+of the generated design RTL. In this work, we propose an open-source benchmark
+named RTLLM, for generating design RTL with natural language instructions. To
+systematically evaluate the auto-generated design RTL, we summarize three
+progressive goals, namely the syntax goal, functionality goal, and design
+quality goal. This benchmark can automatically provide a quantitative
+evaluation of any given LLM-based solution. Furthermore, we propose an
+easy-to-use yet surprisingly effective prompt engineering technique named
+self-planning, which proves to significantly boost the performance of GPT-3.5
+in our proposed benchmark.
+ 
+
+
+
+
+ + ☆ OpenProteinSet: Training data for structural biology at scale + + +
+ Multiple sequence alignments (MSAs) of proteins encode rich biological +information and have been workhorses in bioinformatic methods for tasks like +protein design and protein structure prediction for decades. Recent +breakthroughs like AlphaFold2 that use transformers to attend directly over +large quantities of raw MSAs have reaffirmed their importance. Generation of +MSAs is highly computationally intensive, however, and no datasets comparable +to those used to train AlphaFold2 have been made available to the research +community, hindering progress in machine learning for proteins. To remedy this +problem, we introduce OpenProteinSet, an open-source corpus of more than 16 +million MSAs, associated structural homologs from the Protein Data Bank, and +AlphaFold2 protein structure predictions. We have previously demonstrated the +utility of OpenProteinSet by successfully retraining AlphaFold2 on it. We +expect OpenProteinSet to be broadly useful as training and validation data for +1) diverse tasks focused on protein structure, function, and design and 2) +large-scale multimodal machine learning research. + +
+
+
+
+
+ + ☆ Homophily-enhanced Structure Learning for Graph Clustering + + +
+ Graph clustering is a fundamental task in graph analysis, and recent advances +in utilizing graph neural networks (GNNs) have shown impressive results. +Despite the success of existing GNN-based graph clustering methods, they often +overlook the quality of graph structure, which is inherent in real-world graphs +due to their sparse and multifarious nature, leading to subpar performance. +Graph structure learning allows refining the input graph by adding missing +links and removing spurious connections. However, previous endeavors in graph +structure learning have predominantly centered around supervised settings, and +cannot be directly applied to our specific clustering tasks due to the absence +of ground-truth labels. To bridge the gap, we propose a novel method called +\textbf{ho}mophily-enhanced structure \textbf{le}arning for graph clustering +(HoLe). Our motivation stems from the observation that subtly enhancing the +degree of homophily within the graph structure can significantly improve GNNs +and clustering outcomes. To realize this objective, we develop two +clustering-oriented structure learning modules, i.e., hierarchical correlation +estimation and cluster-aware sparsification. The former module enables a more +accurate estimation of pairwise node relationships by leveraging guidance from +latent and clustering spaces, while the latter one generates a sparsified +structure based on the similarity matrix and clustering assignments. +Additionally, we devise a joint optimization approach alternating between +training the homophily-enhanced structure learning and GNN-based clustering, +thereby enforcing their reciprocal effects. Extensive experiments on seven +benchmark datasets of various types and scales, across a range of clustering +metrics, demonstrate the superiority of HoLe against state-of-the-art +baselines. + +
+
+ comment: 11 pages with 7 figures +
+
+
+
+
+ + ☆ From CNN to Transformer: A Review of Medical Image Segmentation Models + + +
+ Medical image segmentation is an important step in medical image analysis,
+especially as a crucial prerequisite for efficient disease diagnosis and
+treatment. The use of deep learning for image segmentation has become a
+prevalent trend. The widely adopted approach currently is U-Net and its
+variants. Additionally, with the remarkable success of pre-trained models in
+natural language processing tasks, transformer-based models like TransUNet have
+achieved desirable performance on multiple medical image segmentation datasets.
+In this paper, we survey the four most representative medical image
+segmentation models of recent years. We theoretically analyze the
+characteristics of these models and quantitatively evaluate their performance
+on two benchmark datasets (i.e., Tuberculosis Chest X-rays and ovarian tumors).
+Finally, we discuss the main challenges and future trends in medical image
+segmentation. Our work can help researchers in the related field quickly
+establish medical segmentation models tailored to specific regions.
+ 
+
+ comment: 18 pages, 8 figures +
+
+
+
+
+ + ☆ Byzantine-Robust Decentralized Stochastic Optimization with Stochastic + Gradient Noise-Independent Learning Error + + +
+ This paper studies Byzantine-robust stochastic optimization over a +decentralized network, where every agent periodically communicates with its +neighbors to exchange local models, and then updates its own local model by +stochastic gradient descent (SGD). The performance of such a method is affected +by an unknown number of Byzantine agents, which conduct adversarially during +the optimization process. To the best of our knowledge, there is no existing +work that simultaneously achieves a linear convergence speed and a small +learning error. We observe that the learning error is largely dependent on the +intrinsic stochastic gradient noise. Motivated by this observation, we +introduce two variance reduction methods, stochastic average gradient algorithm +(SAGA) and loopless stochastic variance-reduced gradient (LSVRG), to +Byzantine-robust decentralized stochastic optimization for eliminating the +negative effect of the stochastic gradient noise. The two resulting methods, +BRAVO-SAGA and BRAVO-LSVRG, enjoy both linear convergence speeds and stochastic +gradient noise-independent learning errors. Such learning errors are optimal +for a class of methods based on total variation (TV)-norm regularization and +stochastic subgradient update. We conduct extensive numerical experiments to +demonstrate their effectiveness under various Byzantine attacks. + +
+
+
+
+
+ + ☆ Investigating disaster response through social media data and the + Susceptible-Infected-Recovered (SIR) model: A case study of 2020 Western U.S. + wildfire season + + +
+ Effective disaster response is critical for affected communities. Responders
+and decision-makers would benefit from reliable, timely measures of the issues
+impacting their communities during a disaster, and social media offers a
+potentially rich data source. Social media can reflect public concerns and
+demands during a disaster, offering valuable insights for decision-makers to
+understand evolving situations and optimize resource allocation. We used
+Bidirectional Encoder Representations from Transformers (BERT) topic modeling
+to cluster topics from Twitter data. Then, we conducted a temporal-spatial
+analysis to examine the distribution of these topics across different regions
+during the 2020 western U.S. wildfire season. Our results show that Twitter
+users mainly focused on three topics: "health impact," "damage," and
+"evacuation." We used the Susceptible-Infected-Recovered (SIR) theory to
+explore the magnitude and velocity of topic diffusion on Twitter. The results
+displayed a clear relationship between topic trends and wildfire propagation
+patterns. The estimated parameters obtained from the SIR model in selected
+cities revealed that residents exhibited high levels of concern about several
+issues during the wildfire. Our study details how the SIR model and topic
+modeling using social media data can provide decision-makers with a
+quantitative approach to measure disaster response and support their
+decision-making processes.
+ 
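Fitting an SIR curve to daily topic-mention counts can be done with a standard ODE integrator and a least-squares objective, as in the sketch below; the synthetic counts, assumed population size, and loss are placeholders rather than the paper's estimation procedure.

```python
import numpy as np
from scipy.integrate import odeint
from scipy.optimize import minimize

def sir(y, t, beta, gamma, n):
    s, i, r = y
    return [-beta * s * i / n, beta * s * i / n - gamma * i, gamma * i]

days = np.arange(30)
# Stand-in for daily counts of tweets mentioning a topic (synthetic data).
mentions = np.random.poisson(200 * np.exp(-((days - 10) ** 2) / 40))
n = 10000.0  # assumed susceptible population size

def loss(params):
    beta, gamma = params
    y0 = [n - mentions[0], mentions[0], 0.0]
    i_pred = odeint(sir, y0, days, args=(beta, gamma, n))[:, 1]
    return np.mean((i_pred - mentions) ** 2)

fit = minimize(loss, x0=[0.5, 0.2], bounds=[(1e-3, 2), (1e-3, 2)])
beta_hat, gamma_hat = fit.x
print(f"beta={beta_hat:.3f}, gamma={gamma_hat:.3f}, R0={beta_hat / gamma_hat:.2f}")
```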
+
+
+
+
+ + ☆ Cross-heterogeneity Graph Few-shot Learning + + +
+ In recent years, heterogeneous graph few-shot learning has been proposed to +address the label sparsity issue in heterogeneous graphs (HGs), which contain +various types of nodes and edges. The existing methods have achieved good +performance by transferring generalized knowledge extracted from rich-labeled +classes in source HG(s) to few-labeled classes in a target HG. However, these +methods only consider the single-heterogeneity scenario where the source and +target HGs share a fixed set of node/edge types, ignoring the more general +scenario of cross-heterogeneity, where each HG can have a different and +non-fixed set of node/edge types. To this end, we focus on the unexplored +cross-heterogeneity scenario and propose a novel model for Cross-heterogeneity +Graph Few-shot Learning, namely CGFL. In CGFL, we first extract meta-patterns +to capture heterogeneous information and propose a multi-view heterogeneous +graph neural network (MHGN) to learn meta-patterns across HGs. Then, we propose +a score module to measure the informativeness of labeled samples and determine +the transferability of each source HG. Finally, by integrating MHGN and the +score module into a meta-learning mechanism, CGFL can effectively transfer +generalized knowledge to predict new classes with few-labeled data. Extensive +experiments on four real-world datasets have demonstrated the superior +performance of CGFL over the state-of-the-art methods. + +
+
+
+
+
+ + ☆ Preemptive Detection of Fake Accounts on Social Networks via Multi-Class + Preferential Attachment Classifiers + + +
+ In this paper, we describe a new algorithm called Preferential Attachment +k-class Classifier (PreAttacK) for detecting fake accounts in a social network. +Recently, several algorithms have obtained high accuracy on this problem. +However, they have done so by relying on information about fake accounts' +friendships or the content they share with others--the very things we seek to +prevent. PreAttacK represents a significant departure from these approaches. We +provide some of the first detailed distributional analyses of how new fake (and +real) accounts first attempt to request friends after joining a major network +(Facebook). We show that even before a new account has made friends or shared +content, these initial friend request behaviors evoke a natural multi-class +extension of the canonical Preferential Attachment model of social network +growth. We use this model to derive a new algorithm, PreAttacK. We prove that +in relevant problem instances, PreAttacK near-optimally approximates the +posterior probability that a new account is fake under this multi-class +Preferential Attachment model of new accounts' (not-yet-answered) friend +requests. These are the first provable guarantees for fake account detection +that apply to new users, and that do not require strong homophily assumptions. +This principled approach also makes PreAttacK the only algorithm with provable +guarantees that obtains state-of-the-art performance on new users on the global +Facebook network, where it converges to AUC=0.9 after new users send + receive +a total of just 20 not-yet-answered friend requests. For comparison, +state-of-the-art benchmarks do not obtain this AUC even after observing +additional data on new users' first 100 friend requests. Thus, unlike +mainstream algorithms, PreAttacK converges before the median new fake account +has made a single friendship (accepted friend request) with a human. + +
+
+
+
+
+ + ☆ GPLaSDI: Gaussian Process-based Interpretable Latent Space Dynamics + Identification through Deep Autoencoder + + +
+ Numerically solving partial differential equations (PDEs) can be challenging +and computationally expensive. This has led to the development of reduced-order +models (ROMs) that are accurate but faster than full order models (FOMs). +Recently, machine learning advances have enabled the creation of non-linear +projection methods, such as Latent Space Dynamics Identification (LaSDI). LaSDI +maps full-order PDE solutions to a latent space using autoencoders and learns +the system of ODEs governing the latent space dynamics. By interpolating and +solving the ODE system in the reduced latent space, fast and accurate ROM +predictions can be made by feeding the predicted latent space dynamics into the +decoder. In this paper, we introduce GPLaSDI, a novel LaSDI-based framework +that relies on Gaussian process (GP) for latent space ODE interpolations. Using +GPs offers two significant advantages. First, it enables the quantification of +uncertainty over the ROM predictions. Second, leveraging this prediction +uncertainty allows for efficient adaptive training through a greedy selection +of additional training data points. This approach does not require prior +knowledge of the underlying PDEs. Consequently, GPLaSDI is inherently +non-intrusive and can be applied to problems without a known PDE or its +residual. We demonstrate the effectiveness of our approach on the Burgers +equation, Vlasov equation for plasma physics, and a rising thermal bubble +problem. Our proposed method achieves between 200 and 100,000 times speed-up, +with up to 7% relative error. + +
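To make the GP-interpolation step concrete, the sketch below interpolates latent-ODE coefficients over the PDE parameter space with a Gaussian process and uses the predictive standard deviation to pick the next full-order simulation greedily. The data, kernel, and selection rule are illustrative assumptions in the spirit of the described workflow, not the authors' implementation.

```python
# Hedged sketch of GP interpolation with uncertainty-driven greedy sampling.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

# pretend these ODE coefficients were identified at 4 sampled PDE parameters
params = np.array([[0.1], [0.4], [0.7], [1.0]])
coeffs = np.sin(3 * params).ravel()          # stand-in for identified coefficients

gp = GaussianProcessRegressor(kernel=RBF(length_scale=0.3), alpha=1e-6)
gp.fit(params, coeffs)

candidates = np.linspace(0.0, 1.0, 101).reshape(-1, 1)
mean, std = gp.predict(candidates, return_std=True)

next_param = candidates[np.argmax(std)]      # greedy: most uncertain parameter
print("run the next full-order simulation at", next_param)
```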
+
+
+
+
+ + ☆ Aphid Cluster Recognition and Detection in the Wild Using Deep Learning + Models + + +
+ Aphid infestation poses a significant threat to crop production, rural +communities, and global food security. While chemical pest control is crucial +for maximizing yields, applying chemicals across entire fields is both +environmentally unsustainable and costly. Hence, precise localization and +management of aphids are essential for targeted pesticide application. The +paper primarily focuses on using deep learning models for detecting aphid +clusters. We propose a novel approach for estimating infection levels by +detecting aphid clusters. To facilitate this research, we have captured a +large-scale dataset from sorghum fields, manually selected 5,447 images +containing aphids, and annotated each individual aphid cluster within these +images. To facilitate the use of machine learning models, we further process +the images by cropping them into patches, resulting in a labeled dataset +comprising 151,380 image patches. Then, we implemented and compared the +performance of four state-of-the-art object detection models (VFNet, GFLV2, +PAA, and ATSS) on the aphid dataset. Extensive experimental results show that +all models yield similarly stable performance in terms of average precision and +recall. We then propose to merge close neighboring clusters and remove tiny +clusters caused by cropping, which further boosts performance by around +17%. The study demonstrates the feasibility of automatically detecting and +managing insects using machine learning models. The labeled dataset will be +made openly available to the research community.
+
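A small sketch of the post-processing step the abstract mentions: drop tiny detections (likely artifacts of patch cropping) and merge boxes that sit very close together. The thresholds and merge rule are illustrative, not the paper's exact values.

```python
# Hedged sketch of "remove tiny clusters, merge close neighbors" box post-processing.
def box_gap(a, b):
    """Smallest axis-aligned gap between two [x1, y1, x2, y2] boxes (0 if they overlap)."""
    dx = max(a[0] - b[2], b[0] - a[2], 0)
    dy = max(a[1] - b[3], b[1] - a[3], 0)
    return max(dx, dy)

def postprocess(boxes, min_area=100.0, merge_gap=10.0):
    boxes = [b for b in boxes
             if (b[2] - b[0]) * (b[3] - b[1]) >= min_area]   # drop tiny boxes
    merged = True
    while merged:
        merged = False
        for i in range(len(boxes)):
            for j in range(i + 1, len(boxes)):
                if box_gap(boxes[i], boxes[j]) <= merge_gap:
                    a, b = boxes[i], boxes[j]
                    union = [min(a[0], b[0]), min(a[1], b[1]),
                             max(a[2], b[2]), max(a[3], b[3])]
                    boxes = [boxes[k] for k in range(len(boxes)) if k not in (i, j)]
                    boxes.append(union)
                    merged = True
                    break
            if merged:
                break
    return boxes

print(postprocess([[0, 0, 20, 20], [25, 0, 50, 20], [300, 300, 303, 303]]))
# -> [[0, 0, 50, 20]]: the two close boxes merge, the 3x3 fragment is removed
```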
+
+
+
+
+ + ☆ Composable Core-sets for Diversity Approximation on Multi-Dataset + Streams + + +
+ Core-sets refer to subsets of data that maximize some function that is +commonly a diversity or group requirement. These subsets are used in place of +the original data to accomplish a given task with comparable or even enhanced +performance if biases are removed. Composable core-sets are core-sets with the +property that subsets of the core-set can be unioned together to obtain an +approximation for the original data, lending themselves to use with streamed +or distributed data. Recent work has focused on the use of core-sets for +training machine learning models. Preceding solutions such as CRAIG have been +proven to approximate gradient descent while providing a reduced training time. +In this paper, we introduce a core-set construction algorithm for constructing +composable core-sets to summarize streamed data for use in active learning +environments. If combined with techniques such as CRAIG and heuristics to +enhance construction speed, composable core-sets could be used for real-time +training of models when the amount of sensor data is large. We provide an +empirical analysis by considering extrapolated data for the runtime of such a +brute-force algorithm. This algorithm is then analyzed for efficiency through +averaged empirical regression, and key results and improvements are suggested +for further research on the topic.
+
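The composability property can be illustrated with a generic diversity core-set: summarize each stream chunk with greedy farthest-point selection, then union the per-chunk summaries and summarize again. This is a standard construction consistent with the description, not the paper's specific algorithm.

```python
# Hedged sketch of a composable diversity core-set over streamed chunks.
import numpy as np

def greedy_diverse_coreset(points, k):
    """Gonzalez-style farthest-point selection of k diverse points."""
    chosen = [points[0]]
    dists = np.linalg.norm(points - chosen[0], axis=1)
    for _ in range(k - 1):
        idx = int(np.argmax(dists))
        chosen.append(points[idx])
        dists = np.minimum(dists, np.linalg.norm(points - points[idx], axis=1))
    return np.array(chosen)

rng = np.random.default_rng(0)
chunks = [rng.normal(size=(500, 2)) + offset for offset in (0.0, 5.0, 10.0)]

partial = [greedy_diverse_coreset(chunk, k=10) for chunk in chunks]   # per-chunk summaries
combined = greedy_diverse_coreset(np.vstack(partial), k=10)           # composed summary
print(combined.shape)   # (10, 2): summary of ~1500 streamed points
```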
+
+
+
+
+ + ☆ Revisiting N-CNN for Clinical Practice MICCAI + + +
+ This paper revisits the Neonatal Convolutional Neural Network (N-CNN) by +optimizing its hyperparameters and evaluating how they affect its +classification metrics, explainability and reliability, discussing their +potential impact in clinical practice. We have chosen hyperparameters that do +not modify the original N-CNN architecture, but mainly modify its learning rate +and training regularization. The optimization was done by evaluating the +improvement in F1 Score for each hyperparameter individually, and the best +hyperparameters were chosen to create a Tuned N-CNN. We also applied soft +labels derived from the Neonatal Facial Coding System, proposing a novel +approach for training facial expression classification models for neonatal pain +assessment. Interestingly, while the Tuned N-CNN results point towards +improvements in classification metrics and explainability, these improvements +did not directly translate to calibration performance. We believe that such +insights might have the potential to contribute to the development of more +reliable pain evaluation tools for newborns, aiding healthcare professionals in +delivering appropriate interventions and improving patient outcomes. + +
+
+
 comment: AICAI 2023 in conjunction with MICCAI
+
+
+
+
+ + ☆ UFed-GAN: A Secure Federated Learning Framework with Constrained + Computation and Unlabeled Data + + +
+ To satisfy the broad applications and insatiable hunger for deploying +low-latency multimedia data classification and data privacy in a cloud-based +setting, federated learning (FL) has emerged as an important learning paradigm. +For the practical cases involving limited computational power and only +unlabeled data in many wireless communications applications, this work +investigates the FL paradigm in a resource-constrained and label-missing +environment. Specifically, we propose a novel framework, UFed-GAN: +Unsupervised Federated Generative Adversarial Network, which can capture the +user-side data distribution without local classification training. We also +analyze the convergence and privacy of the proposed UFed-GAN. Our experimental +results demonstrate the strong potential of UFed-GAN in addressing limited +computational resources and unlabeled data while preserving privacy.
+
+
+
+
+
+ + ☆ Using Twitter Data to Determine Hurricane Category: An Experiment SC + + +
+ Social media posts contain an abundant amount of information about public +opinion on major events, especially natural disasters such as hurricanes. Posts +related to an event are usually published by users who live near the place +of the event at the time of the event. Specific correlations between the social +media data and the events can be obtained using data mining approaches. This +paper presents research work to find the mappings between social media data and +the severity level of a disaster. Specifically, we have investigated the +Twitter data posted during hurricanes Harvey and Irma, and attempted to find +the correlation between the Twitter data of a specific area and the hurricane +level in that area. Our experimental results indicate a positive correlation +between them. We also present a method to predict the hurricane category for a +specific area using relevant Twitter data.
+
+
+ comment: 9 Pages, 6 Figures, in Proceedings of the 15th ISCRAM Conference + Rochester, NY, USA May 2018 +
+
+
+
+
+ + ☆ The Multi-modality Cell Segmentation Challenge: Towards Universal + Solutions NeurIPS22 + + +
+ Cell segmentation is a critical step for quantitative single-cell analysis in +microscopy images. Existing cell segmentation methods are often tailored to +specific modalities or require manual interventions to specify hyperparameters +in different experimental settings. Here, we present a multi-modality cell +segmentation benchmark, comprising over 1500 labeled images derived from more +than 50 diverse biological experiments. The top participants developed a +Transformer-based deep-learning algorithm that not only exceeds existing +methods, but can also be applied to diverse microscopy images across imaging +platforms and tissue types without manual parameter adjustments. This benchmark +and the improved algorithm offer promising avenues for more accurate and +versatile cell analysis in microscopy imaging. + +
+
+ comment: NeurIPS22 Cell Segmentation Challenge: + https://neurips22-cellseg.grand-challenge.org/ +
+
+
+
+
+ + ☆ Knowledge Propagation over Conditional Independence Graphs + + +
+ A Conditional Independence (CI) graph is a special type of Probabilistic +Graphical Model (PGM) in which the feature connections are modeled using an +undirected graph and the edge weights show the partial correlation strength +between the features. Since CI graphs capture direct dependence between +features, they have been garnering increasing interest within the research +community for gaining insights into systems from various domains, in +particular for discovering the domain topology. In this work, we propose algorithms +for performing knowledge propagation over CI graphs. Our experiments +demonstrate that our techniques improve upon the state-of-the-art on the +publicly available Cora and PubMed datasets.
+
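As background for the idea, the sketch below builds a CI graph from partial correlations (derived from the precision matrix) and then propagates node scores over the weighted graph with a generic neighborhood-averaging rule. The propagation rule is an assumption for illustration, not necessarily the paper's algorithm.

```python
# Hedged sketch: CI graph from partial correlations + simple score propagation.
import numpy as np

def partial_correlations(X):
    prec = np.linalg.inv(np.cov(X, rowvar=False))
    d = np.sqrt(np.diag(prec))
    pcorr = -prec / np.outer(d, d)
    np.fill_diagonal(pcorr, 0.0)          # no self-edges
    return pcorr

def propagate(weights, scores, alpha=0.5, iters=20):
    W = np.abs(weights)
    W = W / np.maximum(W.sum(axis=1, keepdims=True), 1e-12)   # row-normalize
    out = scores.copy()
    for _ in range(iters):
        out = alpha * scores + (1 - alpha) * W @ out          # keep the seed signal
    return out

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
X[:, 1] += 0.8 * X[:, 0]                   # make features 0 and 1 directly dependent
graph = partial_correlations(X)
seed = np.array([1.0, 0, 0, 0, 0])         # knowledge starts at feature 0
print(np.round(propagate(graph, seed), 3)) # feature 1 receives the most propagated mass
```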
+
+
+
+
+ + ☆ GaborPINN: Efficient physics informed neural networks using + multiplicative filtered networks + + +
+ The computation of the seismic wavefield by solving the Helmholtz equation is +crucial to many practical applications, e.g., full waveform inversion. +Physics-informed neural networks (PINNs) provide functional wavefield solutions +represented by neural networks (NNs), but their convergence is slow. To address +this problem, we propose a modified PINN using multiplicative filtered +networks, which embeds some of the known characteristics of the wavefield in +training, e.g., frequency, to achieve much faster convergence. Specifically, we +use the Gabor basis function due to its proven ability to represent wavefields +accurately and refer to the implementation as GaborPINN. Meanwhile, we +incorporate prior information on the frequency of the wavefield into the design +of the method to mitigate the influence of discontinuities in the wavefield +represented by GaborPINN. The proposed method achieves up to a +two-order-of-magnitude increase in convergence speed compared with conventional PINNs.
+
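For readers unfamiliar with multiplicative filtered networks, the sketch below stacks Gabor filter layers combined multiplicatively, with a frequency scale that would be set from prior knowledge of the wavefield. This is a generic MFN-style block under our own assumptions, not the authors' exact architecture or training setup.

```python
# Hedged sketch of a Gabor multiplicative filter network (coordinates -> field value).
import torch
import torch.nn as nn

class GaborFilter(nn.Module):
    def __init__(self, in_dim, out_dim, omega0=30.0):
        super().__init__()
        self.freq = nn.Linear(in_dim, out_dim)
        self.mu = nn.Parameter(torch.rand(out_dim, in_dim))
        self.gamma = nn.Parameter(torch.ones(out_dim))
        self.omega0 = omega0              # assumed frequency prior

    def forward(self, x):
        dist2 = ((x.unsqueeze(1) - self.mu) ** 2).sum(-1)     # (N, out_dim)
        return torch.exp(-0.5 * self.gamma * dist2) * torch.sin(self.omega0 * self.freq(x))

class GaborMFN(nn.Module):
    def __init__(self, in_dim=2, hidden=64, out_dim=1, layers=3):
        super().__init__()
        self.filters = nn.ModuleList(GaborFilter(in_dim, hidden) for _ in range(layers))
        self.linears = nn.ModuleList(nn.Linear(hidden, hidden) for _ in range(layers - 1))
        self.out = nn.Linear(hidden, out_dim)

    def forward(self, x):
        z = self.filters[0](x)
        for lin, filt in zip(self.linears, self.filters[1:]):
            z = lin(z) * filt(x)          # multiplicative combination of filters
        return self.out(z)

u = GaborMFN()(torch.rand(16, 2))         # e.g. (x, z) coordinates -> wavefield value
print(u.shape)                            # torch.Size([16, 1])
```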
+
+
+
+
+ + ☆ FLShield: A Validation Based Federated Learning Framework to Defend + Against Poisoning Attacks + + +
+ Federated learning (FL) is revolutionizing how we learn from data. With its +growing popularity, it is now being used in many safety-critical domains such +as autonomous vehicles and healthcare. Since thousands of participants can +contribute in this collaborative setting, it is, however, challenging to ensure +the security and reliability of such systems. This highlights the need to design FL +systems that are secure and robust against malicious participants' actions +while also ensuring high utility, privacy of local data, and efficiency. In +this paper, we propose a novel FL framework dubbed FLShield that utilizes +benign data from FL participants to validate the local models before taking +them into account for generating the global model. This is in stark contrast +with existing defenses relying on the server's access to clean datasets -- an +assumption often impractical in real-life scenarios and conflicting with the +fundamentals of FL. We conduct extensive experiments to evaluate our FLShield +framework in different settings and demonstrate its effectiveness in thwarting +various types of poisoning and backdoor attacks, including a defense-aware one. +FLShield also preserves the privacy of local data against gradient inversion +attacks.
+
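The core validation idea can be sketched generically: score each submitted local model on participants' benign held-out data and exclude models whose validation loss is anomalously high before averaging. The toy linear model and z-score filter below are illustrative assumptions, not FLShield's exact procedure.

```python
# Hedged sketch: participant-side validation before global aggregation.
import numpy as np

def validation_loss(model_w, X, y):
    pred = X @ model_w                      # toy linear model
    return float(np.mean((pred - y) ** 2))

def filtered_average(local_models, validation_sets, z_thresh=1.0):
    # average validation loss of every local model across all validators
    losses = np.array([[validation_loss(w, X, y) for (X, y) in validation_sets]
                       for w in local_models]).mean(axis=1)
    z = (losses - losses.mean()) / (losses.std() + 1e-12)
    keep = z < z_thresh                     # drop models that validate poorly
    return np.mean([w for w, k in zip(local_models, keep) if k], axis=0), keep

rng = np.random.default_rng(0)
true_w = np.array([1.0, -2.0])
val_sets = []
for _ in range(3):                          # benign validators' local data
    X = rng.normal(size=(50, 2))
    val_sets.append((X, X @ true_w + 0.01 * rng.normal(size=50)))

benign = [true_w + 0.05 * rng.normal(size=2) for _ in range(4)]
poisoned = [np.array([10.0, 10.0])]         # a malicious update
global_w, kept = filtered_average(benign + poisoned, val_sets)
print(np.round(global_w, 2), kept)          # the poisoned model is filtered out
```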
+
+
+
+
+ + ♻ ☆ RobustPdM: Designing Robust Predictive Maintenance against Adversarial + Attacks + + +
+ The state-of-the-art predictive maintenance (PdM) techniques have shown great +success in reducing maintenance costs and downtime of complicated machines +while increasing overall productivity through extensive utilization of +Internet-of-Things (IoT) and Deep Learning (DL). Unfortunately, IoT sensors and +DL algorithms are both prone to cyber-attacks. For instance, DL algorithms are +known for their susceptibility to adversarial examples. Such adversarial +attacks are vastly under-explored in the PdM domain. This is because the +adversarial attacks in the computer vision domain for classification tasks +cannot be directly applied to the PdM domain for multivariate time series (MTS) +regression tasks. In this work, we propose an end-to-end methodology to design +adversarially robust PdM systems by extensively analyzing the effect of +different types of adversarial attacks and proposing a novel adversarial +defense technique for DL-enabled PdM models. First, we propose novel MTS +Projected Gradient Descent (PGD) and MTS PGD with random restarts (PGD_r) +attacks. Then, we evaluate the impact of MTS PGD and PGD_r along with MTS Fast +Gradient Sign Method (FGSM) and MTS Basic Iterative Method (BIM) on Long +Short-Term Memory (LSTM), Gated Recurrent Unit (GRU), Convolutional Neural +Network (CNN), and Bi-directional LSTM based PdM system. Our results using +NASA's turbofan engine dataset show that adversarial attacks can cause a severe +defect (up to 11X) in the RUL prediction, outperforming the effectiveness of +the state-of-the-art PdM attacks by 3X. Furthermore, we present a novel +approximate adversarial training method to defend against adversarial attacks. +We observe that approximate adversarial training can significantly improve the +robustness of PdM models (up to 54X) and outperforms the state-of-the-art PdM +defense methods by offering 3X more robustness. + +
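A minimal sketch of a PGD-style attack adapted to multivariate-time-series regression: perturb the input window within an L-infinity ball so as to maximize the regression error, optionally with a random start (the PGD_r variant). The model, bound, and step sizes are placeholders, not the paper's exact setup.

```python
# Hedged sketch of MTS PGD / PGD_r against a toy RUL-style regressor.
import torch
import torch.nn as nn

def mts_pgd(model, x, y, eps=0.1, alpha=0.02, steps=10, random_start=True):
    x_adv = x.clone()
    if random_start:                                   # PGD with random restart
        x_adv = x_adv + torch.empty_like(x).uniform_(-eps, eps)
    for _ in range(steps):
        x_adv = x_adv.detach().requires_grad_(True)
        loss = nn.functional.mse_loss(model(x_adv), y)
        loss.backward()
        with torch.no_grad():
            x_adv = x_adv + alpha * x_adv.grad.sign()  # ascend the loss
            x_adv = x + (x_adv - x).clamp(-eps, eps)   # project back into the L-inf ball
    return x_adv.detach()

# toy regressor over a (batch, time, sensors) window
model = nn.Sequential(nn.Flatten(), nn.Linear(30 * 14, 1))
x = torch.rand(8, 30, 14)
y = torch.rand(8, 1)
x_adv = mts_pgd(model, x, y)
print(nn.functional.mse_loss(model(x), y).item(),
      nn.functional.mse_loss(model(x_adv), y).item())   # error increases after attack
```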
+
+
+
+
+ + ♻ ☆ Diffusion Denoised Smoothing for Certified and Adversarial Robust + Out-Of-Distribution Detection + + +
+ As the use of machine learning continues to expand, the importance of +ensuring its safety cannot be overstated. A key concern in this regard is the +ability to identify whether a given sample is from the training distribution, +or is an "Out-Of-Distribution" (OOD) sample. In addition, adversaries can +manipulate OOD samples in ways that lead a classifier to make a confident +prediction. In this study, we present a novel approach for certifying the +robustness of OOD detection within an $\ell_2$-norm ball around the input, regardless +of network architecture and without the need for specific components or +additional training. Further, we improve current techniques for detecting +adversarial attacks on OOD samples, while providing high levels of certified +and adversarial robustness on in-distribution samples. The average of all OOD +detection metrics on CIFAR10/100 shows an increase of $\sim 13 \% / 5\%$ +relative to previous approaches.
+
+
+
+
+
+ + ♻ ☆ AI-GOMS: Large AI-Driven Global Ocean Modeling System + + +
+ Ocean modeling is a powerful tool for simulating the physical, chemical, and +biological processes of the ocean, which is the foundation for marine science +research and operational oceanography. Modern numerical ocean modeling mainly +consists of governing equations and numerical algorithms. Nonlinear +instability, computational expense, low reusability efficiency and high +coupling costs have gradually become the main bottlenecks for the further +development of numerical ocean modeling. Recently, artificial +intelligence-based modeling in scientific computing has shown revolutionary +potential for digital twins and scientific simulations, but the bottlenecks of +numerical ocean modeling have not been further solved. Here, we present +AI-GOMS, a large AI-driven global ocean modeling system, for accurate and +efficient global ocean daily prediction. AI-GOMS consists of a backbone model +with the Fourier-based Masked Autoencoder structure for basic ocean variable +prediction and lightweight fine-tuning models incorporating regional +downscaling, wave decoding, and biochemistry coupling modules. AI-GOMS has +achieved the best performance in 30 days of prediction for the global ocean +basic variables with 15 depth layers at 1/4° spatial resolution. Beyond +the good performance in statistical metrics, AI-GOMS realizes the simulation of +mesoscale eddies in the Kuroshio region at 1/12° spatial resolution and +ocean stratification in the tropical Pacific Ocean. AI-GOMS provides a new +backbone-downstream paradigm for Earth system modeling, which makes the system +transferable, scalable and reusable.
+
+
+
+
+
+ + ♻ ☆ Synthesizing Mixed-type Electronic Health Records using Diffusion Models + + +
+ Electronic Health Records (EHRs) contain sensitive patient information, which +presents privacy concerns when sharing such data. Synthetic data generation is +a promising solution to mitigate these risks, often relying on deep generative +models such as Generative Adversarial Networks (GANs). However, recent studies +have shown that diffusion models offer several advantages over GANs, such as +generation of more realistic synthetic data and stable training in generating +data modalities, including image, text, and sound. In this work, we investigate +the potential of diffusion models for generating realistic mixed-type tabular +EHRs, comparing TabDDPM model with existing methods on four datasets in terms +of data quality, utility, privacy, and augmentation. Our experiments +demonstrate that TabDDPM outperforms the state-of-the-art models across all +evaluation metrics, except for privacy, which confirms the trade-off between +privacy and utility. + +
+
+ comment: Page 2, Figure 1 is updated +
+
+
+
+
+ + ♻ ☆ Autonomous sputter synthesis of thin film nitrides with composition + controlled by Bayesian optimization of optical plasma emission + + +
+ Autonomous experimentation has emerged as an efficient approach to accelerate +the pace of materials discovery. Although instruments for autonomous synthesis +have become popular in molecular and polymer science, solution processing of +hybrid materials and nanoparticles, examples of autonomous tools for physical +vapor deposition are scarce yet important for the semiconductor industry. Here, +we report the design and implementation of an autonomous workflow for sputter +deposition of thin films with controlled composition, leveraging a highly +automated sputtering reactor custom-controlled by Python, optical emission +spectroscopy (OES), and a Bayesian optimization algorithm. We modeled film +composition, measured by x-ray fluorescence, as a linear function of emission +lines monitored during the co-sputtering from elemental Zn and Ti targets in +N$_2$ atmosphere. A Bayesian control algorithm, informed by OES, navigates the +space of sputtering power to fabricate films with user-defined composition, by +minimizing the absolute error between desired and measured emission signals. We +validated our approach by autonomously fabricating Zn$_x$Ti$_{1-x}$N$_y$ films +with deviations from the targeted cation composition within relative 3.5 %, +even for 15 nm thin films, demonstrating that the proposed approach can +reliably synthesize thin films with specific composition and minimal human +interference. Moreover, the proposed method can be extended to more difficult +synthesis experiments where plasma intensity depends non-linearly on pressure, +or the elemental sticking coefficients strongly depend on the substrate +temperature. + +
+
+
+
+
+ + ♻ ☆ Scaling may be all you need for achieving human-level object recognition + capacity with human-like visual experience + + +
+ This paper asks whether current self-supervised learning methods, if +sufficiently scaled up, would be able to reach human-level visual object +recognition capabilities with the same type and amount of visual experience +humans learn from. Previous work on this question only considered the scaling +of data size. Here, we consider the simultaneous scaling of data size, model +size, and image resolution. We perform a scaling experiment with vision +transformers up to 633M parameters in size (ViT-H/14) trained with up to 5K +hours of human-like video data (long, continuous, mostly egocentric videos) +with image resolutions of up to 476x476 pixels. The efficiency of masked +autoencoders (MAEs) as a self-supervised learning algorithm makes it possible +to run this scaling experiment on an unassuming academic budget. We find that +it is feasible to reach human-level object recognition capacity at sub-human +scales of model size, data size, and image size, if these factors are scaled up +simultaneously. To give a concrete example, we estimate that a 2.5B parameter +ViT model trained with 20K hours (2.3 years) of human-like video data with a +spatial resolution of 952x952 pixels should be able to reach roughly +human-level accuracy on ImageNet. Human-level competence is thus achievable for +a fundamental perceptual capability from human-like perceptual experience +(human-like in both amount and type) with extremely generic learning algorithms +and architectures and without any substantive inductive biases. + +
+
+ comment: v2 adds an Appendix containing results with alternative scaling + functions; code & models available from + https://github.com/eminorhan/humanlike-vits +
+
+
+
+
+ + ♻ ☆ Width and Depth Limits Commute in Residual Networks + + +
+ We show that taking the width and depth to infinity in a deep neural network +with skip connections, when branches are scaled by $1/\sqrt{depth}$ (the only +nontrivial scaling), results in the same covariance structure no matter how that +limit is taken. This explains why the standard infinite-width-then-depth +approach provides practical insights even for networks with depth of the same +order as width. We also demonstrate that the pre-activations, in this case, +have Gaussian distributions, which has direct applications in Bayesian deep +learning. We conduct extensive simulations that show an excellent match with +our theoretical findings.
+
+
+ comment: 24 pages, 8 figures. arXiv admin note: text overlap with + arXiv:2210.00688 +
+
+
+
+
+ + ♻ ☆ Improving Image-Based Precision Medicine with Uncertainty-Aware Causal + Models + + +
+ Image-based precision medicine aims to personalize treatment decisions based +on an individual's unique imaging features so as to improve their clinical +outcome. Machine learning frameworks that integrate uncertainty estimation as +part of their treatment recommendations would be safer and more reliable. +However, little work has been done in adapting uncertainty estimation +techniques and validation metrics for precision medicine. In this paper, we use +Bayesian deep learning for estimating the posterior distribution over factual +and counterfactual outcomes on several treatments. This allows for estimating +the uncertainty for each treatment option and for the individual treatment +effects (ITE) between any two treatments. We train and evaluate this model to +predict future new and enlarging T2 lesion counts on a large, multi-center +dataset of MR brain images of patients with multiple sclerosis, exposed to +several treatments during randomized controlled trials. We evaluate the +correlation of the uncertainty estimate with the factual error, and, given the +lack of ground truth counterfactual outcomes, demonstrate how uncertainty for +the ITE prediction relates to bounds on the ITE error. Lastly, we demonstrate +how knowledge of uncertainty could modify clinical decision-making to improve +individual patient and clinical trial outcomes. + +
+
+
+
+
+ + ♻ ☆ From NeurODEs to AutoencODEs: a mean-field control framework for + width-varying Neural Networks + + +
+ The connection between Residual Neural Networks (ResNets) and continuous-time +control systems (known as NeurODEs) has led to a mathematical analysis of +neural networks which has provided interesting results of both theoretical and +practical significance. However, by construction, NeurODEs have been limited to +describing constant-width layers, making them unsuitable for modeling deep +learning architectures with layers of variable width. In this paper, we propose +a continuous-time Autoencoder, which we call AutoencODE, based on a +modification of the controlled field that drives the dynamics. This adaptation +enables the extension of the mean-field control framework originally devised +for conventional NeurODEs. In this setting, we tackle the case of low Tikhonov +regularization, resulting in potentially non-convex cost landscapes. While the +global results obtained for high Tikhonov regularization may not hold globally, +we show that many of them can be recovered in regions where the loss function +is locally convex. Inspired by our theoretical findings, we develop a training +method tailored to this specific type of Autoencoders with residual +connections, and we validate our approach through numerical experiments +conducted on various examples. + +
+
+ comment: 35 pages, 11 figures. Minor adjustments and new bibliographical + references +
+
+
+
+
+ + ♻ ☆ Adaptive Gated Graph Convolutional Network for Explainable Diagnosis of + Alzheimer's Disease using EEG Data + + +
+ Graph neural network (GNN) models are increasingly being used for the +classification of electroencephalography (EEG) data. However, GNN-based +diagnosis of neurological disorders, such as Alzheimer's disease (AD), remains +a relatively unexplored area of research. Previous studies have relied on +functional connectivity methods to infer brain graph structures and used simple +GNN architectures for the diagnosis of AD. In this work, we propose a novel +adaptive gated graph convolutional network (AGGCN) that can provide explainable +predictions. AGGCN adaptively learns graph structures by combining +convolution-based node feature enhancement with a well-known correlation-based +measure of functional connectivity. Furthermore, the gated graph convolution +can dynamically weigh the contribution of various spatial scales. The proposed +model achieves high accuracy in both eyes-closed and eyes-open conditions, +indicating the stability of learned representations. Finally, we demonstrate +that the proposed AGGCN model generates consistent explanations of its +predictions that might be relevant for further study of AD-related alterations +of brain networks. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ From Random Search to Bandit Learning in Metric Measure Spaces + + +
+ Random Search is one of the most widely-used methods for Hyperparameter +Optimization, and is critical to the success of deep learning models. Despite +its astonishing performance, little non-heuristic theory has been developed to +describe the underlying working mechanism. This paper gives a theoretical +accounting of Random Search. We introduce the concept of \emph{scattering +dimension} that describes the landscape of the underlying function, and +quantifies the performance of random search. We show that, when the environment +is noise-free, the output of random search converges to the optimal value in +probability at rate $ \widetilde{\mathcal{O}} \left( \left( \frac{1}{T} +\right)^{ \frac{1}{d_s} } \right) $, where $ d_s \ge 0 $ is the scattering +dimension of the underlying function. When the observed function values are +corrupted by bounded i.i.d. noise, the output of random search converges to the +optimal value in probability at rate $ \widetilde{\mathcal{O}} \left( \left( +\frac{1}{T} \right)^{ \frac{1}{d_s + 1} } \right) $. In addition, based on the +principles of random search, we introduce an algorithm, called BLiN-MOS, for +Lipschitz bandits in doubling metric spaces that are also endowed with a +probability measure, and show that BLiN-MOS achieves a regret rate of order $ +\widetilde{\mathcal{O}} \left( T^{ \frac{d_z}{d_z + 1} } \right) $, where $d_z$ +is the zooming dimension of the problem instance.
+
+
+
+
+
+ + ♻ ☆ Forecasting Irregularly Sampled Time Series using Graphs + + +
+ Forecasting irregularly sampled time series with missing values is a crucial +task for numerous real-world applications such as healthcare, astronomy, and +climate sciences. State-of-the-art approaches to this problem rely on Ordinary +Differential Equations (ODEs), which are known to be slow and often require +additional features to handle missing values. To address this issue, we propose +a novel model using Graphs for Forecasting Irregularly Sampled Time Series with +missing values, which we call GraFITi. GraFITi first converts the time series to +a Sparsity Structure Graph, which is a sparse bipartite graph, and then +reformulates the forecasting problem as the edge weight prediction task in the +graph. It uses the power of Graph Neural Networks to learn the graph and +predict the target edge weights. GraFITi has been tested on 3 real-world and 1 +synthetic irregularly sampled time series datasets with missing values and +compared with various state-of-the-art models. The experimental results +demonstrate that GraFITi improves the forecasting accuracy by up to 17% and +reduces the run time by up to a factor of 5 compared to the state-of-the-art forecasting +models.
+
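The graph construction can be illustrated simply: each observed value becomes a weighted edge between a timestamp node and a channel node, and forecasting amounts to predicting the weights of the missing (query) edges. This only sketches the representation under our reading of the abstract; the GNN trained on this graph is omitted.

```python
# Hedged sketch of a "Sparsity Structure Graph"-style bipartite representation.
def to_bipartite_edges(times, channels, values):
    """Return (time_node, channel_node, weight) triples for the observed entries."""
    t_index = {t: i for i, t in enumerate(sorted(set(times)))}
    return [(t_index[t], c, v) for t, c, v in zip(times, channels, values)]

# irregular observations: (timestamp, channel id, measured value)
obs_t = [0.0, 0.0, 1.3, 2.7, 2.7]
obs_c = [0,   2,   1,   0,   2]
obs_v = [5.1, 0.4, 3.3, 4.8, 0.6]

edges = to_bipartite_edges(obs_t, obs_c, obs_v)
print(edges)   # [(0, 0, 5.1), (0, 2, 0.4), (1, 1, 3.3), (2, 0, 4.8), (2, 2, 0.6)]

# forecasting queries: edges whose weights the model must predict,
# e.g. channels 0 and 1 at a future timestamp node 3
queries = [(3, 0), (3, 1)]
```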
+
+
+
+
+ + ♻ ☆ Online learning techniques for prediction of temporal tabular datasets + with regime changes + + +
+ The application of deep learning to non-stationary temporal datasets can lead +to overfitted models that underperform under regime changes. In this work, we +propose a modular machine learning pipeline for ranking predictions on temporal +panel datasets which is robust under regime changes. The modularity of the +pipeline allows the use of different models, including Gradient Boosting +Decision Trees (GBDTs) and Neural Networks, with and without feature +engineering. We evaluate our framework on financial data for stock portfolio +prediction, and find that GBDT models with dropout display high performance, +robustness and generalisability with reduced complexity and computational cost. +We then demonstrate how online learning techniques, which require no retraining +of models, can be used post-prediction to enhance the results. First, we show +that dynamic feature projection improves robustness by reducing drawdown in +regime changes. Second, we demonstrate that dynamical model ensembling based on +selection of models with good recent performance leads to improved Sharpe and +Calmar ratios of out-of-sample predictions. We also evaluate the robustness of +our pipeline across different data splits and random seeds with good +reproducibility. + +
+
+
+
+
+ + ♻ ☆ Multi-Class Deep SVDD: Anomaly Detection Approach in Astronomy with + Distinct Inlier Categories ICML 2023 + + +
+ With the increasing volume of astronomical data generated by modern survey +telescopes, automated pipelines and machine learning techniques have become +crucial for analyzing and extracting knowledge from these datasets. Anomaly +detection, i.e. the task of identifying irregular or unexpected patterns in the +data, is a complex challenge in astronomy. In this paper, we propose +Multi-Class Deep Support Vector Data Description (MCDSVDD), an extension of the +state-of-the-art anomaly detection algorithm One-Class Deep SVDD, specifically +designed to handle different inlier categories with distinct data +distributions. MCDSVDD uses a neural network to map the data into hyperspheres, +where each hypersphere represents a specific inlier category. The distance of +each sample from the centers of these hyperspheres determines the anomaly +score. We evaluate the effectiveness of MCDSVDD by comparing its performance +with several anomaly detection algorithms on a large dataset of astronomical +light-curves obtained from the Zwicky Transient Facility. Our results +demonstrate the efficacy of MCDSVDD in detecting anomalous sources while +leveraging the presence of different inlier categories. The code and the data +needed to reproduce our results are publicly available at +https://github.com/mperezcarrasco/AnomalyALeRCE. + +
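The scoring rule described above can be sketched with precomputed features standing in for the learned embedding: each inlier class has a hypersphere center, and a sample's anomaly score is its distance to the nearest center. Names and the toy data are illustrative assumptions.

```python
# Hedged sketch of MCDSVDD-style per-class centers and anomaly scoring.
import numpy as np

def class_centers(features, labels):
    return {c: features[labels == c].mean(axis=0) for c in np.unique(labels)}

def anomaly_score(x, centers):
    # distance to the closest inlier-class hypersphere center
    return min(np.linalg.norm(x - mu) for mu in centers.values())

rng = np.random.default_rng(0)
inlier_a = rng.normal(loc=0.0, size=(100, 8))      # e.g. one light-curve class
inlier_b = rng.normal(loc=4.0, size=(100, 8))      # another inlier class
features = np.vstack([inlier_a, inlier_b])
labels = np.array([0] * 100 + [1] * 100)

centers = class_centers(features, labels)
print(anomaly_score(rng.normal(loc=0.0, size=8), centers))   # small: looks like class 0
print(anomaly_score(rng.normal(loc=20.0, size=8), centers))  # large: anomalous source
```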
+
+ comment: Accepted to ICML 2023 Workshop on Machine Learning for Astrophysics +
+
+
+
+
+ + ♻ ☆ Deep incremental learning models for financial temporal tabular datasets + with distribution shifts + + +
+ We present a robust deep incremental learning framework for regression tasks +on financial temporal tabular datasets which is built upon the incremental use +of commonly available tabular and time series prediction models to adapt to +distributional shifts typical of financial datasets. The framework uses a +simple basic building block (decision trees) to build self-similar models of +any required complexity to deliver robust performance under adverse situations +such as regime changes, fat-tailed distributions, and low signal-to-noise +ratios. As a detailed study, we demonstrate our scheme using XGBoost models +trained on the Numerai dataset and show that a two layer deep ensemble of +XGBoost models over different model snapshots delivers high quality predictions +under different market regimes. We also show that the performance of XGBoost +models with different number of boosting rounds in three scenarios (small, +standard and large) is monotonically increasing with respect to model size and +converges towards the generalisation upper bound. We also evaluate the +robustness of the model under variability of different hyperparameters, such as +model complexity and data sampling settings. Our model has low hardware +requirements as no specialised neural architectures are used and each base +model can be independently trained in parallel. + +
+
+
+
+
+ + ♻ ☆ A hybrid deep-learning-metaheuristic framework for bi-level network + design problems + + +
+ This study proposes a hybrid deep-learning-metaheuristic framework with a +bi-level architecture for road network design problems (NDPs). We train a graph +neural network (GNN) to approximate the solution of the user equilibrium (UE) +traffic assignment problem and use inferences made by the trained model to +calculate fitness function evaluations of a genetic algorithm (GA) to +approximate solutions for NDPs. Using three test networks, two NDP variants and +an exact solver as benchmark, we show that on average, our proposed framework +can provide solutions within 1.5% gap of the best results in less than 0.5% of +the time used by the exact solution procedure. Our framework can be utilized +within an expert system for infrastructure planning to determine the best +infrastructure planning and management decisions under different scenarios. +Given the flexibility of the framework, it can easily be adapted to many other +decision problems that can be modeled as bi-level problems on graphs. Moreover, +we foreseen interesting future research directions, thus we also put forward a +brief research agenda for this topic. The key observation from our research +that can shape future research is that the fitness function evaluation time +using the inferences made by the GNN model was in the order of milliseconds, +which points to an opportunity and a need for novel heuristics that 1) can cope +well with noisy fitness function values provided by deep learning models, and +2) can use the significantly enlarged efficiency of the evaluation step to +explore the search space effectively (rather than efficiently). This opens a +new avenue for a modern class of metaheuristics that are crafted for use with +AI-powered predictors. + +
+
+ comment: Two case studies added, intro, discussion and conclusion extended, + details added to method and experiments, typos fixed, title revised, + references added +
+
+
+
+
+ + ♻ ☆ Symmetry Defense Against CNN Adversarial Perturbation Attacks + + +
+ This paper uses symmetry to make Convolutional Neural Network classifiers +(CNNs) robust against adversarial perturbation attacks. Such attacks add +perturbation to original images to generate adversarial images that fool +classifiers such as road sign classifiers of autonomous vehicles. Although +symmetry is a pervasive aspect of the natural world, CNNs are unable to handle +symmetry well. For example, a CNN can classify an image differently from its +mirror image. For an adversarial image that misclassifies with a wrong label +$l_w$, the CNN's inability to handle symmetry means that a symmetric adversarial +image can be classified differently from the wrong label $l_w$. Furthermore, +we find that the classification of a symmetric adversarial image reverts to the +correct label. To classify an image when adversaries are unaware of the +defense, we apply symmetry to the image and use the classification label of the +symmetric image. To classify an image when adversaries are aware of the +defense, we use mirror symmetry and pixel inversion symmetry to form a symmetry +group. We apply all the group symmetries to the image and decide on the output +label based on the agreement of any two of the classification labels of the +symmetry images. Adaptive attacks fail because they need to rely on loss +functions that use conflicting CNN output values for symmetric images. Without +attack knowledge, the proposed symmetry defense succeeds against both +gradient-based and random-search attacks, with up to near-default accuracies +for ImageNet. The defense even improves the classification accuracy of original +images.
+
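A small sketch of the adversary-aware variant as described: classify the image under the group generated by horizontal mirroring and pixel inversion, and return a label only when at least two of the symmetric copies agree. The placeholder classifier and abstention rule are our illustrative assumptions.

```python
# Hedged sketch of the symmetry-group agreement defense.
import numpy as np
from collections import Counter

def symmetry_group(img):
    flip = img[:, ::-1]                  # horizontal mirror
    inv = 1.0 - img                      # pixel inversion (image scaled to [0, 1])
    return [img, flip, inv, 1.0 - flip]  # group generated by the two symmetries

def symmetry_defense(classify, img):
    labels = [classify(x) for x in symmetry_group(img)]
    label, count = Counter(labels).most_common(1)[0]
    return label if count >= 2 else None   # output only on agreement of two labels

# toy stand-in classifier, just to exercise the mechanism
classify = lambda x: int(x.mean() > 0.5)
img = np.full((8, 8), 0.7)
print(symmetry_defense(classify, img))
```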
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Functional Neural Networks: Shift invariant models for functional data + with applications to EEG classification + + +
+ It is desirable for statistical models to detect signals of interest +independently of their position. If the data is generated by some smooth +process, this additional structure should be taken into account. We introduce a +new class of neural networks that are shift invariant and preserve smoothness +of the data: functional neural networks (FNNs). For this, we use methods from +functional data analysis (FDA) to extend multi-layer perceptrons and +convolutional neural networks to functional data. We propose different model +architectures, show that the models outperform a benchmark model from FDA in +terms of accuracy and successfully use FNNs to classify electroencephalography +(EEG) data. + +
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Forward-Forward Training of an Optical Neural Network + + +
+ Neural networks (NN) have demonstrated remarkable capabilities in various +tasks, but their computation-intensive nature demands faster and more +energy-efficient hardware implementations. Optics-based platforms, using +technologies such as silicon photonics and spatial light modulators, offer +promising avenues for achieving this goal. However, training multiple trainable +layers in tandem with these physical systems poses challenges, as they are +difficult to fully characterize and describe with differentiable functions, +hindering the use of error backpropagation algorithm. The recently introduced +Forward-Forward Algorithm (FFA) eliminates the need for perfect +characterization of the learning system and shows promise for efficient +training with large numbers of programmable parameters. The FFA does not +require backpropagating an error signal to update the weights, rather the +weights are updated by only sending information in one direction. The local +loss function for each set of trainable weights enables low-power analog +hardware implementations without resorting to metaheuristic algorithms or +reinforcement learning. In this paper, we present an experiment utilizing +multimode nonlinear wave propagation in an optical fiber demonstrating the +feasibility of the FFA approach using an optical system. The results show that +incorporating optical transforms in multilayer NN architectures trained with +the FFA, can lead to performance improvements, even with a relatively small +number of trainable weights. The proposed method offers a new path to the +challenge of training optical NNs and provides insights into leveraging +physical transformations for enhancing NN performance. + +
+
+
+
+
+ + ♻ ☆ Conditional Generative Models for Learning Stochastic Processes + + +
+ A framework to learn a multi-modal distribution is proposed, denoted as the +Conditional Quantum Generative Adversarial Network (C-qGAN). The neural network +structure is strictly within a quantum circuit and, as a consequence, is shown +to represent a more efficient state preparation procedure than current methods. +This methodology has the potential to speed-up algorithms, such as Monte Carlo +analysis. In particular, after demonstrating the effectiveness of the network +in the learning task, the technique is applied to price Asian option +derivatives, providing the foundation for further research on other +path-dependent options. + +
+
+
+
+
+ + ♻ ☆ Revisiting Domain-Adaptive 3D Object Detection by Reliable, Diverse and + Class-balanced Pseudo-Labeling ICCV 2023 + + +
+ Unsupervised domain adaptation (DA) with the aid of pseudo labeling +techniques has emerged as a crucial approach for domain-adaptive 3D object +detection. While effective, existing DA methods suffer from a substantial drop +in performance when applied to a multi-class training setting, due to the +co-existence of low-quality pseudo labels and class imbalance issues. In this +paper, we address this challenge by proposing a novel ReDB framework tailored +for learning to detect all classes at once. Our approach produces Reliable, +Diverse, and class-Balanced pseudo 3D boxes to iteratively guide the +self-training on a distributionally different target domain. To alleviate +disruptions caused by the environmental discrepancy (e.g., beam numbers), the +proposed cross-domain examination (CDE) assesses the correctness of pseudo +labels by copy-pasting target instances into a source environment and measuring +the prediction consistency. To reduce computational overhead and mitigate the +object shift (e.g., scales and point densities), we design an overlapped boxes +counting (OBC) metric that allows to uniformly downsample pseudo-labeled +objects across different geometric characteristics. To confront the issue of +inter-class imbalance, we progressively augment the target point clouds with a +class-balanced set of pseudo-labeled target instances and source objects, which +boosts recognition accuracies on both frequently appearing and rare classes. +Experimental results on three benchmark datasets using both voxel-based (i.e., +SECOND) and point-based 3D detectors (i.e., PointRCNN) demonstrate that our +proposed ReDB approach outperforms existing 3D domain adaptation methods by a +large margin, improving 23.15% mAP on the nuScenes $\rightarrow$ KITTI task. +The code is available at https://github.com/zhuoxiao-chen/ReDB-DA-3Ddet. + +
+
+ comment: Accepted by ICCV 2023, camera-ready +
+
+
+
+
+ + ♻ ☆ A Brief Review of Hypernetworks in Deep Learning + + +
+ Hypernetworks, or hypernets in short, are neural networks that generate +weights for another neural network, known as the target network. They have +emerged as a powerful deep learning technique that allows for greater +flexibility, adaptability, dynamism, faster training, information sharing, and +model compression etc. Hypernets have shown promising results in a variety of +deep learning problems, including continual learning, causal inference, +transfer learning, weight pruning, uncertainty quantification, zero-shot +learning, natural language processing, and reinforcement learning etc. Despite +their success across different problem settings, currently, there is no review +available to inform the researchers about the developments and to help in +utilizing hypernets. To fill this gap, we review the progress in hypernets. We +present an illustrative example to train deep neural networks using hypernets +and propose categorizing hypernets based on five design criteria as inputs, +outputs, variability of inputs and outputs, and architecture of hypernets. We +also review applications of hypernets across different deep learning problem +settings, followed by a discussion of general scenarios where hypernets can be +effectively employed. Finally, we discuss the challenges and future directions +that remain under-explored in the field of hypernets. We believe that +hypernetworks have the potential to revolutionize the field of deep learning. +They offer a new way to design and train neural networks, and they have the +potential to improve the performance of deep learning models on a variety of +tasks. Through this review, we aim to inspire further advancements in deep +learning through hypernetworks. + +
+
+ comment: revised categorisation, added new Section '5 When can we use + Hypernets?', and other corrections(2 figures and 2 tables) (under review) +
+
+
+
+
+ + ♻ ☆ InfoNCE is variational inference in a recognition parameterised model + + +
+ Here, we show that the InfoNCE objective is equivalent to the ELBO in a new +class of probabilistic generative model, the recognition parameterised model +(RPM). When we learn the optimal prior, the RPM ELBO becomes equal to the +mutual information (MI; up to a constant), establishing a connection to +pre-existing self-supervised learning methods such as InfoNCE. However, +practical InfoNCE methods do not use the MI as an objective; the MI is +invariant to arbitrary invertible transformations, so using an MI objective can +lead to highly entangled representations (Tschannen et al., 2019). Instead, the +actual InfoNCE objective is a simplified lower bound on the MI which is loose +even in the infinite sample limit. Thus, an objective that works (i.e. the +actual InfoNCE objective) appears to be motivated as a loose bound on an +objective that does not work (i.e. the true MI which gives arbitrarily +entangled representations). We give an alternative motivation for the actual +InfoNCE objective. In particular, we show that in the infinite sample limit, +and for a particular choice of prior, the actual InfoNCE objective is equal to +the ELBO (up to a constant); and the ELBO is equal to the marginal likelihood +with a deterministic recognition model. Thus, we argue that our VAE perspective +gives a better motivation for InfoNCE than MI, as the actual InfoNCE objective +is only loosely bounded by the MI, but is equal to the ELBO/marginal likelihood +(up to a constant). + +
+
+
+
+
+ + ♻ ☆ Simplifying Momentum-based Positive-definite Submanifold Optimization + with Applications to Deep Learning ICML 2023 + + +
+ Riemannian submanifold optimization with momentum is computationally +challenging because, to ensure that the iterates remain on the submanifold, we +often need to solve difficult differential equations. Here, we simplify such +difficulties for a class of sparse or structured symmetric positive-definite +matrices with the affine-invariant metric. We do so by proposing a generalized +version of the Riemannian normal coordinates that dynamically orthonormalizes +the metric and locally converts the problem into an unconstrained problem in +the Euclidean space. We use our approach to simplify existing approaches for +structured covariances and develop matrix-inverse-free $2^\text{nd}$-order +optimizers for deep learning with low precision by using only matrix +multiplications. Code: https://github.com/yorkerlin/StructuredNGD-DL + +
+
+ comment: An updated version of the ICML 2023 paper. Updated the main text to + emphasize challenges of using existing Riemannian methods to estimate sparse + and structured SPD matrices +
+
+
+
+
+ + ♻ ☆ Π-ML: A dimensional analysis-based machine learning parameterization + of optical turbulence in the atmospheric surface layer + + +
+ Turbulent fluctuations of the atmospheric refraction index, so-called optical +turbulence, can significantly distort propagating laser beams. Therefore, +modeling the strength of these fluctuations ($C_n^2$) is highly relevant for +the successful development and deployment of future free-space optical +communication links. In this letter, we propose a physics-informed machine +learning (ML) methodology, $\Pi$-ML, based on dimensional analysis and gradient +boosting to estimate $C_n^2$. Through a systematic feature importance analysis, +we identify the normalized variance of potential temperature as the dominating +feature for predicting $C_n^2$. For statistical robustness, we train an +ensemble of models which yields high performance on the out-of-sample data of +$R^2=0.958\pm0.001$. + +
+
+
+
+
+ + ♻ ☆ SLEM: Machine Learning for Path Modeling and Causal Inference with Super + Learner Equation Modeling + + +
+ Causal inference is a crucial goal of science, enabling researchers to arrive +at meaningful conclusions regarding the predictions of hypothetical +interventions using observational data. Path models, Structural Equation Models +(SEMs), and, more generally, Directed Acyclic Graphs (DAGs), provide a means to +unambiguously specify assumptions regarding the causal structure underlying a +phenomenon. Unlike DAGs, which make very few assumptions about the functional +and parametric form, SEM assumes linearity. This can result in functional +misspecification which prevents researchers from undertaking reliable effect +size estimation. In contrast, we propose Super Learner Equation Modeling, a +path modeling technique integrating machine learning Super Learner ensembles. +We empirically demonstrate its ability to provide consistent and unbiased +estimates of causal effects, its competitive performance for linear models when +compared with SEM, and highlight its superiority over SEM when dealing with +non-linear relationships. We provide open-source code, and a tutorial notebook +with example usage, accentuating the easy-to-use nature of the method. + +
+
+
+
+
+ + ♻ ☆ Enhancing Nucleus Segmentation with HARU-Net: A Hybrid Attention Based + Residual U-Blocks Network + + +
+ Nucleus image segmentation is a crucial step in the analysis, pathological +diagnosis, and classification, which heavily relies on the quality of nucleus +segmentation. However, the complexity of issues such as variations in nucleus +size, blurred nucleus contours, uneven staining, cell clustering, and +overlapping cells poses significant challenges. Current methods for nucleus +segmentation primarily rely on nuclear morphology or contour-based approaches. +Nuclear morphology-based methods exhibit limited generalization ability and +struggle to effectively predict irregular-shaped nuclei, while contour-based +extraction methods face challenges in accurately segmenting overlapping nuclei. +To address the aforementioned issues, we propose a dual-branch network using +hybrid attention based residual U-blocks for nucleus instance segmentation. The +network simultaneously predicts target information and target contours. +Additionally, we introduce a post-processing method that combines the target +information and target contours to distinguish overlapping nuclei and generate +an instance segmentation image. Within the network, we propose a context fusion +block (CF-block) that effectively extracts and merges contextual information +from the network. Extensive quantitative evaluations are conducted to assess +the performance of our method. Experimental results demonstrate the superior +performance of the proposed method compared to state-of-the-art approaches on +the BNS, MoNuSeg, CoNSeg, and CPM-17 datasets. + +
+
+ comment: Nucleus segmentation, Deep learning, Instance segmentation, Medical + imaging, Dual-Branch network +
+
+
+
+
+ + ♻ ☆ Multi-metrics adaptively identifies backdoors in Federated learning ICCV + + +
+ The decentralized and privacy-preserving nature of federated learning (FL) +makes it vulnerable to backdoor attacks aiming to manipulate the behavior of +the resulting model on specific adversary-chosen inputs. However, most existing +defenses based on statistical differences take effect only against specific +attacks, especially when the malicious gradients are similar to benign ones or +the data are highly non-independent and identically distributed (non-IID). In +this paper, we revisit the distance-based defense methods and discover that i) +Euclidean distance becomes meaningless in high dimensions and ii) malicious +gradients with diverse characteristics cannot be identified by a single metric. +To this end, we present a simple yet effective defense strategy with +multi-metrics and dynamic weighting to identify backdoors adaptively. +Furthermore, our novel defense has no reliance on predefined assumptions over +attack settings or data distributions and little impact on benign performance. +To evaluate the effectiveness of our approach, we conduct comprehensive +experiments on different datasets under various attack settings, where our +method achieves the best defensive performance. For instance, we achieve the +lowest backdoor accuracy of 3.06% under the difficult Edge-case PGD, showing +significant superiority over previous defenses. The results also demonstrate +that our method can be well-adapted to a wide range of non-IID degrees without +sacrificing the benign performance. + +
+
+ comment: 14 pages, 8 figures and 7 tables; 2023 IEEE/CVF International + Conference on Computer Vision (ICCV) +
+
+
+
+
+ + ♻ ☆ Learning ground states of gapped quantum Hamiltonians with Kernel + Methods + + +
+ Neural network approaches to approximate the ground state of quantum +hamiltonians require the numerical solution of a highly nonlinear optimization +problem. We introduce a statistical learning approach that makes the +optimization trivial by using kernel methods. Our scheme is an approximate +realization of the power method, where supervised learning is used to learn the +next step of the power iteration. We show that the ground state properties of +arbitrary gapped quantum hamiltonians can be reached with polynomial resources +under the assumption that the supervised learning is efficient. Using kernel +ridge regression, we provide numerical evidence that the learning assumption is +verified by applying our scheme to find the ground states of several +prototypical interacting many-body quantum systems, both in one and two +dimensions, showing the flexibility of our approach. + +
+
+
+
+
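The "learn the next power-iteration step" mechanic can be sketched on a toy problem: a small random symmetric matrix stands in for the Hamiltonian, and kernel ridge regression is fit to each power-method iterate. The random features, the kernel choice, and the fit-on-all-configurations shortcut are simplifications of my own; the paper works with sampled configurations of many-body systems.

```python
import numpy as np
from sklearn.kernel_ridge import KernelRidge

rng = np.random.default_rng(0)

n = 100
H = rng.normal(size=(n, n))
H = (H + H.T) / 2                               # toy "Hamiltonian"
shift = np.linalg.norm(H, ord=2) + 1.0
A = shift * np.eye(n) - H                       # ground state of H = dominant eigenvector of A

# Stand-in descriptors for the basis configurations.
features = rng.normal(size=(n, 8))

psi = rng.normal(size=n)
for _ in range(500):
    target = A @ psi                            # exact next power-method iterate
    model = KernelRidge(kernel="rbf", alpha=1e-8, gamma=0.5)
    model.fit(features, target)                 # supervised step: learn the iterate
    psi = model.predict(features)               # read the learned iterate back out
    psi /= np.linalg.norm(psi)

print("estimated ground-state energy:", psi @ H @ psi)
print("exact ground-state energy:    ", np.linalg.eigvalsh(H)[0])
```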
+ + ♻ ☆ FALL-E: A Foley Sound Synthesis Model and Strategies + + +
+ This paper introduces FALL-E, a foley synthesis system and its +training/inference strategies. The FALL-E model employs a cascaded approach +comprising low-resolution spectrogram generation, spectrogram super-resolution, +and a vocoder. We trained every sound-related model from scratch using our +extensive datasets, and utilized a pre-trained language model. We conditioned +the model with dataset-specific texts, enabling it to learn sound quality and +recording environment based on text input. Moreover, we leveraged external +language models to improve text descriptions of our datasets and performed +prompt engineering for quality, coherence, and diversity. FALL-E was evaluated +by an objective measure as well as listening tests in the DCASE 2023 challenge +Task 7. The submission achieved the second place on average, while achieving +the best score for diversity, second place for audio quality, and third place +for class fitness. + +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Distributed Out-of-Memory NMF on CPU/GPU Architectures + + +
+ We propose an efficient distributed out-of-memory implementation of the +Non-negative Matrix Factorization (NMF) algorithm for heterogeneous +high-performance-computing (HPC) systems. The proposed implementation is based +on prior work on NMFk, which can perform automatic model selection and extract +latent variables and patterns from data. In this work, we extend NMFk by adding +support for dense and sparse matrix operation on multi-node, multi-GPU systems. +The resulting algorithm is optimized for out-of-memory (OOM) problems where the +memory required to factorize a given matrix is greater than the available GPU +memory. Memory complexity is reduced by batching/tiling strategies, and sparse +and dense matrix operations are significantly accelerated with GPU cores (or +tensor cores when available). Input/Output (I/O) latency associated with batch +copies between host and device is hidden using CUDA streams to overlap data +transfers and compute asynchronously, and latency associated with collective +communications (both intra-node and inter-node) is reduced using optimized +NVIDIA Collective Communication Library NCCL based communicators. Benchmark +results show significant improvement, from 32X to 76x speedup, with the new +implementation using GPUs over the CPU-based NMFk. Good weak scaling was +demonstrated on up to 4096 multi-GPU cluster nodes with approximately 25,000 +GPUs when decomposing a dense 340 Terabyte-size matrix and an 11 Exabyte-size +sparse matrix of density 10e-6. + +
+
+ comment: Accepted at Journal of Supercomputing +
+
+
+
+
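A minimal single-node sketch of the batching idea: multiplicative-update NMF where the tall data matrix is processed in row blocks, so each partial update only touches one block at a time (in a real out-of-core setting each block would be staged from disk or host memory). The distributed, GPU, and NCCL aspects are omitted, and the block size is an arbitrary assumption.

```python
import numpy as np

def nmf_row_batched(X, k, n_iter=100, batch=256, seed=0):
    """Multiplicative-update NMF (X ~= W @ H) computed over row blocks of X."""
    rng = np.random.default_rng(seed)
    m, n = X.shape
    W = rng.random((m, k)) + 1e-3
    H = rng.random((k, n)) + 1e-3
    eps = 1e-9
    for _ in range(n_iter):
        # Update H: accumulate W^T X and W^T W block by block.
        WtX = np.zeros((k, n))
        WtW = np.zeros((k, k))
        for s in range(0, m, batch):
            Wb, Xb = W[s:s + batch], X[s:s + batch]
            WtX += Wb.T @ Xb
            WtW += Wb.T @ Wb
        H *= WtX / (WtW @ H + eps)
        # Update W: each row block only needs its own X block and the small HH^T.
        HHt = H @ H.T
        for s in range(0, m, batch):
            W[s:s + batch] *= (X[s:s + batch] @ H.T) / (W[s:s + batch] @ HHt + eps)
    return W, H

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    X = rng.random((2000, 50)) @ rng.random((50, 300))   # low-rank non-negative test matrix
    W, H = nmf_row_batched(X, k=50)
    print("relative error:", np.linalg.norm(X - W @ H) / np.linalg.norm(X))
```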
+ + ♻ ☆ Progressive-Hint Prompting Improves Reasoning in Large Language Models + + +
+ The performance of Large Language Models (LLMs) in reasoning tasks depends +heavily on prompt design, with Chain-of-Thought (CoT) and self-consistency +being critical methods that enhance this ability. However, these methods do not +fully exploit the answers generated by the LLM to guide subsequent responses. +This paper proposes a new prompting method, named Progressive-Hint Prompting +(PHP), that enables automatic multiple interactions between users and LLMs by +using previously generated answers as hints to progressively guide toward the +correct answers. PHP is orthogonal to CoT and self-consistency, making it easy +to combine with state-of-the-art techniques to further improve performance. We +conducted extensive and comprehensive experiments on seven benchmarks. The +results show that PHP significantly improves accuracy while remaining highly +efficient. For instance, with text-davinci-003, we observed a 4.2% improvement +on GSM8K with greedy decoding compared to Complex CoT, and a 46.17% reduction +in sample paths with self-consistency. With GPT-4 and PHP, we achieve +state-of-the-art performances on SVAMP (89.1% -> 91.9%), GSM8K (92% -> 95.5%), +AQuA (76.4% -> 79.9%) and MATH (50.3% -> 53.9%). + +
+
+ comment: Tech Report +
+
+
+
+
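The interaction pattern described above is easy to express in code: ask once, then keep re-asking with the previous answers appended as a hint until the answer stops changing. This sketch assumes a placeholder `ask_llm(prompt)` function (any completion client could be wired in), a naive answer extractor, and an illustrative "stable for one round" stopping rule; the exact hint template in the paper may differ.

```python
def ask_llm(prompt: str) -> str:
    """Placeholder for a real LLM call; wire in any chat/completions client here."""
    return "The answer is 42"            # canned reply so the sketch runs end-to-end

def extract_answer(response: str) -> str:
    """Pull the short final answer out of the model's response (naive heuristic)."""
    return response.strip().split()[-1]

def progressive_hint_prompting(question: str, max_rounds: int = 5) -> str:
    """Re-ask the question, feeding previously generated answers back as hints,
    until two consecutive rounds agree."""
    hints, previous = [], None
    for _ in range(max_rounds):
        prompt = question + "\nLet's think step by step."
        if hints:
            prompt += f"\n(Hint: the answer is near {', '.join(hints)}.)"
        answer = extract_answer(ask_llm(prompt))
        if answer == previous:            # answer stabilised across two rounds
            return answer
        hints.append(answer)
        previous = answer
    return previous

if __name__ == "__main__":
    print(progressive_hint_prompting("What is 6 times 7?"))
```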
+ + ♻ ☆ A Feature Set of Small Size for the PDF Malware Detection KDD + + +
+ Machine learning (ML)-based malware detection systems are becoming increasingly important as malware threats increase and grow more sophisticated. PDF files are often used as vectors for phishing attacks because they are widely regarded as trustworthy data resources and are accessible across different platforms. Therefore, researchers have developed many different PDF malware detection methods. Performance in detecting PDF malware is greatly influenced by feature selection. In this research, we propose a small feature set that does not require extensive domain knowledge of the PDF file format. We evaluate the proposed features with six different machine learning models. We report a best accuracy of 99.75% when using the Random Forest model. Our proposed feature set, which consists of just 12 features, is one of the most concise in the field of PDF malware detection. Despite its modest size, we obtain results comparable to state-of-the-art methods that employ a much larger set of features. + +
+
+ comment: Accepted for publication at the ACM SIGKDD & Annual KDD Conference + workshop on Knowledge-infused Machine Learning, 2023 +
+
+
+
+
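A minimal sketch of the modelling side: a 12-dimensional feature vector per PDF fed to a Random Forest. The feature names below are hypothetical placeholders for simple structural counts, and the data is synthetic; the point is only to show the shape of the pipeline, not to reproduce the reported 99.75% accuracy.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hypothetical 12-feature layout: simple counts/flags extractable from a PDF.
FEATURES = ["size_kb", "n_pages", "n_objects", "n_streams", "n_js_keywords",
            "n_openaction", "n_launch", "n_embedded_files", "n_images",
            "n_xref_entries", "has_encryption", "header_offset"]

rng = np.random.default_rng(0)
n = 2000
X = rng.random((n, len(FEATURES)))                       # synthetic feature matrix
y = (X[:, 4] + X[:, 5] + 0.2 * rng.standard_normal(n) > 1.0).astype(int)  # toy labels

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)
clf = RandomForestClassifier(n_estimators=200, random_state=0)
clf.fit(X_tr, y_tr)
print("toy accuracy:", accuracy_score(y_te, clf.predict(X_te)))
```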
+ + ♻ ☆ Analyzing Privacy Leakage in Machine Learning via Multiple Hypothesis + Testing: A Lesson From Fano + + +
+ Differential privacy (DP) is by far the most widely accepted framework for mitigating privacy risks in machine learning. However, exactly how small the privacy parameter $\epsilon$ needs to be to protect against certain privacy risks in practice is still not well-understood. In this work, we study data reconstruction attacks for discrete data and analyze them under the framework of multiple hypothesis testing. We utilize different variants of the celebrated Fano's inequality to derive upper bounds on the inferential power of a data reconstruction adversary when the model is trained differentially privately. Importantly, we show that if the underlying private data takes values from a set of size $M$, then the target privacy parameter $\epsilon$ can be $O(\log M)$ before the adversary gains significant inferential power. Our analysis offers theoretical evidence for the empirical effectiveness of DP against data reconstruction attacks even at relatively large values of $\epsilon$. + +
+
+
+
+
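For reference, the classical corollary of Fano's inequality that this line of analysis builds on: if the private record $X$ is uniform over a set of size $M$ and the adversary outputs a reconstruction $\hat{X}$ after interacting with the trained model, then

$$ \Pr[\hat{X} \neq X] \;\geq\; 1 - \frac{I(X;\hat{X}) + \log 2}{\log M}. $$

Reconstruction therefore cannot succeed with non-trivial probability until the information leaked about $X$ is on the order of $\log M$, which is the intuition behind the abstract's claim that $\epsilon$ can grow like $O(\log M)$ before the adversary gains significant inferential power.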
+ + ♻ ☆ Privacy-Aware Compression for Federated Learning Through Numerical + Mechanism Design + + +
+ In private federated learning (FL), a server aggregates differentially +private updates from a large number of clients in order to train a machine +learning model. The main challenge in this setting is balancing privacy with +both classification accuracy of the learnt model as well as the number of bits +communicated between the clients and server. Prior work has achieved a good +trade-off by designing a privacy-aware compression mechanism, called the +minimum variance unbiased (MVU) mechanism, that numerically solves an +optimization problem to determine the parameters of the mechanism. This paper +builds upon it by introducing a new interpolation procedure in the numerical +design process that allows for a far more efficient privacy analysis. The +result is the new Interpolated MVU mechanism that is more scalable, has a +better privacy-utility trade-off, and provides SOTA results on +communication-efficient private FL on a variety of datasets. + +
+
+
+
+
+ + ♻ ☆ Benchmarking and Analyzing Robust Point Cloud Recognition: Bag of Tricks + for Defending Adversarial Examples + + +
+ Deep Neural Networks (DNNs) for 3D point cloud recognition are vulnerable to adversarial examples, threatening their practical deployment. Although many research efforts have been made to tackle this issue in recent years, the diversity of adversarial examples on 3D point clouds makes them more challenging to defend against than those on 2D images. For example, attackers can generate adversarial examples by adding, shifting, or removing points. Consequently, existing defense strategies struggle to counter unseen point cloud adversarial examples. In this paper, we first establish a comprehensive and rigorous point cloud adversarial robustness benchmark to evaluate adversarial robustness, which can provide a detailed understanding of the effects of defense and attack methods. We then collect existing defense tricks in point cloud adversarial defense and perform extensive and systematic experiments to identify an effective combination of these tricks. Furthermore, we propose a hybrid training augmentation method that incorporates various types of point cloud adversarial examples into adversarial training, significantly improving adversarial robustness. By combining these tricks, we construct a more robust defense framework achieving an average accuracy of 83.45\% against various attacks, demonstrating its capability to enable robust learners. Our codebase is open-sourced at: \url{https://github.com/qiufan319/benchmark_pc_attack.git}. + +
+
+ comment: 8 pages 6 figures +
+
+
+
+
+ + ♻ ☆ Open Problems in Computer Vision for Wilderness SAR and The Search for + Patricia Wu-Murad + + +
+ This paper details the challenges in applying two computer vision systems, an +EfficientDET supervised learning model and the unsupervised RX spectral +classifier, to 98.9 GB of drone imagery from the Wu-Murad wilderness search and +rescue (WSAR) effort in Japan and identifies 3 directions for future research. +There have been at least 19 proposed approaches and 3 datasets aimed at +locating missing persons in drone imagery, but only 3 approaches (2 +unsupervised and 1 of an unknown structure) are referenced in the literature as +having been used in an actual WSAR operation. Of these proposed approaches, the +EfficientDET architecture and the unsupervised spectral RX classifier were +selected as the most appropriate for this setting. The EfficientDET model was +applied to the HERIDAL dataset and despite achieving performance that is +statistically equivalent to the state-of-the-art, the model fails to translate +to the real world in terms of false positives (e.g., identifying tree limbs and +rocks as people), and false negatives (e.g., failing to identify members of the +search team). The poor results in practice for algorithms that showed good +results on datasets suggest 3 areas of future research: more realistic datasets +for wilderness SAR, computer vision models that are capable of seamlessly +handling the variety of imagery that can be collected during actual WSAR +operations, and better alignment on performance measures. + +
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ AI Increases Global Access to Reliable Flood Forecasts + + +
+ Floods are one of the most common and impactful natural disasters, with a +disproportionate impact in developing countries that often lack dense +streamflow monitoring networks. Accurate and timely warnings are critical for +mitigating flood risks, but accurate hydrological simulation models typically +must be calibrated to long data records in each watershed where they are +applied. We developed an Artificial Intelligence (AI) model to predict extreme +hydrological events at timescales up to 7 days in advance. This model +significantly outperforms current state of the art global hydrology models (the +Copernicus Emergency Management Service Global Flood Awareness System) across +all continents, lead times, and return periods. AI is especially effective at +forecasting in ungauged basins, which is important because only a few percent +of the world's watersheds have stream gauges, with a disproportionate number of +ungauged basins in developing countries that are especially vulnerable to the +human impacts of flooding. We produce forecasts of extreme events in South +America and Africa that achieve reliability approaching the current state of +the art in Europe and North America, and we achieve reliability at between 4 +and 6-day lead times that are similar to current state of the art nowcasts +(0-day lead time). Additionally, we achieve accuracies over 10-year return +period events that are similar to current accuracies over 2-year return period +events, meaning that AI can provide warnings earlier and over larger and more +impactful events. The model that we develop in this paper has been incorporated +into an operational early warning system that produces publicly available (free +and open) forecasts in real time in over 80 countries. This work using AI and +open data highlights a need for increasing the availability of hydrological +data to continue to improve global access to reliable flood warnings. + +
+
+
+
+
+ + ♻ ☆ Neural Model Reprogramming with Similarity Based Mapping for + Low-Resource Spoken Command Classification + + +
+ In this study, we propose a novel adversarial reprogramming (AR) approach for low-resource spoken command recognition (SCR), and build an AR-SCR system. The AR procedure aims to modify the acoustic signals (from the target domain) to repurpose a pretrained SCR model (from the source domain). To solve the label mismatches between source and target domains, and to further improve the stability of AR, we propose a novel similarity-based label mapping technique to align classes. In addition, the transfer learning (TL) technique is combined with the original AR process to improve the model adaptation capability. We evaluate the proposed AR-SCR system on three low-resource SCR datasets, including Arabic, Lithuanian, and dysarthric Mandarin speech. Experimental results show that with a pretrained acoustic model (AM) trained on a large-scale English dataset, the proposed AR-SCR system outperforms the current state-of-the-art results on the Arabic and Lithuanian speech command datasets, with only a limited amount of training data. + +
+
+ comment: Accepted to Interspeech 2023. Code is available at: + https://github.com/dodohow1011/SpeechAdvReprogram +
+
+
+
+
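The similarity-based label mapping step can be sketched independently of the reprogramming itself: run the frozen source model on the target-domain training clips, average its class probabilities per target class, and assign each target class its most similar source class. The function below is an illustrative version of that idea with hypothetical names; the paper's exact alignment rule may differ.

```python
import numpy as np

def similarity_label_mapping(source_probs, target_labels, n_target_classes):
    """Map each target class to the source class whose predicted probability,
    averaged over that target class's training examples, is highest.

    source_probs:  (n_examples, n_source_classes) softmax outputs of the
                   frozen source model on target-domain inputs.
    target_labels: (n_examples,) integer target-domain labels.
    """
    mapping, taken = {}, set()               # keep the mapping one-to-one
    for t in range(n_target_classes):
        mean_p = source_probs[target_labels == t].mean(axis=0)
        for s in np.argsort(mean_p)[::-1]:   # most similar source class first
            if int(s) not in taken:
                mapping[t] = int(s)
                taken.add(int(s))
                break
    return mapping

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    probs = rng.dirichlet(np.ones(35), size=120)   # 35 source commands, 120 clips
    labels = rng.integers(0, 10, size=120)         # 10 target commands
    print(similarity_label_mapping(probs, labels, 10))
```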
+ + ♻ ☆ Non-linear Neurons with Human-like Apical Dendrite Activations + + +
+ In order to classify linearly non-separable data, neurons are typically +organized into multi-layer neural networks that are equipped with at least one +hidden layer. Inspired by some recent discoveries in neuroscience, we propose a +new model of artificial neuron along with a novel activation function enabling +the learning of nonlinear decision boundaries using a single neuron. We show +that a standard neuron followed by our novel apical dendrite activation (ADA) +can learn the XOR logical function with 100% accuracy. Furthermore, we conduct +experiments on six benchmark data sets from computer vision, signal processing +and natural language processing, i.e. MOROCO, UTKFace, CREMA-D, Fashion-MNIST, +Tiny ImageNet and ImageNet, showing that the ADA and the leaky ADA functions +provide superior results to Rectified Linear Units (ReLU), leaky ReLU, RBF and +Swish, for various neural network architectures, e.g. one-hidden-layer or +two-hidden-layer multi-layer perceptrons (MLPs) and convolutional neural +networks (CNNs) such as LeNet, VGG, ResNet and Character-level CNN. We obtain +further performance improvements when we change the standard model of the +neuron with our pyramidal neuron with apical dendrite activations (PyNADA). Our +code is available at: https://github.com/raduionescu/pynada. + +
+
+ comment: Accepted for publication in Applied Intelligence +
+
+
+
+
+ + ♻ ☆ Which Features are Learned by CodeBert: An Empirical Study of the + BERT-based Source Code Representation Learning + + +
+ Bidirectional Encoder Representations from Transformers (BERT) was proposed for natural language processing (NLP) and has shown promising results. Recently, researchers have applied BERT to source-code representation learning and reported encouraging results on several downstream tasks. However, in this paper, we show that current methods cannot effectively understand the logic of source code: the learned representation relies heavily on programmer-defined variable and function names. We design and implement a set of experiments to demonstrate our conjecture and provide some insights for future work. + +
+
+ comment: 1 table, 2 figures +
+
+
+
+
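The conjecture, that code representations lean on identifier names rather than program logic, suggests a simple probe: embed a snippet, rename its identifiers to meaningless tokens, and compare the two embeddings. The sketch below assumes a hypothetical `embed(code)` function that would wrap whatever code model is being probed; a toy bag-of-tokens stand-in is used so the probe runs end-to-end.

```python
import re
import numpy as np

def embed(code: str) -> np.ndarray:
    """Placeholder: replace with a real code model (e.g. a pooled encoder output).
    Toy stand-in: a hashed bag-of-tokens vector."""
    vec = np.zeros(64)
    for tok in re.findall(r"\w+", code):
        vec[hash(tok) % 64] += 1.0
    return vec

def rename_identifiers(code: str, names) -> str:
    """Replace the given identifiers with opaque names v0, v1, ..."""
    for i, name in enumerate(names):
        code = re.sub(rf"\b{name}\b", f"v{i}", code)
    return code

snippet = "def total_price(items, tax):\n    return sum(items) * (1 + tax)\n"
renamed = rename_identifiers(snippet, ["total_price", "items", "tax"])

a, b = embed(snippet), embed(renamed)
cos = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
# A representation that truly captured the logic would stay close to 1.0 here.
print("cosine similarity before/after renaming:", round(cos, 3))
```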
+ + ♻ ☆ Collaborative Learning with a Drone Orchestrator + + +
+ In this paper, the problem of drone-assisted collaborative learning is considered. In this scenario, a swarm of intelligent wireless devices trains a shared neural network (NN) model with the help of a drone. Using its sensors, each device records samples from its environment to gather a local dataset for training. The training data is severely heterogeneous, as different devices have different amounts of data and different sensor noise levels. The intelligent devices iteratively train the NN on their local datasets and exchange the model parameters with the drone for aggregation. For this system, the convergence rate of collaborative learning is derived while accounting for data heterogeneity, sensor noise levels, and communication errors; the drone trajectory that maximizes the final accuracy of the trained NN is then obtained. The proposed trajectory optimization approach is aware of both the devices' data characteristics (i.e., local dataset size and noise level) and their wireless channel conditions, and significantly improves the convergence rate and final accuracy in comparison with baselines that consider only data characteristics or only channel conditions. Compared to state-of-the-art baselines, the proposed approach achieves an average 3.85% and 3.54% improvement in the final accuracy of the trained NN on benchmark datasets for image recognition and semantic segmentation tasks, respectively. Moreover, the proposed framework achieves a significant speedup in training, leading to average savings of 24% and 87% in drone hovering time, communication overhead, and battery usage for these two tasks, respectively. + +
+
+ comment: Accepted at the IEEE +
+
+
+
+
+ + ♻ ☆ Causality Guided Disentanglement for Cross-Platform Hate Speech + Detection + + +
+ Social media platforms, despite their value in promoting open discourse, are often exploited to spread harmful content. Current deep learning and natural language processing models used for detecting this harmful content rely too heavily on domain-specific terms, which limits their ability to generalize hate speech detection, because they tend to focus too narrowly on particular linguistic signals or on certain categories of words. Another significant challenge arises when platforms lack high-quality annotated data for training, leading to a need for cross-platform models that can adapt to different distribution shifts. Our research introduces a cross-platform hate speech detection model capable of being trained on one platform's data and generalizing to multiple unseen platforms. One way to achieve good generalizability across platforms is to disentangle the input representations into invariant and platform-dependent features. We also argue that learning causal relationships, which remain constant across diverse environments, can significantly aid in understanding invariant representations in hate speech. By disentangling input into platform-dependent features (useful for predicting hate targets) and platform-independent features (used to predict the presence of hate), we learn invariant representations resistant to distribution shifts. These features are then used to predict hate speech across unseen platforms. Our extensive experiments across four platforms highlight our model's enhanced efficacy compared to existing state-of-the-art methods in detecting generalized hate speech. + +
+
+
+
+
+ + ♻ ☆ There is more than one kind of robustness: Fooling Whisper with + adversarial examples + + +
+ Whisper is a recent Automatic Speech Recognition (ASR) model displaying impressive robustness to both out-of-distribution inputs and random noise. In this work, we show that this robustness does not carry over to adversarial noise. We show that we can degrade Whisper performance dramatically, or even transcribe a target sentence of our choice, by generating very small input perturbations with a Signal-to-Noise Ratio of 35-45 dB. We also show that by fooling the Whisper language detector we can very easily degrade the performance of multilingual models. These vulnerabilities of a highly popular open-source model have practical security implications and emphasize the need for adversarially robust ASR. + +
+
+ comment: Accepted at InterSpeech 2023 +
+
+
+
+
+ + ♻ ☆ How many perturbations break this model? Evaluating robustness beyond + adversarial accuracy + + +
+ Robustness to adversarial attacks is typically evaluated with adversarial accuracy. While essential, this metric does not capture all aspects of robustness and in particular leaves out the question of how many perturbations can be found for each point. In this work, we introduce an alternative approach, adversarial sparsity, which quantifies how difficult it is to find a successful perturbation given both an input point and a constraint on the direction of the perturbation. We show that sparsity provides valuable insight into neural networks in multiple ways: for instance, it illustrates important differences between current state-of-the-art robust models that accuracy analysis does not reveal, and suggests approaches for improving their robustness. When applied to broken defenses that are effective against weak attacks but not strong ones, sparsity can discriminate between totally ineffective and partially effective defenses. Finally, with sparsity we can measure increases in robustness that do not affect accuracy: we show, for example, that data augmentation can by itself increase adversarial robustness, without using adversarial training. + +
+
+
+
+
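A rough illustration of the quantity being measured: fix an input, sample many perturbation directions, and for each direction check whether some step within the $\epsilon$-ball flips the prediction; the fraction of "breaking" directions is a crude proxy for the complement of adversarial sparsity. The toy linear model and the direction-sampling scheme are simplifications of my own, not the paper's estimator.

```python
import numpy as np

rng = np.random.default_rng(0)
w, b = np.array([1.5, -2.0, 0.5]), 0.1           # toy linear classifier

def predict(x):
    return int(w @ x + b > 0)

def breaking_fraction(x, eps=0.5, n_dirs=2000, n_steps=20):
    """Fraction of random unit directions along which some perturbation of norm
    at most eps flips the classifier's decision on x."""
    base = predict(x)
    broken = 0
    for _ in range(n_dirs):
        d = rng.normal(size=x.shape)
        d /= np.linalg.norm(d)
        for t in np.linspace(eps / n_steps, eps, n_steps):   # line search along d
            if predict(x + t * d) != base:
                broken += 1
                break
    return broken / n_dirs

x = np.array([0.4, 0.1, -0.2])
print("fraction of breaking directions:", breaking_fraction(x))
```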
+ + ♻ ☆ Adversarial Erasing with Pruned Elements: Towards Better Graph Lottery + Ticket ECAI2023 + + +
+ Graph Lottery Ticket (GLT), a combination of a core subgraph and a sparse subnetwork, has been proposed to mitigate the computational cost of deep Graph Neural Networks (GNNs) on large input graphs while preserving original performance. However, the winning GLTs in existing studies are obtained by applying iterative magnitude-based pruning (IMP) without re-evaluating and re-considering the pruned information, which disregards the dynamic changes in the significance of edges/weights during graph/model structure pruning, and thus limits the appeal of the winning tickets. In this paper, we formulate a conjecture, i.e., that there exists overlooked valuable information in the pruned graph connections and model parameters which can be re-grouped into the GLT to enhance the final performance. Specifically, we propose an adversarial complementary erasing (ACE) framework to explore the valuable information from the pruned components, thereby developing a more powerful GLT, referred to as ACE-GLT. The main idea is to mine valuable information from pruned edges/weights after each round of IMP, and to employ the ACE technique to refine the GLT processing. Finally, experimental results demonstrate that our ACE-GLT outperforms existing methods for searching GLTs in diverse tasks. Our code will be made publicly available. + +
+
+ comment: 17 pages, 10 figures, Accept by ECAI2023 +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ AudioLDM 2: Learning Holistic Audio Generation with Self-supervised + Pretraining + + +
+ Although audio generation shares commonalities across different types of +audio, such as speech, music, and sound effects, designing models for each type +requires careful consideration of specific objectives and biases that can +significantly differ from those of other types. To bring us closer to a unified +perspective of audio generation, this paper proposes a framework that utilizes +the same learning method for speech, music, and sound effect generation. Our +framework introduces a general representation of audio, called language of +audio (LOA). Any audio can be translated into LOA based on AudioMAE, a +self-supervised pre-trained representation learning model. In the generation +process, we translate any modalities into LOA by using a GPT-2 model, and we +perform self-supervised audio generation learning with a latent diffusion model +conditioned on LOA. The proposed framework naturally brings advantages such as +in-context learning abilities and reusable self-supervised pretrained AudioMAE +and latent diffusion models. Experiments on the major benchmarks of +text-to-audio, text-to-music, and text-to-speech demonstrate new +state-of-the-art or competitive performance to previous approaches. Our demo +and code are available at https://audioldm.github.io/audioldm2. + +
+
+ comment: AudioLDM 2 project page is https://audioldm.github.io/audioldm2 +
+
+
+
+
+ + ☆ Speech-Driven 3D Face Animation with Composite and Regional Facial + Movements + + +
+ Speech-driven 3D face animation poses significant challenges due to the +intricacy and variability inherent in human facial movements. This paper +emphasizes the importance of considering both the composite and regional +natures of facial movements in speech-driven 3D face animation. The composite +nature pertains to how speech-independent factors globally modulate +speech-driven facial movements along the temporal dimension. Meanwhile, the +regional nature alludes to the notion that facial movements are not globally +correlated but are actuated by local musculature along the spatial dimension. +It is thus indispensable to incorporate both natures for engendering vivid +animation. To address the composite nature, we introduce an adaptive modulation +module that employs arbitrary facial movements to dynamically adjust +speech-driven facial movements across frames on a global scale. To accommodate +the regional nature, our approach ensures that each constituent of the facial +features for every frame focuses on the local spatial movements of 3D faces. +Moreover, we present a non-autoregressive backbone for translating audio to 3D +facial movements, which maintains high-frequency nuances of facial movements +and facilitates efficient inference. Comprehensive experiments and user studies +demonstrate that our method surpasses contemporary state-of-the-art approaches +both qualitatively and quantitatively. + +
+
+ comment: Accepted by MM 2023, 9 pages, 7 figures +
+
+
+
+
+ + ☆ Progressive Spatio-temporal Perception for Audio-Visual Question + Answering ACM MM 2023 + + +
+ The Audio-Visual Question Answering (AVQA) task aims to answer questions about different visual objects, sounds, and their associations in videos. Such naturally multi-modal videos are composed of rich and complex dynamic audio-visual components, most of which may be unrelated to the given question or may even act as interference when answering the content of interest. Conversely, focusing only on question-relevant audio-visual content removes this interference while enabling the model to answer more efficiently. In this paper, we propose a Progressive Spatio-Temporal Perception Network (PSTP-Net), which contains three modules that progressively identify key spatio-temporal regions w.r.t. questions. Specifically, a temporal segment selection module is first introduced to select the audio-visual segments most relevant to the given question. Then, a spatial region selection module is utilized to choose the most relevant regions associated with the question from the selected temporal segments. To further refine the selection of features, an audio-guided visual attention module is employed to perceive the association between audio and the selected spatial regions. Finally, the spatio-temporal features from these modules are integrated to answer the question. Extensive experimental results on the public MUSIC-AVQA and AVQA datasets provide compelling evidence of the effectiveness and efficiency of PSTP-Net. Code is available at: \href{https://github.com/GeWu-Lab/PSTP-Net}{https://github.com/GeWu-Lab/PSTP-Net} + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Optimizing Adaptive Video Streaming with Human Feedback + + +
+ Quality of Experience (QoE)-driven adaptive bitrate (ABR) algorithms are typically optimized using QoE models that are based on the mean opinion score (MOS), while such principles may not account for user heterogeneity on rating scales, resulting in unexpected behaviors. In this paper, we propose Jade, which leverages reinforcement learning with human feedback (RLHF) technologies to better align the users' opinion scores. Jade's rank-based QoE model considers relative values of user ratings to interpret the subjective perception of video sessions. We implement linear-based and Deep Neural Network (DNN)-based architectures for satisfying both accuracy and generalization ability. We further propose entropy-aware reinforced mechanisms for training policies with the integration of the proposed QoE models. Experimental results demonstrate that Jade performs favorably on conventional metrics, such as quality and stall ratio, and improves QoE by 8.09%-38.13% in different network conditions, emphasizing the importance of user heterogeneity in QoE modeling and the potential of combining linear-based and DNN-based models for performance improvement. + +
+
+ comment: ACM Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ Learning Music-Dance Representations through Explicit-Implicit Rhythm + Synchronization + + +
+ Although audio-visual representation has been proved to be applicable in many +downstream tasks, the representation of dancing videos, which is more specific +and always accompanied by music with complex auditory contents, remains +challenging and uninvestigated. Considering the intrinsic alignment between the +cadent movement of dancer and music rhythm, we introduce MuDaR, a novel +Music-Dance Representation learning framework to perform the synchronization of +music and dance rhythms both in explicit and implicit ways. Specifically, we +derive the dance rhythms based on visual appearance and motion cues inspired by +the music rhythm analysis. Then the visual rhythms are temporally aligned with +the music counterparts, which are extracted by the amplitude of sound +intensity. Meanwhile, we exploit the implicit coherence of rhythms implied in +audio and visual streams by contrastive learning. The model learns the joint +embedding by predicting the temporal consistency between audio-visual pairs. +The music-dance representation, together with the capability of detecting audio +and visual rhythms, can further be applied to three downstream tasks: (a) dance +classification, (b) music-dance retrieval, and (c) music-dance retargeting. +Extensive experiments demonstrate that our proposed framework outperforms other +self-supervised methods by a large margin. + +
+
+ comment: Accepted for publication in IEEE Transactions on Multimedia +
+
+
+
+
+ + ♻ ☆ A Closer Look at Audio-Visual Semantic Segmentation + + +
+ Audio-visual segmentation (AVS) is a complex task that involves accurately +segmenting the corresponding sounding object based on audio-visual queries. +Successful audio-visual learning requires two essential components: 1) an +unbiased dataset with high-quality pixel-level multi-class labels, and 2) a +model capable of effectively linking audio information with its corresponding +visual object. However, these two requirements are only partially addressed by +current methods, with training sets containing biased audio-visual data, and +models that generalise poorly beyond this biased training set. In this work, we +propose a new strategy to build cost-effective and relatively unbiased +audio-visual semantic segmentation benchmarks. Our strategy, called Visual +Post-production (VPO), explores the observation that it is not necessary to +have explicit audio-visual pairs extracted from single video sources to build +such benchmarks. We also refine the previously proposed AVSBench to transform +it into the audio-visual semantic segmentation benchmark AVSBench-Single+. +Furthermore, this paper introduces a new pixel-wise audio-visual contrastive +learning method to enable a better generalisation of the model beyond the +training set. We verify the validity of the VPO strategy by showing that +state-of-the-art (SOTA) models trained with datasets built by matching audio +and visual data from different sources or with datasets containing audio and +visual data from the same video source produce almost the same accuracy. Then, +using the proposed VPO benchmarks and AVSBench-Single+, we show that our method +produces more accurate audio-visual semantic segmentation than SOTA models. +Code and dataset will be available. + +
+
+
+
+
+ + ♻ ☆ DiffSynth: Latent In-Iteration Deflickering for Realistic Video + Synthesis + + +
+ In recent years, diffusion models have emerged as the most powerful approach +in image synthesis. However, applying these models directly to video synthesis +presents challenges, as it often leads to noticeable flickering contents. +Although recently proposed zero-shot methods can alleviate flicker to some +extent, we still struggle to generate coherent videos. In this paper, we +propose DiffSynth, a novel approach that aims to convert image synthesis +pipelines to video synthesis pipelines. DiffSynth consists of two key +components: a latent in-iteration deflickering framework and a video +deflickering algorithm. The latent in-iteration deflickering framework applies +video deflickering to the latent space of diffusion models, effectively +preventing flicker accumulation in intermediate steps. Additionally, we propose +a video deflickering algorithm, named patch blending algorithm, that remaps +objects in different frames and blends them together to enhance video +consistency. One of the notable advantages of DiffSynth is its general +applicability to various video synthesis tasks, including text-guided video +stylization, fashion video synthesis, image-guided video stylization, video +restoring, and 3D rendering. In the task of text-guided video stylization, we +make it possible to synthesize high-quality videos without cherry-picking. The +experimental results demonstrate the effectiveness of DiffSynth. All videos can +be viewed on our project page. Source codes will also be released. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ 

diff --git a/index.js b/index.js new file mode 100644 index 00000000..69f5da7b --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@
+/* Expand/Collapse all entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) { // TAB key
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the preference
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the preference
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`