diff --git a/backend/.gitignore b/backend/.gitignore index 73d4ca56..5cbbf7cf 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -1,4 +1,5 @@ .DS_Store +.swp # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/backend/backend/Chat_functionality/config.py b/backend/backend/Chat_functionality/config.py index 812d1f77..7fd250fa 100644 --- a/backend/backend/Chat_functionality/config.py +++ b/backend/backend/Chat_functionality/config.py @@ -1,10 +1,9 @@ from pydantic_settings import BaseSettings -from dotenv import load_dotenv -import os - -load_dotenv() # Manually load the .env file class Settings(BaseSettings): + neo4j_uri: str + neo4j_user: str + neo4j_password: str aws_access_key_id: str aws_secret_access_key: str together_api_key: str diff --git a/backend/backend/Chat_functionality/embeddings.py b/backend/backend/Chat_functionality/embeddings.py index f0d1f443..7ef1992d 100644 --- a/backend/backend/Chat_functionality/embeddings.py +++ b/backend/backend/Chat_functionality/embeddings.py @@ -18,8 +18,8 @@ def get_embeddings(texts: List[str], model: str) -> List[List[float]]: outputs = client.embeddings.create(input = texts, model=model) return [outputs.data[i].embedding for i in range(len(texts))] -input_texts = ['Our solar system orbits the Milky Way galaxy at about 515,000 mph'] -embeddings = get_embeddings(input_texts, model='togethercomputer/m2-bert-80M-8k-retrieval') +# input_texts = ['Our solar system orbits the Milky Way galaxy at about 515,000 mph'] +# embeddings = get_embeddings(input_texts, model='togethercomputer/m2-bert-80M-8k-retrieval') #print(embeddings) # [[0.13437459, 0.09866201, ..., -0.20736569]] diff --git a/backend/backend/__pycache__/main.cpython-311.pyc b/backend/backend/__pycache__/main.cpython-311.pyc index 1da424ed..51802cb9 100644 Binary files a/backend/backend/__pycache__/main.cpython-311.pyc and b/backend/backend/__pycache__/main.cpython-311.pyc differ diff --git a/backend/backend/main.py b/backend/backend/main.py index c90d0b83..200ad21b 100644 --- a/backend/backend/main.py +++ b/backend/backend/main.py @@ -1,5 +1,7 @@ -from fastapi import FastAPI, HTTPException import dotenv +dotenv.load_dotenv() + +from fastapi import FastAPI, HTTPException import os from utils.api.workspace import create_workspace, add_paper_to_workspace, get_all_workspaces, get_workspace from models.request_models import Workspace @@ -7,8 +9,8 @@ from utils.api.paper import upload_paper_with_metadata, get_all_papers from utils.api.search import top_k_abstract_query import together +from utils.scripts.seed_db import seed_db -dotenv.load_dotenv() together.api_key = os.getenv("TOGETHER_API_KEY") from pydantic import BaseModel @@ -20,7 +22,10 @@ from typing import List from Chat_functionality.embeddings import get_embeddings from Chat_functionality.dependencies import get_s3_client, get_together_client +from utils.cluster_papers import cluster_papers +import math from models.chat_models import ChatInput + settings = Settings() together_api_key = get_together_client @@ -96,7 +101,7 @@ def chat( paper_id: str, chat_input: ChatInput): context = {'chat_history': []} - #updating context with new chat input and embeddings +# #updating context with new chat input and embeddings #Note: You'd also want to store the chat output here new_chat_input = chat_input.question @@ -104,7 +109,7 @@ def chat( paper_id: str, chat_input: ChatInput): context['chat_history'].append({'input':new_chat_input, 'embedding': embeddings}) #save updated context - concatenated_prompts = 
"\n".join([entry['input'] for entry in context['chat_history']]) + "\n" + new_chat_input +# concatenated_prompts = "\n".join([entry['input'] for entry in context['chat_history']]) + "\n" + new_chat_input save_to_storage(paper_id, context) @@ -121,6 +126,18 @@ def chat( paper_id: str, chat_input: ChatInput): def get_graph(): return {"Graph": "GET Request"} +#NOTE: the only purpose of exposing these two methods as routes is to test them. this should be removed in production +@app.get("/seed") +def seed(): + seed_db() + return {"message": "Database seeded successfully"} + +# @app.get("/cluster_papers") +# def cluster_papers_route(): +# papers = get_all_papers() +# cluster_papers(papers=papers, n_clusters=math.floor(math.sqrt(len(papers)))) +# return {"message": "Papers clustered successfully"} + if __name__ == "__main__": import uvicorn @@ -129,5 +146,3 @@ def get_graph(): sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) from backend.Chat_functionality.config import settings uvicorn.run(app, host="0.0.0.0", port=8000) - - diff --git a/backend/backend/models/__init__.py b/backend/backend/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/backend/models/__pycache__/dto_models.cpython-311.pyc b/backend/backend/models/__pycache__/dto_models.cpython-311.pyc index 069ff2f8..2416548b 100644 Binary files a/backend/backend/models/__pycache__/dto_models.cpython-311.pyc and b/backend/backend/models/__pycache__/dto_models.cpython-311.pyc differ diff --git a/backend/backend/models/__pycache__/request_models.cpython-311.pyc b/backend/backend/models/__pycache__/request_models.cpython-311.pyc index 42e8a339..02be5ddc 100644 Binary files a/backend/backend/models/__pycache__/request_models.cpython-311.pyc and b/backend/backend/models/__pycache__/request_models.cpython-311.pyc differ diff --git a/backend/backend/models/dto_models.py b/backend/backend/models/dto_models.py index 449e0c26..f6ae7931 100644 --- a/backend/backend/models/dto_models.py +++ b/backend/backend/models/dto_models.py @@ -1,4 +1,6 @@ from models.request_models import Paper +from pydantic import BaseModel +from typing import List, Optional class Paper(Paper): cite_count: int @@ -8,4 +10,6 @@ class Paper(Paper): publication_date: str pdf_blob: str arxiv_id: str - raw_markdown: str \ No newline at end of file + raw_markdown: str + abstract_embedding: Optional[List[float]] = None + id: str \ No newline at end of file diff --git a/backend/backend/utils/__pycache__/embeddings.cpython-311.pyc b/backend/backend/utils/__pycache__/embeddings.cpython-311.pyc index 7729ac15..63e9bc63 100644 Binary files a/backend/backend/utils/__pycache__/embeddings.cpython-311.pyc and b/backend/backend/utils/__pycache__/embeddings.cpython-311.pyc differ diff --git a/backend/backend/utils/api/__pycache__/paper.cpython-311.pyc b/backend/backend/utils/api/__pycache__/paper.cpython-311.pyc index 17ea5e41..7b440a89 100644 Binary files a/backend/backend/utils/api/__pycache__/paper.cpython-311.pyc and b/backend/backend/utils/api/__pycache__/paper.cpython-311.pyc differ diff --git a/backend/backend/utils/api/__pycache__/search.cpython-311.pyc b/backend/backend/utils/api/__pycache__/search.cpython-311.pyc index b8ebf933..d24a77ac 100644 Binary files a/backend/backend/utils/api/__pycache__/search.cpython-311.pyc and b/backend/backend/utils/api/__pycache__/search.cpython-311.pyc differ diff --git a/backend/backend/utils/api/__pycache__/workspace.cpython-311.pyc 
b/backend/backend/utils/api/__pycache__/workspace.cpython-311.pyc index ce33d5dc..da44b7cf 100644 Binary files a/backend/backend/utils/api/__pycache__/workspace.cpython-311.pyc and b/backend/backend/utils/api/__pycache__/workspace.cpython-311.pyc differ diff --git a/backend/backend/utils/api/paper.py b/backend/backend/utils/api/paper.py index 36eb6644..dce489be 100644 --- a/backend/backend/utils/api/paper.py +++ b/backend/backend/utils/api/paper.py @@ -4,6 +4,7 @@ from models.dto_models import Paper from utils.db.neo4j import driver from utils.embeddings import get_embeddings +import uuid # BEGIN CODE FROM DAVID TO GET METADATA def only_id(filename): @@ -65,6 +66,7 @@ def upload_paper_with_metadata(paper: Paper): #compute the abstract embedding; abstract_embedding = get_embeddings([paper.abstract], model='togethercomputer/m2-bert-80M-8k-retrieval')[0] + id = str(uuid.uuid4()) QUERY = """ MERGE (p:Paper {arxiv_id: $arxiv_id}) @@ -76,11 +78,12 @@ def upload_paper_with_metadata(paper: Paper): p.pdf_blob = $pdf_blob, p.raw_markdown = $raw_markdown, - p.abstract_embedding = $abstract_embedding + p.abstract_embedding = $abstract_embedding, + p.id = $id RETURN p """ with driver.session() as session: - result = session.run(QUERY, arxiv_id=paper.arxiv_id, title=paper.title, abstract=paper.abstract, publication_date=paper.publication_date, cite_count=paper.cite_count, inf_cite_count=paper.inf_cite_count, pdf_blob=paper.pdf_blob, raw_markdown=paper.raw_markdown, abstract_embedding=abstract_embedding) + result = session.run(QUERY, id=id, arxiv_id=paper.arxiv_id, title=paper.title, abstract=paper.abstract, publication_date=paper.publication_date, cite_count=paper.cite_count, inf_cite_count=paper.inf_cite_count, pdf_blob=paper.pdf_blob, raw_markdown=paper.raw_markdown, abstract_embedding=abstract_embedding) return result.data() def get_all_papers(): @@ -90,9 +93,9 @@ QUERY = """ MATCH (p:Paper) - RETURN p + RETURN p.id as id, p.arxiv_id as arxiv_id, p.title as title, p.abstract as abstract, p.publication_date as publication_date, p.cite_count as cite_count, p.inf_cite_count as inf_cite_count, p.pdf_blob as pdf_blob, p.raw_markdown as raw_markdown, p.abstract_embedding as abstract_embedding """ - with driver.session() as session: - result = session.run(QUERY) - return result.data() \ No newline at end of file + records, summary, keys = driver.execute_query(QUERY) + return records + # return [record["p"] for record in result] \ No newline at end of file diff --git a/backend/backend/utils/cluster_papers.py b/backend/backend/utils/cluster_papers.py new file mode 100644 index 00000000..86980410 --- /dev/null +++ b/backend/backend/utils/cluster_papers.py @@ -0,0 +1,154 @@ +""" +Cluster research papers based on their abstracts, and store the clusters in the database +The purpose of this is to aid with the graph visualization feature: nodes returned in the search results will be connected to their respective clusters, containing a summary +The clustering is hierarchical, so the clusters will be further clustered and summarized, creating a tree-like structure.
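+Each level of clusters is summarized and then re-clustered into floor(sqrt(previous cluster count)) clusters, and the recursion stops once only a single cluster would remain.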
+""" + +from typing import List +from sklearn.mixture import GaussianMixture +from sklearn.manifold import TSNE +import numpy as np +from utils.db.neo4j import driver +from pydantic import BaseModel +import uuid +from embeddings import get_embeddings +from models.dto_models import Paper +import os +from openai import OpenAI +import json +import math + +client = OpenAI( + api_key=os.getenv("TOGETHER_API_KEY"), + base_url='https://api.together.xyz/v1', +) + +class PaperClusterNode(BaseModel): + id: str + title: str + text: str # this will either be the summary if the node is a cluster, or the abstract if it's a paper + embedding: List[float] + +class SummaryResponse(BaseModel): + title: str + summary: str + +def cluster_papers(papers: List[Paper], n_clusters: int = 10): + print(papers) + # convert the papers to PaperClusterNode objects + paper_nodes = [PaperClusterNode(id=paper["id"], text=paper["abstract"], embedding=paper["abstract_embedding"], title=paper["title"]) for i, paper in enumerate(papers)] + recursive_cluster(paper_nodes, n_clusters) + +def create_cluster_summaries(cluster_texts: List[List[str]]) -> List[str]: + """ + Create a summary of the cluster based on the text of the papers in the cluster + """ + + summaries: List[SummaryResponse] = [] + + for i, abstracts in enumerate(cluster_texts): + chat_completion = client.chat.completions.create( + messages=[ + { + "role": "system", + "content": + """ + Given the following instructions: + +- You are an expert supparizer, tasked with creating a summary of a cluster of research papers, based on the abstracts of the papers. +- The purpose of the application is to help researchers find papers, so the papers need to be grouped by topic. +- The paper clusters were chosen based on Gaussian Mixture Model (GMM) clustering of paper embeddings. +- The summaries should be concise, around 3 sentences long, and not too broad. +- You are given all the abstracts from a single cluster of papers, and you need to summarize them into a single summary. + +The output must be in the following JSON format (this is VERY IMPORTANT): + +{ + "title": "Title of Cluster", + "summary": "Concise 3-sentence summary of the Cluster" +} + +You must output the result in this exact JSON format. Do not include any additional text or information outside of the JSON object. 
+ """ + }, + { + "role": "user", + "content": "Here are the abstracts of the papers in the cluster: " + "\n\n".join(abstracts) + } + ], + model="mistralai/Mixtral-8x7B-Instruct-v0.1" + ) + + # print("RAW RESPONSE", chat_completion.choices[0].message.content) + response = json.loads(chat_completion.choices[0].message.content) + print("SUMMARIZATION", response) + summaries.append(SummaryResponse(**response)) + + return summaries + + +def recursive_cluster(nodes: List[PaperClusterNode], n_clusters: int = 5): + """ + Recursively cluster the papers based on their embeddings and store the clusters in the database + 'node' can be either a paper (if it's a leaf node) or a cluster (if it's an internal node in the tree) + """ + + # convert the embeddings to t-sne + embeddings = np.array([node.embedding for node in nodes]) + tsne = TSNE(n_components=2, perplexity=len(nodes)-1, n_iter=5000) + tsne_embeddings = tsne.fit_transform(embeddings) + + # cluster the papers + gmm = GaussianMixture(n_components=n_clusters) + clusters = gmm.fit_predict(tsne_embeddings) + + # get the text of all the nodes belonging to each cluster + cluster_texts = [[] for _ in range(n_clusters)] + + for i, cluster in enumerate(clusters): + print(cluster, nodes[i].text) + print("CLUSTER", cluster) + cluster_texts[cluster].append(nodes[i].text) + + # summarize the text of each cluster + cluster_summaries = create_cluster_summaries(cluster_texts) + cluster_objects = [] + + # store the clusters in the database + with driver.session() as session: + for i, summary_res in enumerate(cluster_summaries): + cluster_uuid = str(uuid.uuid4()) + + cluster_objects.append(PaperClusterNode( + id=cluster_uuid, + title=summary_res.title, + text=summary_res.summary, + embedding=get_embeddings([summary_res.summary], "togethercomputer/m2-bert-80M-8k-retrieval")[0] + )) + + # Convert PaperClusterNode objects to dictionaries (to avoid type errors) + node_data = [ + { + "id": node.id, + "title": node.title, + "text": node.text, + "embedding": node.embedding + } + for j, node in enumerate(nodes) if clusters[j] == i + ] + + session.run("MERGE (c:Cluster {id: $id, summary: $summary, title: $title})", id=cluster_uuid, summary=summary_res.summary, title=summary_res.title) + session.run(""" + WITH $papers as papers + UNWIND papers as paper + MATCH (p:Paper|Cluster {id: paper.id}) + MATCH (c:Cluster {id: $cluster_id}) + MERGE (c)-[:CONTAINS]->(p) + """, papers=node_data, cluster_id=cluster_uuid) + + new_n_clusters = math.floor(math.sqrt(len(cluster_objects))) + if new_n_clusters > 1: + recursive_cluster(cluster_objects, new_n_clusters) + + return + diff --git a/backend/backend/utils/db/__pycache__/__init__.cpython-311.pyc b/backend/backend/utils/db/__pycache__/__init__.cpython-311.pyc index 2519c45a..9a86610f 100644 Binary files a/backend/backend/utils/db/__pycache__/__init__.cpython-311.pyc and b/backend/backend/utils/db/__pycache__/__init__.cpython-311.pyc differ diff --git a/backend/backend/utils/db/__pycache__/neo4j.cpython-311.pyc b/backend/backend/utils/db/__pycache__/neo4j.cpython-311.pyc index c9e3378a..884eadcc 100644 Binary files a/backend/backend/utils/db/__pycache__/neo4j.cpython-311.pyc and b/backend/backend/utils/db/__pycache__/neo4j.cpython-311.pyc differ diff --git a/backend/backend/utils/db/neo4j_schema.cql b/backend/backend/utils/db/neo4j_schema.cql index 77375ef1..d92f6677 100644 --- a/backend/backend/utils/db/neo4j_schema.cql +++ b/backend/backend/utils/db/neo4j_schema.cql @@ -4,3 +4,6 @@ CALL 
db.index.vector.createNodeIndex('abstract_embedding_index', 'Paper', 'abstr CREATE CONSTRAINT ON (p:Paper) ASSERT p.id IS UNIQUE CREATE CONSTRAINT ON (w:Workspace) ASSERT w.id IS UNIQUE CREATE CONSTRAINT ON (u:User) ASSERT u.id IS UNIQUE + +-- Create some seeding data +CREATE (u:User { id: 'user1', name: 'User 1' }) diff --git a/backend/backend/utils/scripts/__init__.py b/backend/backend/utils/scripts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/backend/utils/scripts/paper_metadata.json b/backend/backend/utils/scripts/paper_metadata.json new file mode 100644 index 00000000..4c439b9f --- /dev/null +++ b/backend/backend/utils/scripts/paper_metadata.json @@ -0,0 +1,8422 @@ +[ + { + "id":"2010.11929", + "submitter":"Alexey Dosovitskiy", + "authors":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk\n Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias\n Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit and Neil Houlsby", + "title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at\n Scale", + "comments":"Fine-tuning code and pre-trained models are available at\n https:\/\/github.com\/google-research\/vision_transformer. ICLR camera-ready\n version with 2 small modifications: 1) Added a discussion of CLS vs GAP\n classifier in the appendix, 2) Fixed an error in exaFLOPs computation in\n Figure 5 and Table 6 (relative performance of models is basically not\n affected)", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" While the Transformer architecture has become the de-facto standard for\nnatural language processing tasks, its applications to computer vision remain\nlimited. In vision, attention is either applied in conjunction with\nconvolutional networks, or used to replace certain components of convolutional\nnetworks while keeping their overall structure in place. 
We show that this\nreliance on CNNs is not necessary and a pure transformer applied directly to\nsequences of image patches can perform very well on image classification tasks.\nWhen pre-trained on large amounts of data and transferred to multiple mid-sized\nor small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision\nTransformer (ViT) attains excellent results compared to state-of-the-art\nconvolutional networks while requiring substantially fewer computational\nresources to train.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 22 Oct 2020 17:55:59 GMT" + }, + { + "version":"v2", + "created":"Thu, 3 Jun 2021 13:08:56 GMT" + } + ], + "update_date":"2021-06-04", + "authors_parsed":[ + [ + "Dosovitskiy", + "Alexey", + "" + ], + [ + "Beyer", + "Lucas", + "" + ], + [ + "Kolesnikov", + "Alexander", + "" + ], + [ + "Weissenborn", + "Dirk", + "" + ], + [ + "Zhai", + "Xiaohua", + "" + ], + [ + "Unterthiner", + "Thomas", + "" + ], + [ + "Dehghani", + "Mostafa", + "" + ], + [ + "Minderer", + "Matthias", + "" + ], + [ + "Heigold", + "Georg", + "" + ], + [ + "Gelly", + "Sylvain", + "" + ], + [ + "Uszkoreit", + "Jakob", + "" + ], + [ + "Houlsby", + "Neil", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG" + ], + "citation_count":19021.0, + "inf_cite_count":3121.0, + "publication_date":"2020-10-22" + }, + { + "id":"1710.10903", + "submitter":"Petar Veli\\v{c}kovi\\'c", + "authors":"Petar Veli\\v{c}kovi\\'c, Guillem Cucurull, Arantxa Casanova, Adriana\n Romero, Pietro Li\\`o, Yoshua Bengio", + "title":"Graph Attention Networks", + "comments":"To appear at ICLR 2018. 12 pages, 2 figures", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"stat.ML cs.AI cs.LG cs.SI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We present graph attention networks (GATs), novel neural network\narchitectures that operate on graph-structured data, leveraging masked\nself-attentional layers to address the shortcomings of prior methods based on\ngraph convolutions or their approximations. By stacking layers in which nodes\nare able to attend over their neighborhoods' features, we enable (implicitly)\nspecifying different weights to different nodes in a neighborhood, without\nrequiring any kind of costly matrix operation (such as inversion) or depending\non knowing the graph structure upfront. In this way, we address several key\nchallenges of spectral-based graph neural networks simultaneously, and make our\nmodel readily applicable to inductive as well as transductive problems. 
Our GAT\nmodels have achieved or matched state-of-the-art results across four\nestablished transductive and inductive graph benchmarks: the Cora, Citeseer and\nPubmed citation network datasets, as well as a protein-protein interaction\ndataset (wherein test graphs remain unseen during training).\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 30 Oct 2017 12:41:12 GMT" + }, + { + "version":"v2", + "created":"Wed, 20 Dec 2017 11:18:15 GMT" + }, + { + "version":"v3", + "created":"Sun, 4 Feb 2018 19:13:29 GMT" + } + ], + "update_date":"2018-02-06", + "authors_parsed":[ + [ + "Veli\u010dkovi\u0107", + "Petar", + "" + ], + [ + "Cucurull", + "Guillem", + "" + ], + [ + "Casanova", + "Arantxa", + "" + ], + [ + "Romero", + "Adriana", + "" + ], + [ + "Li\u00f2", + "Pietro", + "" + ], + [ + "Bengio", + "Yoshua", + "" + ] + ], + "categories_split":[ + "stat.ML", + "cs.AI", + "cs.LG", + "cs.SI" + ], + "citation_count":14547.0, + "inf_cite_count":2782.0, + "publication_date":"2017-10-30" + }, + { + "id":"1106.1813", + "submitter":"K. W. Bowyer", + "authors":"N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer", + "title":"SMOTE: Synthetic Minority Over-sampling Technique", + "comments":null, + "journal-ref":"Journal Of Artificial Intelligence Research, Volume 16, pages\n 321-357, 2002", + "doi":"10.1613\/jair.953", + "report-no":null, + "categories":"cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" An approach to the construction of classifiers from imbalanced datasets is\ndescribed. A dataset is imbalanced if the classification categories are not\napproximately equally represented. Often real-world data sets are predominately\ncomposed of \"normal\" examples with only a small percentage of \"abnormal\" or\n\"interesting\" examples. It is also the case that the cost of misclassifying an\nabnormal (interesting) example as a normal example is often much higher than\nthe cost of the reverse error. Under-sampling of the majority (normal) class\nhas been proposed as a good means of increasing the sensitivity of a classifier\nto the minority class. This paper shows that a combination of our method of\nover-sampling the minority (abnormal) class and under-sampling the majority\n(normal) class can achieve better classifier performance (in ROC space) than\nonly under-sampling the majority class. This paper also shows that a\ncombination of our method of over-sampling the minority class and\nunder-sampling the majority class can achieve better classifier performance (in\nROC space) than varying the loss ratios in Ripper or class priors in Naive\nBayes. Our method of over-sampling the minority class involves creating\nsynthetic minority class examples. Experiments are performed using C4.5, Ripper\nand a Naive Bayes classifier. The method is evaluated using the area under the\nReceiver Operating Characteristic curve (AUC) and the ROC convex hull strategy.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 9 Jun 2011 13:53:42 GMT" + } + ], + "update_date":"2011-11-25", + "authors_parsed":[ + [ + "Chawla", + "N. V.", + "" + ], + [ + "Bowyer", + "K. W.", + "" + ], + [ + "Hall", + "L. O.", + "" + ], + [ + "Kegelmeyer", + "W. 
P.", + "" + ] + ], + "categories_split":[ + "cs.AI" + ], + "citation_count":20793.0, + "inf_cite_count":2375.0, + "publication_date":null + }, + { + "id":"1703.03400", + "submitter":"Chelsea Finn", + "authors":"Chelsea Finn, Pieter Abbeel, Sergey Levine", + "title":"Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks", + "comments":"ICML 2017. Code at https:\/\/github.com\/cbfinn\/maml, Videos of RL\n results at https:\/\/sites.google.com\/view\/maml, Blog post at\n http:\/\/bair.berkeley.edu\/blog\/2017\/07\/18\/learning-to-learn\/", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.CV cs.NE", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We propose an algorithm for meta-learning that is model-agnostic, in the\nsense that it is compatible with any model trained with gradient descent and\napplicable to a variety of different learning problems, including\nclassification, regression, and reinforcement learning. The goal of\nmeta-learning is to train a model on a variety of learning tasks, such that it\ncan solve new learning tasks using only a small number of training samples. In\nour approach, the parameters of the model are explicitly trained such that a\nsmall number of gradient steps with a small amount of training data from a new\ntask will produce good generalization performance on that task. In effect, our\nmethod trains the model to be easy to fine-tune. We demonstrate that this\napproach leads to state-of-the-art performance on two few-shot image\nclassification benchmarks, produces good results on few-shot regression, and\naccelerates fine-tuning for policy gradient reinforcement learning with neural\nnetwork policies.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 9 Mar 2017 18:58:03 GMT" + }, + { + "version":"v2", + "created":"Tue, 9 May 2017 17:14:08 GMT" + }, + { + "version":"v3", + "created":"Tue, 18 Jul 2017 16:45:29 GMT" + } + ], + "update_date":"2017-07-19", + "authors_parsed":[ + [ + "Finn", + "Chelsea", + "" + ], + [ + "Abbeel", + "Pieter", + "" + ], + [ + "Levine", + "Sergey", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.CV", + "cs.NE" + ], + "citation_count":9445.0, + "inf_cite_count":2209.0, + "publication_date":"2017-03-09" + }, + { + "id":"1705.07874", + "submitter":"Scott Lundberg", + "authors":"Scott Lundberg and Su-In Lee", + "title":"A Unified Approach to Interpreting Model Predictions", + "comments":"To appear in NIPS 2017", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.AI cs.LG stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Understanding why a model makes a certain prediction can be as crucial as the\nprediction's accuracy in many applications. However, the highest accuracy for\nlarge modern datasets is often achieved by complex models that even experts\nstruggle to interpret, such as ensemble or deep learning models, creating a\ntension between accuracy and interpretability. In response, various methods\nhave recently been proposed to help users interpret the predictions of complex\nmodels, but it is often unclear how these methods are related and when one\nmethod is preferable over another. To address this problem, we present a\nunified framework for interpreting predictions, SHAP (SHapley Additive\nexPlanations). SHAP assigns each feature an importance value for a particular\nprediction. 
Its novel components include: (1) the identification of a new class\nof additive feature importance measures, and (2) theoretical results showing\nthere is a unique solution in this class with a set of desirable properties.\nThe new class unifies six existing methods, notable because several recent\nmethods in the class lack the proposed desirable properties. Based on insights\nfrom this unification, we present new methods that show improved computational\nperformance and\/or better consistency with human intuition than previous\napproaches.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 22 May 2017 17:38:10 GMT" + }, + { + "version":"v2", + "created":"Sat, 25 Nov 2017 03:53:32 GMT" + } + ], + "update_date":"2017-11-28", + "authors_parsed":[ + [ + "Lundberg", + "Scott", + "" + ], + [ + "Lee", + "Su-In", + "" + ] + ], + "categories_split":[ + "cs.AI", + "cs.LG", + "stat.ML" + ], + "citation_count":13635.0, + "inf_cite_count":1692.0, + "publication_date":"2017-05-22" + }, + { + "id":"1610.02391", + "submitter":"Ramprasaath R. Selvaraju", + "authors":"Ramprasaath R. Selvaraju, Michael Cogswell, Abhishek Das, Ramakrishna\n Vedantam, Devi Parikh, Dhruv Batra", + "title":"Grad-CAM: Visual Explanations from Deep Networks via Gradient-based\n Localization", + "comments":"This version was published in International Journal of Computer\n Vision (IJCV) in 2019; A previous version of the paper was published at\n International Conference on Computer Vision (ICCV'17)", + "journal-ref":null, + "doi":"10.1007\/s11263-019-01228-7", + "report-no":null, + "categories":"cs.CV cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We propose a technique for producing \"visual explanations\" for decisions from\na large class of CNN-based models, making them more transparent. Our approach -\nGradient-weighted Class Activation Mapping (Grad-CAM), uses the gradients of\nany target concept, flowing into the final convolutional layer to produce a\ncoarse localization map highlighting important regions in the image for\npredicting the concept. Grad-CAM is applicable to a wide variety of CNN\nmodel-families: (1) CNNs with fully-connected layers, (2) CNNs used for\nstructured outputs, (3) CNNs used in tasks with multimodal inputs or\nreinforcement learning, without any architectural changes or re-training. We\ncombine Grad-CAM with fine-grained visualizations to create a high-resolution\nclass-discriminative visualization and apply it to off-the-shelf image\nclassification, captioning, and visual question answering (VQA) models,\nincluding ResNet-based architectures. In the context of image classification\nmodels, our visualizations (a) lend insights into their failure modes, (b) are\nrobust to adversarial images, (c) outperform previous methods on localization,\n(d) are more faithful to the underlying model and (e) help achieve\ngeneralization by identifying dataset bias. For captioning and VQA, we show\nthat even non-attention based models can localize inputs. We devise a way to\nidentify important neurons through Grad-CAM and combine it with neuron names to\nprovide textual explanations for model decisions. Finally, we design and\nconduct human studies to measure if Grad-CAM helps users establish appropriate\ntrust in predictions from models and show that Grad-CAM helps untrained users\nsuccessfully discern a 'stronger' nodel from a 'weaker' one even when both make\nidentical predictions. 
Our code is available at\nhttps:\/\/github.com\/ramprs\/grad-cam\/, along with a demo at\nhttp:\/\/gradcam.cloudcv.org, and a video at youtu.be\/COjUB9Izk6E.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 7 Oct 2016 19:54:24 GMT" + }, + { + "version":"v2", + "created":"Fri, 30 Dec 2016 07:19:35 GMT" + }, + { + "version":"v3", + "created":"Tue, 21 Mar 2017 23:48:00 GMT" + }, + { + "version":"v4", + "created":"Tue, 3 Dec 2019 02:13:03 GMT" + } + ], + "update_date":"2019-12-04", + "authors_parsed":[ + [ + "Selvaraju", + "Ramprasaath R.", + "" + ], + [ + "Cogswell", + "Michael", + "" + ], + [ + "Das", + "Abhishek", + "" + ], + [ + "Vedantam", + "Ramakrishna", + "" + ], + [ + "Parikh", + "Devi", + "" + ], + [ + "Batra", + "Dhruv", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG" + ], + "citation_count":14387.0, + "inf_cite_count":1689.0, + "publication_date":"2016-10-07" + }, + { + "id":"1602.04938", + "submitter":"Marco Tulio Ribeiro", + "authors":"Marco Tulio Ribeiro and Sameer Singh and Carlos Guestrin", + "title":"\"Why Should I Trust You?\": Explaining the Predictions of Any Classifier", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Despite widespread adoption, machine learning models remain mostly black\nboxes. Understanding the reasons behind predictions is, however, quite\nimportant in assessing trust, which is fundamental if one plans to take action\nbased on a prediction, or when choosing whether to deploy a new model. Such\nunderstanding also provides insights into the model, which can be used to\ntransform an untrustworthy model or prediction into a trustworthy one. In this\nwork, we propose LIME, a novel explanation technique that explains the\npredictions of any classifier in an interpretable and faithful manner, by\nlearning an interpretable model locally around the prediction. We also propose\na method to explain models by presenting representative individual predictions\nand their explanations in a non-redundant way, framing the task as a submodular\noptimization problem. We demonstrate the flexibility of these methods by\nexplaining different models for text (e.g. random forests) and image\nclassification (e.g. neural networks). 
We show the utility of explanations via\nnovel experiments, both simulated and with human subjects, on various scenarios\nthat require trust: deciding if one should trust a prediction, choosing between\nmodels, improving an untrustworthy classifier, and identifying why a classifier\nshould not be trusted.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 16 Feb 2016 08:20:14 GMT" + }, + { + "version":"v2", + "created":"Wed, 3 Aug 2016 22:30:58 GMT" + }, + { + "version":"v3", + "created":"Tue, 9 Aug 2016 17:54:52 GMT" + } + ], + "update_date":"2016-08-10", + "authors_parsed":[ + [ + "Ribeiro", + "Marco Tulio", + "" + ], + [ + "Singh", + "Sameer", + "" + ], + [ + "Guestrin", + "Carlos", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "stat.ML" + ], + "citation_count":12782.0, + "inf_cite_count":1594.0, + "publication_date":"2016-02-16" + }, + { + "id":"1801.01290", + "submitter":"Tuomas Haarnoja", + "authors":"Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine", + "title":"Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement\n Learning with a Stochastic Actor", + "comments":"ICML 2018 Videos: sites.google.com\/view\/soft-actor-critic Code:\n github.com\/haarnoja\/sac", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Model-free deep reinforcement learning (RL) algorithms have been demonstrated\non a range of challenging decision making and control tasks. However, these\nmethods typically suffer from two major challenges: very high sample complexity\nand brittle convergence properties, which necessitate meticulous hyperparameter\ntuning. Both of these challenges severely limit the applicability of such\nmethods to complex, real-world domains. In this paper, we propose soft\nactor-critic, an off-policy actor-critic deep RL algorithm based on the maximum\nentropy reinforcement learning framework. In this framework, the actor aims to\nmaximize expected reward while also maximizing entropy. That is, to succeed at\nthe task while acting as randomly as possible. Prior deep RL methods based on\nthis framework have been formulated as Q-learning methods. 
By combining\noff-policy updates with a stable stochastic actor-critic formulation, our\nmethod achieves state-of-the-art performance on a range of continuous control\nbenchmark tasks, outperforming prior on-policy and off-policy methods.\nFurthermore, we demonstrate that, in contrast to other off-policy algorithms,\nour approach is very stable, achieving very similar performance across\ndifferent random seeds.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 4 Jan 2018 09:50:50 GMT" + }, + { + "version":"v2", + "created":"Wed, 8 Aug 2018 21:27:08 GMT" + } + ], + "update_date":"2018-08-10", + "authors_parsed":[ + [ + "Haarnoja", + "Tuomas", + "" + ], + [ + "Zhou", + "Aurick", + "" + ], + [ + "Abbeel", + "Pieter", + "" + ], + [ + "Levine", + "Sergey", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "stat.ML" + ], + "citation_count":5846.0, + "inf_cite_count":1385.0, + "publication_date":"2018-01-04" + }, + { + "id":"1502.01852", + "submitter":"Kaiming He", + "authors":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun", + "title":"Delving Deep into Rectifiers: Surpassing Human-Level Performance on\n ImageNet Classification", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Rectified activation units (rectifiers) are essential for state-of-the-art\nneural networks. In this work, we study rectifier neural networks for image\nclassification from two aspects. First, we propose a Parametric Rectified\nLinear Unit (PReLU) that generalizes the traditional rectified unit. PReLU\nimproves model fitting with nearly zero extra computational cost and little\noverfitting risk. Second, we derive a robust initialization method that\nparticularly considers the rectifier nonlinearities. This method enables us to\ntrain extremely deep rectified models directly from scratch and to investigate\ndeeper or wider network architectures. Based on our PReLU networks\n(PReLU-nets), we achieve 4.94% top-5 test error on the ImageNet 2012\nclassification dataset. This is a 26% relative improvement over the ILSVRC 2014\nwinner (GoogLeNet, 6.66%). To our knowledge, our result is the first to surpass\nhuman-level performance (5.1%, Russakovsky et al.) on this visual recognition\nchallenge.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 6 Feb 2015 10:44:00 GMT" + } + ], + "update_date":"2015-02-09", + "authors_parsed":[ + [ + "He", + "Kaiming", + "" + ], + [ + "Zhang", + "Xiangyu", + "" + ], + [ + "Ren", + "Shaoqing", + "" + ], + [ + "Sun", + "Jian", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG" + ], + "citation_count":16565.0, + "inf_cite_count":1178.0, + "publication_date":"2015-02-06" + }, + { + "id":"1405.4053", + "submitter":"Quoc Le", + "authors":"Quoc V. Le and Tomas Mikolov", + "title":"Distributed Representations of Sentences and Documents", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Many machine learning algorithms require the input to be represented as a\nfixed-length feature vector. When it comes to texts, one of the most common\nfixed-length features is bag-of-words. Despite their popularity, bag-of-words\nfeatures have two major weaknesses: they lose the ordering of the words and\nthey also ignore semantics of the words. 
For example, \"powerful,\" \"strong\" and\n\"Paris\" are equally distant. In this paper, we propose Paragraph Vector, an\nunsupervised algorithm that learns fixed-length feature representations from\nvariable-length pieces of texts, such as sentences, paragraphs, and documents.\nOur algorithm represents each document by a dense vector which is trained to\npredict words in the document. Its construction gives our algorithm the\npotential to overcome the weaknesses of bag-of-words models. Empirical results\nshow that Paragraph Vectors outperform bag-of-words models as well as other\ntechniques for text representations. Finally, we achieve new state-of-the-art\nresults on several text classification and sentiment analysis tasks.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 16 May 2014 07:12:16 GMT" + }, + { + "version":"v2", + "created":"Thu, 22 May 2014 23:23:19 GMT" + } + ], + "update_date":"2014-05-26", + "authors_parsed":[ + [ + "Le", + "Quoc V.", + "" + ], + [ + "Mikolov", + "Tomas", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI", + "cs.LG" + ], + "citation_count":8607.0, + "inf_cite_count":1049.0, + "publication_date":"2014-05-16" + }, + { + "id":"1411.1784", + "submitter":"Mehdi Mirza", + "authors":"Mehdi Mirza, Simon Osindero", + "title":"Conditional Generative Adversarial Nets", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.CV stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Generative Adversarial Nets [8] were recently introduced as a novel way to\ntrain generative models. In this work we introduce the conditional version of\ngenerative adversarial nets, which can be constructed by simply feeding the\ndata, y, we wish to condition on to both the generator and discriminator. We\nshow that this model can generate MNIST digits conditioned on class labels. We\nalso illustrate how this model could be used to learn a multi-modal model, and\nprovide preliminary examples of an application to image tagging in which we\ndemonstrate how this approach can generate descriptive tags which are not part\nof training labels.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 6 Nov 2014 22:33:22 GMT" + } + ], + "update_date":"2014-11-10", + "authors_parsed":[ + [ + "Mirza", + "Mehdi", + "" + ], + [ + "Osindero", + "Simon", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.CV", + "stat.ML" + ], + "citation_count":8992.0, + "inf_cite_count":1036.0, + "publication_date":"2014-11-06" + }, + { + "id":"1512.03012", + "submitter":"Manolis Savva", + "authors":"Angel X. Chang, Thomas Funkhouser, Leonidas Guibas, Pat Hanrahan,\n Qixing Huang, Zimo Li, Silvio Savarese, Manolis Savva, Shuran Song, Hao Su,\n Jianxiong Xiao, Li Yi, and Fisher Yu", + "title":"ShapeNet: An Information-Rich 3D Model Repository", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.GR cs.AI cs.CG cs.CV cs.RO", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We present ShapeNet: a richly-annotated, large-scale repository of shapes\nrepresented by 3D CAD models of objects. 
ShapeNet contains 3D models from a\nmultitude of semantic categories and organizes them under the WordNet taxonomy.\nIt is a collection of datasets providing many semantic annotations for each 3D\nmodel such as consistent rigid alignments, parts and bilateral symmetry planes,\nphysical sizes, keywords, as well as other planned annotations. Annotations are\nmade available through a public web-based interface to enable data\nvisualization of object attributes, promote data-driven geometric analysis, and\nprovide a large-scale quantitative benchmark for research in computer graphics\nand vision. At the time of this technical report, ShapeNet has indexed more\nthan 3,000,000 models, 220,000 models out of which are classified into 3,135\ncategories (WordNet synsets). In this report we describe the ShapeNet effort as\na whole, provide details for all currently available datasets, and summarize\nfuture plans.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 9 Dec 2015 19:42:48 GMT" + } + ], + "update_date":"2015-12-10", + "authors_parsed":[ + [ + "Chang", + "Angel X.", + "" + ], + [ + "Funkhouser", + "Thomas", + "" + ], + [ + "Guibas", + "Leonidas", + "" + ], + [ + "Hanrahan", + "Pat", + "" + ], + [ + "Huang", + "Qixing", + "" + ], + [ + "Li", + "Zimo", + "" + ], + [ + "Savarese", + "Silvio", + "" + ], + [ + "Savva", + "Manolis", + "" + ], + [ + "Song", + "Shuran", + "" + ], + [ + "Su", + "Hao", + "" + ], + [ + "Xiao", + "Jianxiong", + "" + ], + [ + "Yi", + "Li", + "" + ], + [ + "Yu", + "Fisher", + "" + ] + ], + "categories_split":[ + "cs.GR", + "cs.AI", + "cs.CG", + "cs.CV", + "cs.RO" + ], + "citation_count":4374.0, + "inf_cite_count":969.0, + "publication_date":"2015-12-09" + }, + { + "id":"1612.00796", + "submitter":"Raia Hadsell", + "authors":"James Kirkpatrick, Razvan Pascanu, Neil Rabinowitz, Joel Veness,\n Guillaume Desjardins, Andrei A. Rusu, Kieran Milan, John Quan, Tiago Ramalho,\n Agnieszka Grabska-Barwinska, Demis Hassabis, Claudia Clopath, Dharshan\n Kumaran, Raia Hadsell", + "title":"Overcoming catastrophic forgetting in neural networks", + "comments":null, + "journal-ref":null, + "doi":"10.1073\/pnas.1611835114", + "report-no":null, + "categories":"cs.LG cs.AI stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" The ability to learn tasks in a sequential fashion is crucial to the\ndevelopment of artificial intelligence. Neural networks are not, in general,\ncapable of this and it has been widely thought that catastrophic forgetting is\nan inevitable feature of connectionist models. We show that it is possible to\novercome this limitation and train networks that can maintain expertise on\ntasks which they have not experienced for a long time. Our approach remembers\nold tasks by selectively slowing down learning on the weights important for\nthose tasks. 
We demonstrate our approach is scalable and effective by solving a\nset of classification tasks based on the MNIST hand written digit dataset and\nby learning several Atari 2600 games sequentially.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 2 Dec 2016 19:18:37 GMT" + }, + { + "version":"v2", + "created":"Wed, 25 Jan 2017 13:01:51 GMT" + } + ], + "update_date":"2022-06-08", + "authors_parsed":[ + [ + "Kirkpatrick", + "James", + "" + ], + [ + "Pascanu", + "Razvan", + "" + ], + [ + "Rabinowitz", + "Neil", + "" + ], + [ + "Veness", + "Joel", + "" + ], + [ + "Desjardins", + "Guillaume", + "" + ], + [ + "Rusu", + "Andrei A.", + "" + ], + [ + "Milan", + "Kieran", + "" + ], + [ + "Quan", + "John", + "" + ], + [ + "Ramalho", + "Tiago", + "" + ], + [ + "Grabska-Barwinska", + "Agnieszka", + "" + ], + [ + "Hassabis", + "Demis", + "" + ], + [ + "Clopath", + "Claudia", + "" + ], + [ + "Kumaran", + "Dharshan", + "" + ], + [ + "Hadsell", + "Raia", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "stat.ML" + ], + "citation_count":5280.0, + "inf_cite_count":943.0, + "publication_date":"2016-12-02" + }, + { + "id":"1806.07366", + "submitter":"David Duvenaud", + "authors":"Ricky T. Q. Chen, Yulia Rubanova, Jesse Bettencourt, David Duvenaud", + "title":"Neural Ordinary Differential Equations", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We introduce a new family of deep neural network models. Instead of\nspecifying a discrete sequence of hidden layers, we parameterize the derivative\nof the hidden state using a neural network. The output of the network is\ncomputed using a black-box differential equation solver. These continuous-depth\nmodels have constant memory cost, adapt their evaluation strategy to each\ninput, and can explicitly trade numerical precision for speed. We demonstrate\nthese properties in continuous-depth residual networks and continuous-time\nlatent variable models. We also construct continuous normalizing flows, a\ngenerative model that can train by maximum likelihood, without partitioning or\nordering the data dimensions. For training, we show how to scalably\nbackpropagate through any ODE solver, without access to its internal\noperations. This allows end-to-end training of ODEs within larger models.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 19 Jun 2018 17:50:12 GMT" + }, + { + "version":"v2", + "created":"Wed, 3 Oct 2018 00:13:07 GMT" + }, + { + "version":"v3", + "created":"Mon, 22 Oct 2018 22:06:50 GMT" + }, + { + "version":"v4", + "created":"Tue, 15 Jan 2019 01:56:48 GMT" + }, + { + "version":"v5", + "created":"Sat, 14 Dec 2019 02:01:18 GMT" + } + ], + "update_date":"2019-12-17", + "authors_parsed":[ + [ + "Chen", + "Ricky T. 
Q.", + "" + ], + [ + "Rubanova", + "Yulia", + "" + ], + [ + "Bettencourt", + "Jesse", + "" + ], + [ + "Duvenaud", + "David", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "stat.ML" + ], + "citation_count":3625.0, + "inf_cite_count":859.0, + "publication_date":"2018-06-19" + }, + { + "id":"1909.11942", + "submitter":"Zhenzhong Lan", + "authors":"Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush\n Sharma, Radu Soricut", + "title":"ALBERT: A Lite BERT for Self-supervised Learning of Language\n Representations", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Increasing model size when pretraining natural language representations often\nresults in improved performance on downstream tasks. However, at some point\nfurther model increases become harder due to GPU\/TPU memory limitations and\nlonger training times. To address these problems, we present two\nparameter-reduction techniques to lower memory consumption and increase the\ntraining speed of BERT. Comprehensive empirical evidence shows that our\nproposed methods lead to models that scale much better compared to the original\nBERT. We also use a self-supervised loss that focuses on modeling\ninter-sentence coherence, and show it consistently helps downstream tasks with\nmulti-sentence inputs. As a result, our best model establishes new\nstate-of-the-art results on the GLUE, RACE, and \\squad benchmarks while having\nfewer parameters compared to BERT-large. The code and the pretrained models are\navailable at https:\/\/github.com\/google-research\/ALBERT.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 26 Sep 2019 07:06:13 GMT" + }, + { + "version":"v2", + "created":"Wed, 23 Oct 2019 03:22:00 GMT" + }, + { + "version":"v3", + "created":"Wed, 30 Oct 2019 02:19:07 GMT" + }, + { + "version":"v4", + "created":"Fri, 10 Jan 2020 19:00:02 GMT" + }, + { + "version":"v5", + "created":"Mon, 3 Feb 2020 04:01:33 GMT" + }, + { + "version":"v6", + "created":"Sun, 9 Feb 2020 03:00:18 GMT" + } + ], + "update_date":"2020-02-11", + "authors_parsed":[ + [ + "Lan", + "Zhenzhong", + "" + ], + [ + "Chen", + "Mingda", + "" + ], + [ + "Goodman", + "Sebastian", + "" + ], + [ + "Gimpel", + "Kevin", + "" + ], + [ + "Sharma", + "Piyush", + "" + ], + [ + "Soricut", + "Radu", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI" + ], + "citation_count":5226.0, + "inf_cite_count":851.0, + "publication_date":"2019-09-26" + }, + { + "id":"2203.02155", + "submitter":"Jan Leike", + "authors":"Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright,\n Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John\n Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda\n Askell, Peter Welinder, Paul Christiano, Jan Leike, Ryan Lowe", + "title":"Training language models to follow instructions with human feedback", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Making language models bigger does not inherently make them better at\nfollowing a user's intent. For example, large language models can generate\noutputs that are untruthful, toxic, or simply not helpful to the user. In other\nwords, these models are not aligned with their users. 
In this paper, we show an\navenue for aligning language models with user intent on a wide range of tasks\nby fine-tuning with human feedback. Starting with a set of labeler-written\nprompts and prompts submitted through the OpenAI API, we collect a dataset of\nlabeler demonstrations of the desired model behavior, which we use to fine-tune\nGPT-3 using supervised learning. We then collect a dataset of rankings of model\noutputs, which we use to further fine-tune this supervised model using\nreinforcement learning from human feedback. We call the resulting models\nInstructGPT. In human evaluations on our prompt distribution, outputs from the\n1.3B parameter InstructGPT model are preferred to outputs from the 175B GPT-3,\ndespite having 100x fewer parameters. Moreover, InstructGPT models show\nimprovements in truthfulness and reductions in toxic output generation while\nhaving minimal performance regressions on public NLP datasets. Even though\nInstructGPT still makes simple mistakes, our results show that fine-tuning with\nhuman feedback is a promising direction for aligning language models with human\nintent.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 4 Mar 2022 07:04:42 GMT" + } + ], + "update_date":"2022-03-07", + "authors_parsed":[ + [ + "Ouyang", + "Long", + "" + ], + [ + "Wu", + "Jeff", + "" + ], + [ + "Jiang", + "Xu", + "" + ], + [ + "Almeida", + "Diogo", + "" + ], + [ + "Wainwright", + "Carroll L.", + "" + ], + [ + "Mishkin", + "Pamela", + "" + ], + [ + "Zhang", + "Chong", + "" + ], + [ + "Agarwal", + "Sandhini", + "" + ], + [ + "Slama", + "Katarina", + "" + ], + [ + "Ray", + "Alex", + "" + ], + [ + "Schulman", + "John", + "" + ], + [ + "Hilton", + "Jacob", + "" + ], + [ + "Kelton", + "Fraser", + "" + ], + [ + "Miller", + "Luke", + "" + ], + [ + "Simens", + "Maddie", + "" + ], + [ + "Askell", + "Amanda", + "" + ], + [ + "Welinder", + "Peter", + "" + ], + [ + "Christiano", + "Paul", + "" + ], + [ + "Leike", + "Jan", + "" + ], + [ + "Lowe", + "Ryan", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI", + "cs.LG" + ], + "citation_count":5560.0, + "inf_cite_count":789.0, + "publication_date":"2022-03-04" + }, + { + "id":"1802.09477", + "submitter":"Scott Fujimoto", + "authors":"Scott Fujimoto, Herke van Hoof, David Meger", + "title":"Addressing Function Approximation Error in Actor-Critic Methods", + "comments":"Accepted at ICML 2018", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.AI cs.LG stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" In value-based reinforcement learning methods such as deep Q-learning,\nfunction approximation errors are known to lead to overestimated value\nestimates and suboptimal policies. We show that this problem persists in an\nactor-critic setting and propose novel mechanisms to minimize its effects on\nboth the actor and the critic. Our algorithm builds on Double Q-learning, by\ntaking the minimum value between a pair of critics to limit overestimation. We\ndraw the connection between target networks and overestimation bias, and\nsuggest delaying policy updates to reduce per-update error and further improve\nperformance. 
We evaluate our method on the suite of OpenAI gym tasks,\noutperforming the state of the art in every environment tested.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 26 Feb 2018 17:54:49 GMT" + }, + { + "version":"v2", + "created":"Thu, 7 Jun 2018 18:21:26 GMT" + }, + { + "version":"v3", + "created":"Mon, 22 Oct 2018 17:37:07 GMT" + } + ], + "update_date":"2018-10-23", + "authors_parsed":[ + [ + "Fujimoto", + "Scott", + "" + ], + [ + "van Hoof", + "Herke", + "" + ], + [ + "Meger", + "David", + "" + ] + ], + "categories_split":[ + "cs.AI", + "cs.LG", + "stat.ML" + ], + "citation_count":3627.0, + "inf_cite_count":780.0, + "publication_date":"2018-02-26" + }, + { + "id":"1602.07360", + "submitter":"Forrest Iandola", + "authors":"Forrest N. Iandola, Song Han, Matthew W. Moskewicz, Khalid Ashraf,\n William J. Dally, Kurt Keutzer", + "title":"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB\n model size", + "comments":"In ICLR Format", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Recent research on deep neural networks has focused primarily on improving\naccuracy. For a given accuracy level, it is typically possible to identify\nmultiple DNN architectures that achieve that accuracy level. With equivalent\naccuracy, smaller DNN architectures offer at least three advantages: (1)\nSmaller DNNs require less communication across servers during distributed\ntraining. (2) Smaller DNNs require less bandwidth to export a new model from\nthe cloud to an autonomous car. (3) Smaller DNNs are more feasible to deploy on\nFPGAs and other hardware with limited memory. To provide all of these\nadvantages, we propose a small DNN architecture called SqueezeNet. SqueezeNet\nachieves AlexNet-level accuracy on ImageNet with 50x fewer parameters.\nAdditionally, with model compression techniques we are able to compress\nSqueezeNet to less than 0.5MB (510x smaller than AlexNet).\n The SqueezeNet architecture is available for download here:\nhttps:\/\/github.com\/DeepScale\/SqueezeNet\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 24 Feb 2016 00:09:45 GMT" + }, + { + "version":"v2", + "created":"Sat, 27 Feb 2016 20:24:20 GMT" + }, + { + "version":"v3", + "created":"Wed, 6 Apr 2016 07:21:49 GMT" + }, + { + "version":"v4", + "created":"Fri, 4 Nov 2016 21:26:08 GMT" + } + ], + "update_date":"2016-11-08", + "authors_parsed":[ + [ + "Iandola", + "Forrest N.", + "" + ], + [ + "Han", + "Song", + "" + ], + [ + "Moskewicz", + "Matthew W.", + "" + ], + [ + "Ashraf", + "Khalid", + "" + ], + [ + "Dally", + "William J.", + "" + ], + [ + "Keutzer", + "Kurt", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI" + ], + "citation_count":6503.0, + "inf_cite_count":754.0, + "publication_date":"2016-02-24" + }, + { + "id":"1703.06103", + "submitter":"Thomas Kipf", + "authors":"Michael Schlichtkrull, Thomas N. Kipf, Peter Bloem, Rianne van den\n Berg, Ivan Titov, Max Welling", + "title":"Modeling Relational Data with Graph Convolutional Networks", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"stat.ML cs.AI cs.DB cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Knowledge graphs enable a wide variety of applications, including question\nanswering and information retrieval. 
Despite the great effort invested in their\ncreation and maintenance, even the largest (e.g., Yago, DBPedia or Wikidata)\nremain incomplete. We introduce Relational Graph Convolutional Networks\n(R-GCNs) and apply them to two standard knowledge base completion tasks: Link\nprediction (recovery of missing facts, i.e. subject-predicate-object triples)\nand entity classification (recovery of missing entity attributes). R-GCNs are\nrelated to a recent class of neural networks operating on graphs, and are\ndeveloped specifically to deal with the highly multi-relational data\ncharacteristic of realistic knowledge bases. We demonstrate the effectiveness\nof R-GCNs as a stand-alone model for entity classification. We further show\nthat factorization models for link prediction such as DistMult can be\nsignificantly improved by enriching them with an encoder model to accumulate\nevidence over multiple inference steps in the relational graph, demonstrating a\nlarge improvement of 29.8% on FB15k-237 over a decoder-only baseline.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 17 Mar 2017 17:09:14 GMT" + }, + { + "version":"v2", + "created":"Thu, 30 Mar 2017 13:43:41 GMT" + }, + { + "version":"v3", + "created":"Tue, 6 Jun 2017 15:49:12 GMT" + }, + { + "version":"v4", + "created":"Thu, 26 Oct 2017 19:53:49 GMT" + } + ], + "update_date":"2017-10-30", + "authors_parsed":[ + [ + "Schlichtkrull", + "Michael", + "" + ], + [ + "Kipf", + "Thomas N.", + "" + ], + [ + "Bloem", + "Peter", + "" + ], + [ + "Berg", + "Rianne van den", + "" + ], + [ + "Titov", + "Ivan", + "" + ], + [ + "Welling", + "Max", + "" + ] + ], + "categories_split":[ + "stat.ML", + "cs.AI", + "cs.DB", + "cs.LG" + ], + "citation_count":3735.0, + "inf_cite_count":679.0, + "publication_date":"2017-03-17" + }, + { + "id":"1706.02275", + "submitter":"Ryan Lowe T.", + "authors":"Ryan Lowe, Yi Wu, Aviv Tamar, Jean Harb, Pieter Abbeel, Igor Mordatch", + "title":"Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.NE", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We explore deep reinforcement learning methods for multi-agent domains. We\nbegin by analyzing the difficulty of traditional algorithms in the multi-agent\ncase: Q-learning is challenged by an inherent non-stationarity of the\nenvironment, while policy gradient suffers from a variance that increases as\nthe number of agents grows. We then present an adaptation of actor-critic\nmethods that considers action policies of other agents and is able to\nsuccessfully learn policies that require complex multi-agent coordination.\nAdditionally, we introduce a training regimen utilizing an ensemble of policies\nfor each agent that leads to more robust multi-agent policies. 
We show the\nstrength of our approach compared to existing methods in cooperative as well as\ncompetitive scenarios, where agent populations are able to discover various\nphysical and informational coordination strategies.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 7 Jun 2017 17:35:00 GMT" + }, + { + "version":"v2", + "created":"Wed, 21 Jun 2017 22:18:54 GMT" + }, + { + "version":"v3", + "created":"Tue, 16 Jan 2018 23:37:25 GMT" + }, + { + "version":"v4", + "created":"Sat, 14 Mar 2020 20:33:00 GMT" + } + ], + "update_date":"2020-03-17", + "authors_parsed":[ + [ + "Lowe", + "Ryan", + "" + ], + [ + "Wu", + "Yi", + "" + ], + [ + "Tamar", + "Aviv", + "" + ], + [ + "Harb", + "Jean", + "" + ], + [ + "Abbeel", + "Pieter", + "" + ], + [ + "Mordatch", + "Igor", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.NE" + ], + "citation_count":3337.0, + "inf_cite_count":677.0, + "publication_date":"2017-06-07" + }, + { + "id":"1210.5644", + "submitter":"Philipp Kr\\\"ahenb\\\"uhl", + "authors":"Philipp Kr\\\"ahenb\\\"uhl and Vladlen Koltun", + "title":"Efficient Inference in Fully Connected CRFs with Gaussian Edge\n Potentials", + "comments":"NIPS 2011", + "journal-ref":"Advances in Neural Information Processing Systems 24 (2011)\n 109-117", + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Most state-of-the-art techniques for multi-class image segmentation and\nlabeling use conditional random fields defined over pixels or image regions.\nWhile region-level models often feature dense pairwise connectivity,\npixel-level models are considerably larger and have only permitted sparse graph\nstructures. In this paper, we consider fully connected CRF models defined on\nthe complete set of pixels in an image. The resulting graphs have billions of\nedges, making traditional inference algorithms impractical. Our main\ncontribution is a highly efficient approximate inference algorithm for fully\nconnected CRF models in which the pairwise edge potentials are defined by a\nlinear combination of Gaussian kernels. Our experiments demonstrate that dense\nconnectivity at the pixel level substantially improves segmentation and\nlabeling accuracy.\n", + "versions":[ + { + "version":"v1", + "created":"Sat, 20 Oct 2012 17:41:23 GMT" + } + ], + "update_date":"2012-10-23", + "authors_parsed":[ + [ + "Kr\u00e4henb\u00fchl", + "Philipp", + "" + ], + [ + "Koltun", + "Vladlen", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG" + ], + "citation_count":3222.0, + "inf_cite_count":646.0, + "publication_date":"2011-12-12" + }, + { + "id":"1711.03938", + "submitter":"Alexey Dosovitskiy", + "authors":"Alexey Dosovitskiy, German Ros, Felipe Codevilla, Antonio Lopez,\n Vladlen Koltun", + "title":"CARLA: An Open Urban Driving Simulator", + "comments":"Published at the 1st Conference on Robot Learning (CoRL)", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.CV cs.RO", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We introduce CARLA, an open-source simulator for autonomous driving research.\nCARLA has been developed from the ground up to support development, training,\nand validation of autonomous urban driving systems. 
In addition to open-source\ncode and protocols, CARLA provides open digital assets (urban layouts,\nbuildings, vehicles) that were created for this purpose and can be used freely.\nThe simulation platform supports flexible specification of sensor suites and\nenvironmental conditions. We use CARLA to study the performance of three\napproaches to autonomous driving: a classic modular pipeline, an end-to-end\nmodel trained via imitation learning, and an end-to-end model trained via\nreinforcement learning. The approaches are evaluated in controlled scenarios of\nincreasing difficulty, and their performance is examined via metrics provided\nby CARLA, illustrating the platform's utility for autonomous driving research.\nThe supplementary video can be viewed at https:\/\/youtu.be\/Hp8Dz-Zek2E\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 10 Nov 2017 17:54:40 GMT" + } + ], + "update_date":"2017-11-13", + "authors_parsed":[ + [ + "Dosovitskiy", + "Alexey", + "" + ], + [ + "Ros", + "German", + "" + ], + [ + "Codevilla", + "Felipe", + "" + ], + [ + "Lopez", + "Antonio", + "" + ], + [ + "Koltun", + "Vladlen", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.CV", + "cs.RO" + ], + "citation_count":3723.0, + "inf_cite_count":631.0, + "publication_date":"2017-10-18" + }, + { + "id":"1602.07332", + "submitter":"Ranjay Krishna", + "authors":"Ranjay Krishna, Yuke Zhu, Oliver Groth, Justin Johnson, Kenji Hata,\n Joshua Kravitz, Stephanie Chen, Yannis Kalantidis, Li-Jia Li, David A.\n Shamma, Michael S. Bernstein, Fei-Fei Li", + "title":"Visual Genome: Connecting Language and Vision Using Crowdsourced Dense\n Image Annotations", + "comments":"44 pages, 37 figures", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Despite progress in perceptual tasks such as image classification, computers\nstill perform poorly on cognitive tasks such as image description and question\nanswering. Cognition is core to tasks that involve not just recognizing, but\nreasoning about our visual world. However, models used to tackle the rich\ncontent in images for cognitive tasks are still being trained using the same\ndatasets designed for perceptual tasks. To achieve success at cognitive tasks,\nmodels need to understand the interactions and relationships between objects in\nan image. When asked \"What vehicle is the person riding?\", computers will need\nto identify the objects in an image as well as the relationships riding(man,\ncarriage) and pulling(horse, carriage) in order to answer correctly that \"the\nperson is riding a horse-drawn carriage\".\n In this paper, we present the Visual Genome dataset to enable the modeling of\nsuch relationships. We collect dense annotations of objects, attributes, and\nrelationships within each image to learn these models. Specifically, our\ndataset contains over 100K images where each image has an average of 21\nobjects, 18 attributes, and 18 pairwise relationships between objects. We\ncanonicalize the objects, attributes, relationships, and noun phrases in region\ndescriptions and questions answer pairs to WordNet synsets. 
Together, these\nannotations represent the densest and largest dataset of image descriptions,\nobjects, attributes, relationships, and question answers.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 23 Feb 2016 22:00:40 GMT" + } + ], + "update_date":"2016-02-25", + "authors_parsed":[ + [ + "Krishna", + "Ranjay", + "" + ], + [ + "Zhu", + "Yuke", + "" + ], + [ + "Groth", + "Oliver", + "" + ], + [ + "Johnson", + "Justin", + "" + ], + [ + "Hata", + "Kenji", + "" + ], + [ + "Kravitz", + "Joshua", + "" + ], + [ + "Chen", + "Stephanie", + "" + ], + [ + "Kalantidis", + "Yannis", + "" + ], + [ + "Li", + "Li-Jia", + "" + ], + [ + "Shamma", + "David A.", + "" + ], + [ + "Bernstein", + "Michael S.", + "" + ], + [ + "Li", + "Fei-Fei", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI" + ], + "citation_count":4623.0, + "inf_cite_count":629.0, + "publication_date":"2016-02-23" + }, + { + "id":"1605.08803", + "submitter":"Laurent Dinh", + "authors":"Laurent Dinh, Jascha Sohl-Dickstein, Samy Bengio", + "title":"Density estimation using Real NVP", + "comments":"10 pages of main content, 3 pages of bibliography, 18 pages of\n appendix. Accepted at ICLR 2017", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.NE stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Unsupervised learning of probabilistic models is a central yet challenging\nproblem in machine learning. Specifically, designing models with tractable\nlearning, sampling, inference and evaluation is crucial in solving this task.\nWe extend the space of such models using real-valued non-volume preserving\n(real NVP) transformations, a set of powerful invertible and learnable\ntransformations, resulting in an unsupervised learning algorithm with exact\nlog-likelihood computation, exact sampling, exact inference of latent\nvariables, and an interpretable latent space. We demonstrate its ability to\nmodel natural images on four datasets through sampling, log-likelihood\nevaluation and latent variable manipulations.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 27 May 2016 21:24:32 GMT" + }, + { + "version":"v2", + "created":"Mon, 14 Nov 2016 21:37:10 GMT" + }, + { + "version":"v3", + "created":"Mon, 27 Feb 2017 23:21:10 GMT" + } + ], + "update_date":"2017-03-01", + "authors_parsed":[ + [ + "Dinh", + "Laurent", + "" + ], + [ + "Sohl-Dickstein", + "Jascha", + "" + ], + [ + "Bengio", + "Samy", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.NE", + "stat.ML" + ], + "citation_count":2988.0, + "inf_cite_count":588.0, + "publication_date":"2016-05-27" + }, + { + "id":"2106.09685", + "submitter":"Edward J. Hu", + "authors":"Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi\n Li, Shean Wang, Lu Wang, Weizhu Chen", + "title":"LoRA: Low-Rank Adaptation of Large Language Models", + "comments":"Draft V2 includes better baselines, experiments on GLUE, and more on\n adapter latency", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" An important paradigm of natural language processing consists of large-scale\npre-training on general domain data and adaptation to particular tasks or\ndomains. As we pre-train larger models, full fine-tuning, which retrains all\nmodel parameters, becomes less feasible. 
Using GPT-3 175B as an example --\ndeploying independent instances of fine-tuned models, each with 175B\nparameters, is prohibitively expensive. We propose Low-Rank Adaptation, or\nLoRA, which freezes the pre-trained model weights and injects trainable rank\ndecomposition matrices into each layer of the Transformer architecture, greatly\nreducing the number of trainable parameters for downstream tasks. Compared to\nGPT-3 175B fine-tuned with Adam, LoRA can reduce the number of trainable\nparameters by 10,000 times and the GPU memory requirement by 3 times. LoRA\nperforms on-par or better than fine-tuning in model quality on RoBERTa,\nDeBERTa, GPT-2, and GPT-3, despite having fewer trainable parameters, a higher\ntraining throughput, and, unlike adapters, no additional inference latency. We\nalso provide an empirical investigation into rank-deficiency in language model\nadaptation, which sheds light on the efficacy of LoRA. We release a package\nthat facilitates the integration of LoRA with PyTorch models and provide our\nimplementations and model checkpoints for RoBERTa, DeBERTa, and GPT-2 at\nhttps:\/\/github.com\/microsoft\/LoRA.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 17 Jun 2021 17:37:18 GMT" + }, + { + "version":"v2", + "created":"Sat, 16 Oct 2021 18:40:34 GMT" + } + ], + "update_date":"2021-10-19", + "authors_parsed":[ + [ + "Hu", + "Edward J.", + "" + ], + [ + "Shen", + "Yelong", + "" + ], + [ + "Wallis", + "Phillip", + "" + ], + [ + "Allen-Zhu", + "Zeyuan", + "" + ], + [ + "Li", + "Yuanzhi", + "" + ], + [ + "Wang", + "Shean", + "" + ], + [ + "Wang", + "Lu", + "" + ], + [ + "Chen", + "Weizhu", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI", + "cs.LG" + ], + "citation_count":2948.0, + "inf_cite_count":586.0, + "publication_date":"2021-06-17" + }, + { + "id":"1611.01578", + "submitter":"Quoc Le", + "authors":"Barret Zoph and Quoc V. Le", + "title":"Neural Architecture Search with Reinforcement Learning", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.NE", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Neural networks are powerful and flexible models that work well for many\ndifficult learning tasks in image, speech and natural language understanding.\nDespite their success, neural networks are still hard to design. In this paper,\nwe use a recurrent network to generate the model descriptions of neural\nnetworks and train this RNN with reinforcement learning to maximize the\nexpected accuracy of the generated architectures on a validation set. On the\nCIFAR-10 dataset, our method, starting from scratch, can design a novel network\narchitecture that rivals the best human-invented architecture in terms of test\nset accuracy. Our CIFAR-10 model achieves a test error rate of 3.65, which is\n0.09 percent better and 1.05x faster than the previous state-of-the-art model\nthat used a similar architectural scheme. On the Penn Treebank dataset, our\nmodel can compose a novel recurrent cell that outperforms the widely-used LSTM\ncell, and other state-of-the-art baselines. Our cell achieves a test set\nperplexity of 62.4 on the Penn Treebank, which is 3.6 perplexity better than\nthe previous state-of-the-art model. 
The cell can also be transferred to the\ncharacter language modeling task on PTB and achieves a state-of-the-art\nperplexity of 1.214.\n", + "versions":[ + { + "version":"v1", + "created":"Sat, 5 Nov 2016 00:41:37 GMT" + }, + { + "version":"v2", + "created":"Wed, 15 Feb 2017 05:28:05 GMT" + } + ], + "update_date":"2017-02-16", + "authors_parsed":[ + [ + "Zoph", + "Barret", + "" + ], + [ + "Le", + "Quoc V.", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.NE" + ], + "citation_count":4794.0, + "inf_cite_count":580.0, + "publication_date":"2016-11-04" + }, + { + "id":"1803.01271", + "submitter":"Shaojie Bai", + "authors":"Shaojie Bai, J. Zico Kolter, Vladlen Koltun", + "title":"An Empirical Evaluation of Generic Convolutional and Recurrent Networks\n for Sequence Modeling", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.CL", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" For most deep learning practitioners, sequence modeling is synonymous with\nrecurrent networks. Yet recent results indicate that convolutional\narchitectures can outperform recurrent networks on tasks such as audio\nsynthesis and machine translation. Given a new sequence modeling task or\ndataset, which architecture should one use? We conduct a systematic evaluation\nof generic convolutional and recurrent architectures for sequence modeling. The\nmodels are evaluated across a broad range of standard tasks that are commonly\nused to benchmark recurrent networks. Our results indicate that a simple\nconvolutional architecture outperforms canonical recurrent networks such as\nLSTMs across a diverse range of tasks and datasets, while demonstrating longer\neffective memory. We conclude that the common association between sequence\nmodeling and recurrent networks should be reconsidered, and convolutional\nnetworks should be regarded as a natural starting point for sequence modeling\ntasks. To assist related work, we have made code available at\nhttp:\/\/github.com\/locuslab\/TCN .\n", + "versions":[ + { + "version":"v1", + "created":"Sun, 4 Mar 2018 00:20:29 GMT" + }, + { + "version":"v2", + "created":"Thu, 19 Apr 2018 14:32:38 GMT" + } + ], + "update_date":"2018-04-20", + "authors_parsed":[ + [ + "Bai", + "Shaojie", + "" + ], + [ + "Kolter", + "J. 
Zico", + "" + ], + [ + "Koltun", + "Vladlen", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.CL" + ], + "citation_count":3492.0, + "inf_cite_count":541.0, + "publication_date":"2018-03-04" + }, + { + "id":"2307.09288", + "submitter":"Thomas Scialom", + "authors":"Hugo Touvron and Louis Martin and Kevin Stone and Peter Albert and\n Amjad Almahairi and Yasmine Babaei and Nikolay Bashlykov and Soumya Batra and\n Prajjwal Bhargava and Shruti Bhosale and Dan Bikel and Lukas Blecher and\n Cristian Canton Ferrer and Moya Chen and Guillem Cucurull and David Esiobu\n and Jude Fernandes and Jeremy Fu and Wenyin Fu and Brian Fuller and Cynthia\n Gao and Vedanuj Goswami and Naman Goyal and Anthony Hartshorn and Saghar\n Hosseini and Rui Hou and Hakan Inan and Marcin Kardas and Viktor Kerkez and\n Madian Khabsa and Isabel Kloumann and Artem Korenev and Punit Singh Koura and\n Marie-Anne Lachaux and Thibaut Lavril and Jenya Lee and Diana Liskovich and\n Yinghai Lu and Yuning Mao and Xavier Martinet and Todor Mihaylov and Pushkar\n Mishra and Igor Molybog and Yixin Nie and Andrew Poulton and Jeremy\n Reizenstein and Rashi Rungta and Kalyan Saladi and Alan Schelten and Ruan\n Silva and Eric Michael Smith and Ranjan Subramanian and Xiaoqing Ellen Tan\n and Binh Tang and Ross Taylor and Adina Williams and Jian Xiang Kuan and\n Puxin Xu and Zheng Yan and Iliyan Zarov and Yuchen Zhang and Angela Fan and\n Melanie Kambadur and Sharan Narang and Aurelien Rodriguez and Robert Stojnic\n and Sergey Edunov and Thomas Scialom", + "title":"Llama 2: Open Foundation and Fine-Tuned Chat Models", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" In this work, we develop and release Llama 2, a collection of pretrained and\nfine-tuned large language models (LLMs) ranging in scale from 7 billion to 70\nbillion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for\ndialogue use cases. Our models outperform open-source chat models on most\nbenchmarks we tested, and based on our human evaluations for helpfulness and\nsafety, may be a suitable substitute for closed-source models. 
We provide a\ndetailed description of our approach to fine-tuning and safety improvements of\nLlama 2-Chat in order to enable the community to build on our work and\ncontribute to the responsible development of LLMs.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 18 Jul 2023 14:31:57 GMT" + }, + { + "version":"v2", + "created":"Wed, 19 Jul 2023 17:08:59 GMT" + } + ], + "update_date":"2023-07-20", + "authors_parsed":[ + [ + "Touvron", + "Hugo", + "" + ], + [ + "Martin", + "Louis", + "" + ], + [ + "Stone", + "Kevin", + "" + ], + [ + "Albert", + "Peter", + "" + ], + [ + "Almahairi", + "Amjad", + "" + ], + [ + "Babaei", + "Yasmine", + "" + ], + [ + "Bashlykov", + "Nikolay", + "" + ], + [ + "Batra", + "Soumya", + "" + ], + [ + "Bhargava", + "Prajjwal", + "" + ], + [ + "Bhosale", + "Shruti", + "" + ], + [ + "Bikel", + "Dan", + "" + ], + [ + "Blecher", + "Lukas", + "" + ], + [ + "Ferrer", + "Cristian Canton", + "" + ], + [ + "Chen", + "Moya", + "" + ], + [ + "Cucurull", + "Guillem", + "" + ], + [ + "Esiobu", + "David", + "" + ], + [ + "Fernandes", + "Jude", + "" + ], + [ + "Fu", + "Jeremy", + "" + ], + [ + "Fu", + "Wenyin", + "" + ], + [ + "Fuller", + "Brian", + "" + ], + [ + "Gao", + "Cynthia", + "" + ], + [ + "Goswami", + "Vedanuj", + "" + ], + [ + "Goyal", + "Naman", + "" + ], + [ + "Hartshorn", + "Anthony", + "" + ], + [ + "Hosseini", + "Saghar", + "" + ], + [ + "Hou", + "Rui", + "" + ], + [ + "Inan", + "Hakan", + "" + ], + [ + "Kardas", + "Marcin", + "" + ], + [ + "Kerkez", + "Viktor", + "" + ], + [ + "Khabsa", + "Madian", + "" + ], + [ + "Kloumann", + "Isabel", + "" + ], + [ + "Korenev", + "Artem", + "" + ], + [ + "Koura", + "Punit Singh", + "" + ], + [ + "Lachaux", + "Marie-Anne", + "" + ], + [ + "Lavril", + "Thibaut", + "" + ], + [ + "Lee", + "Jenya", + "" + ], + [ + "Liskovich", + "Diana", + "" + ], + [ + "Lu", + "Yinghai", + "" + ], + [ + "Mao", + "Yuning", + "" + ], + [ + "Martinet", + "Xavier", + "" + ], + [ + "Mihaylov", + "Todor", + "" + ], + [ + "Mishra", + "Pushkar", + "" + ], + [ + "Molybog", + "Igor", + "" + ], + [ + "Nie", + "Yixin", + "" + ], + [ + "Poulton", + "Andrew", + "" + ], + [ + "Reizenstein", + "Jeremy", + "" + ], + [ + "Rungta", + "Rashi", + "" + ], + [ + "Saladi", + "Kalyan", + "" + ], + [ + "Schelten", + "Alan", + "" + ], + [ + "Silva", + "Ruan", + "" + ], + [ + "Smith", + "Eric Michael", + "" + ], + [ + "Subramanian", + "Ranjan", + "" + ], + [ + "Tan", + "Xiaoqing Ellen", + "" + ], + [ + "Tang", + "Binh", + "" + ], + [ + "Taylor", + "Ross", + "" + ], + [ + "Williams", + "Adina", + "" + ], + [ + "Kuan", + "Jian Xiang", + "" + ], + [ + "Xu", + "Puxin", + "" + ], + [ + "Yan", + "Zheng", + "" + ], + [ + "Zarov", + "Iliyan", + "" + ], + [ + "Zhang", + "Yuchen", + "" + ], + [ + "Fan", + "Angela", + "" + ], + [ + "Kambadur", + "Melanie", + "" + ], + [ + "Narang", + "Sharan", + "" + ], + [ + "Rodriguez", + "Aurelien", + "" + ], + [ + "Stojnic", + "Robert", + "" + ], + [ + "Edunov", + "Sergey", + "" + ], + [ + "Scialom", + "Thomas", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI" + ], + "citation_count":3270.0, + "inf_cite_count":524.0, + "publication_date":"2023-07-18" + }, + { + "id":"2105.05233", + "submitter":"Prafulla Dhariwal", + "authors":"Prafulla Dhariwal, Alex Nichol", + "title":"Diffusion Models Beat GANs on Image Synthesis", + "comments":"Added compute requirements, ImageNet 256$\\times$256 upsampling FID\n and samples, DDIM guided sampler, fixed typos", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI 
cs.CV stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We show that diffusion models can achieve image sample quality superior to\nthe current state-of-the-art generative models. We achieve this on\nunconditional image synthesis by finding a better architecture through a series\nof ablations. For conditional image synthesis, we further improve sample\nquality with classifier guidance: a simple, compute-efficient method for\ntrading off diversity for fidelity using gradients from a classifier. We\nachieve an FID of 2.97 on ImageNet 128$\\times$128, 4.59 on ImageNet\n256$\\times$256, and 7.72 on ImageNet 512$\\times$512, and we match BigGAN-deep\neven with as few as 25 forward passes per sample, all while maintaining better\ncoverage of the distribution. Finally, we find that classifier guidance\ncombines well with upsampling diffusion models, further improving FID to 3.94\non ImageNet 256$\\times$256 and 3.85 on ImageNet 512$\\times$512. We release our\ncode at https:\/\/github.com\/openai\/guided-diffusion\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 11 May 2021 17:50:24 GMT" + }, + { + "version":"v2", + "created":"Wed, 12 May 2021 17:57:59 GMT" + }, + { + "version":"v3", + "created":"Thu, 13 May 2021 17:57:08 GMT" + }, + { + "version":"v4", + "created":"Tue, 1 Jun 2021 17:49:49 GMT" + } + ], + "update_date":"2021-06-02", + "authors_parsed":[ + [ + "Dhariwal", + "Prafulla", + "" + ], + [ + "Nichol", + "Alex", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.CV", + "stat.ML" + ], + "citation_count":3593.0, + "inf_cite_count":520.0, + "publication_date":"2021-05-11" + }, + { + "id":"1606.06357", + "submitter":"Th\\'eo Trouillon", + "authors":"Th\\'eo Trouillon, Johannes Welbl, Sebastian Riedel, \\'Eric Gaussier,\n Guillaume Bouchard", + "title":"Complex Embeddings for Simple Link Prediction", + "comments":"10+2 pages, accepted at ICML 2016", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.AI cs.LG stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" In statistical relational learning, the link prediction problem is key to\nautomatically understand the structure of large knowledge bases. As in previous\nstudies, we propose to solve this problem through latent factorization.\nHowever, here we make use of complex valued embeddings. The composition of\ncomplex embeddings can handle a large variety of binary relations, among them\nsymmetric and antisymmetric relations. Compared to state-of-the-art models such\nas Neural Tensor Network and Holographic Embeddings, our approach based on\ncomplex embeddings is arguably simpler, as it only uses the Hermitian dot\nproduct, the complex counterpart of the standard dot product between real\nvectors. 
Our approach is scalable to large datasets as it remains linear in\nboth space and time, while consistently outperforming alternative approaches on\nstandard link prediction benchmarks.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 20 Jun 2016 22:52:48 GMT" + } + ], + "update_date":"2016-06-22", + "authors_parsed":[ + [ + "Trouillon", + "Th\u00e9o", + "" + ], + [ + "Welbl", + "Johannes", + "" + ], + [ + "Riedel", + "Sebastian", + "" + ], + [ + "Gaussier", + "\u00c9ric", + "" + ], + [ + "Bouchard", + "Guillaume", + "" + ] + ], + "categories_split":[ + "cs.AI", + "cs.LG", + "stat.ML" + ], + "citation_count":2330.0, + "inf_cite_count":518.0, + "publication_date":"2016-06-19" + }, + { + "id":"2303.08774", + "submitter":"Adrien Ecoffet", + "authors":"OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge\n Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam\n Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie\n Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan\n Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny\n Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim\n Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew\n Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che\n Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark\n Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings,\n Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch,\n Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien\n Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix,\n Sim\\'on Posada Fishman, Juston Forte, Isabella Fulford, Leo Gao, Elie\n Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha\n Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene,\n Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff\n Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey,\n Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin\n Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang,\n Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun,\n Tomer Kaftan, {\\L}ukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish\n Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim,\n Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel\n Kokotajlo, {\\L}ukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle\n Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan\n Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin,\n Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan Lowe, Patricia Lue, Anna\n Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca\n Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney,\n Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta,\n Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco,\n Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David M\\'ely,\n Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo,\n Hyeonwoo Noh, Long Ouyang, Cullen O'Keefe, Jakub Pachocki, Alex Paino, Joe\n Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy\n Parparita, Alex Passos, Mikhail 
Pavlov, Andrew Peng, Adam Perelman, Filipe de\n Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto,\n Michael (Rai) Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell,\n Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack\n Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross,\n Bob Rotsted, Henri Roussez, Nick Ryder, Mario Saltarelli, Ted Sanders,\n Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John\n Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah\n Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan\n Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie\n Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang,\n Nikolas Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian,\n Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe\n Cer\\'on Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll\n Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei,\n CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt\n Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren\n Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin\n Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang,\n Shengjia Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, Barret Zoph", + "title":"GPT-4 Technical Report", + "comments":"100 pages; updated authors list; fixed author names and added\n citation", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We report the development of GPT-4, a large-scale, multimodal model which can\naccept image and text inputs and produce text outputs. While less capable than\nhumans in many real-world scenarios, GPT-4 exhibits human-level performance on\nvarious professional and academic benchmarks, including passing a simulated bar\nexam with a score around the top 10% of test takers. GPT-4 is a\nTransformer-based model pre-trained to predict the next token in a document.\nThe post-training alignment process results in improved performance on measures\nof factuality and adherence to desired behavior. A core component of this\nproject was developing infrastructure and optimization methods that behave\npredictably across a wide range of scales. 
This allowed us to accurately\npredict some aspects of GPT-4's performance based on models trained with no\nmore than 1\/1,000th the compute of GPT-4.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 15 Mar 2023 17:15:04 GMT" + }, + { + "version":"v2", + "created":"Thu, 16 Mar 2023 04:59:24 GMT" + }, + { + "version":"v3", + "created":"Mon, 27 Mar 2023 17:46:54 GMT" + }, + { + "version":"v4", + "created":"Tue, 19 Dec 2023 00:34:40 GMT" + }, + { + "version":"v5", + "created":"Fri, 1 Mar 2024 16:30:27 GMT" + }, + { + "version":"v6", + "created":"Mon, 4 Mar 2024 06:01:33 GMT" + } + ], + "update_date":"2024-03-11", + "authors_parsed":[ + [ + "OpenAI", + "", + "", + "Rai" + ], + [ + "Achiam", + "Josh", + "", + "Rai" + ], + [ + "Adler", + "Steven", + "", + "Rai" + ], + [ + "Agarwal", + "Sandhini", + "", + "Rai" + ], + [ + "Ahmad", + "Lama", + "", + "Rai" + ], + [ + "Akkaya", + "Ilge", + "", + "Rai" + ], + [ + "Aleman", + "Florencia Leoni", + "", + "Rai" + ], + [ + "Almeida", + "Diogo", + "", + "Rai" + ], + [ + "Altenschmidt", + "Janko", + "", + "Rai" + ], + [ + "Altman", + "Sam", + "", + "Rai" + ], + [ + "Anadkat", + "Shyamal", + "", + "Rai" + ], + [ + "Avila", + "Red", + "", + "Rai" + ], + [ + "Babuschkin", + "Igor", + "", + "Rai" + ], + [ + "Balaji", + "Suchir", + "", + "Rai" + ], + [ + "Balcom", + "Valerie", + "", + "Rai" + ], + [ + "Baltescu", + "Paul", + "", + "Rai" + ], + [ + "Bao", + "Haiming", + "", + "Rai" + ], + [ + "Bavarian", + "Mohammad", + "", + "Rai" + ], + [ + "Belgum", + "Jeff", + "", + "Rai" + ], + [ + "Bello", + "Irwan", + "", + "Rai" + ], + [ + "Berdine", + "Jake", + "", + "Rai" + ], + [ + "Bernadett-Shapiro", + "Gabriel", + "", + "Rai" + ], + [ + "Berner", + "Christopher", + "", + "Rai" + ], + [ + "Bogdonoff", + "Lenny", + "", + "Rai" + ], + [ + "Boiko", + "Oleg", + "", + "Rai" + ], + [ + "Boyd", + "Madelaine", + "", + "Rai" + ], + [ + "Brakman", + "Anna-Luisa", + "", + "Rai" + ], + [ + "Brockman", + "Greg", + "", + "Rai" + ], + [ + "Brooks", + "Tim", + "", + "Rai" + ], + [ + "Brundage", + "Miles", + "", + "Rai" + ], + [ + "Button", + "Kevin", + "", + "Rai" + ], + [ + "Cai", + "Trevor", + "", + "Rai" + ], + [ + "Campbell", + "Rosie", + "", + "Rai" + ], + [ + "Cann", + "Andrew", + "", + "Rai" + ], + [ + "Carey", + "Brittany", + "", + "Rai" + ], + [ + "Carlson", + "Chelsea", + "", + "Rai" + ], + [ + "Carmichael", + "Rory", + "", + "Rai" + ], + [ + "Chan", + "Brooke", + "", + "Rai" + ], + [ + "Chang", + "Che", + "", + "Rai" + ], + [ + "Chantzis", + "Fotis", + "", + "Rai" + ], + [ + "Chen", + "Derek", + "", + "Rai" + ], + [ + "Chen", + "Sully", + "", + "Rai" + ], + [ + "Chen", + "Ruby", + "", + "Rai" + ], + [ + "Chen", + "Jason", + "", + "Rai" + ], + [ + "Chen", + "Mark", + "", + "Rai" + ], + [ + "Chess", + "Ben", + "", + "Rai" + ], + [ + "Cho", + "Chester", + "", + "Rai" + ], + [ + "Chu", + "Casey", + "", + "Rai" + ], + [ + "Chung", + "Hyung Won", + "", + "Rai" + ], + [ + "Cummings", + "Dave", + "", + "Rai" + ], + [ + "Currier", + "Jeremiah", + "", + "Rai" + ], + [ + "Dai", + "Yunxing", + "", + "Rai" + ], + [ + "Decareaux", + "Cory", + "", + "Rai" + ], + [ + "Degry", + "Thomas", + "", + "Rai" + ], + [ + "Deutsch", + "Noah", + "", + "Rai" + ], + [ + "Deville", + "Damien", + "", + "Rai" + ], + [ + "Dhar", + "Arka", + "", + "Rai" + ], + [ + "Dohan", + "David", + "", + "Rai" + ], + [ + "Dowling", + "Steve", + "", + "Rai" + ], + [ + "Dunning", + "Sheila", + "", + "Rai" + ], + [ + "Ecoffet", + "Adrien", + "", + "Rai" + ], + [ + "Eleti", + "Atty", + "", + "Rai" + ], + [ + 
"Eloundou", + "Tyna", + "", + "Rai" + ], + [ + "Farhi", + "David", + "", + "Rai" + ], + [ + "Fedus", + "Liam", + "", + "Rai" + ], + [ + "Felix", + "Niko", + "", + "Rai" + ], + [ + "Fishman", + "Sim\u00f3n Posada", + "", + "Rai" + ], + [ + "Forte", + "Juston", + "", + "Rai" + ], + [ + "Fulford", + "Isabella", + "", + "Rai" + ], + [ + "Gao", + "Leo", + "", + "Rai" + ], + [ + "Georges", + "Elie", + "", + "Rai" + ], + [ + "Gibson", + "Christian", + "", + "Rai" + ], + [ + "Goel", + "Vik", + "", + "Rai" + ], + [ + "Gogineni", + "Tarun", + "", + "Rai" + ], + [ + "Goh", + "Gabriel", + "", + "Rai" + ], + [ + "Gontijo-Lopes", + "Rapha", + "", + "Rai" + ], + [ + "Gordon", + "Jonathan", + "", + "Rai" + ], + [ + "Grafstein", + "Morgan", + "", + "Rai" + ], + [ + "Gray", + "Scott", + "", + "Rai" + ], + [ + "Greene", + "Ryan", + "", + "Rai" + ], + [ + "Gross", + "Joshua", + "", + "Rai" + ], + [ + "Gu", + "Shixiang Shane", + "", + "Rai" + ], + [ + "Guo", + "Yufei", + "", + "Rai" + ], + [ + "Hallacy", + "Chris", + "", + "Rai" + ], + [ + "Han", + "Jesse", + "", + "Rai" + ], + [ + "Harris", + "Jeff", + "", + "Rai" + ], + [ + "He", + "Yuchen", + "", + "Rai" + ], + [ + "Heaton", + "Mike", + "", + "Rai" + ], + [ + "Heidecke", + "Johannes", + "", + "Rai" + ], + [ + "Hesse", + "Chris", + "", + "Rai" + ], + [ + "Hickey", + "Alan", + "", + "Rai" + ], + [ + "Hickey", + "Wade", + "", + "Rai" + ], + [ + "Hoeschele", + "Peter", + "", + "Rai" + ], + [ + "Houghton", + "Brandon", + "", + "Rai" + ], + [ + "Hsu", + "Kenny", + "", + "Rai" + ], + [ + "Hu", + "Shengli", + "", + "Rai" + ], + [ + "Hu", + "Xin", + "", + "Rai" + ], + [ + "Huizinga", + "Joost", + "", + "Rai" + ], + [ + "Jain", + "Shantanu", + "", + "Rai" + ], + [ + "Jain", + "Shawn", + "", + "Rai" + ], + [ + "Jang", + "Joanne", + "", + "Rai" + ], + [ + "Jiang", + "Angela", + "", + "Rai" + ], + [ + "Jiang", + "Roger", + "", + "Rai" + ], + [ + "Jin", + "Haozhun", + "", + "Rai" + ], + [ + "Jin", + "Denny", + "", + "Rai" + ], + [ + "Jomoto", + "Shino", + "", + "Rai" + ], + [ + "Jonn", + "Billie", + "", + "Rai" + ], + [ + "Jun", + "Heewoo", + "", + "Rai" + ], + [ + "Kaftan", + "Tomer", + "", + "Rai" + ], + [ + "Kaiser", + "\u0141ukasz", + "", + "Rai" + ], + [ + "Kamali", + "Ali", + "", + "Rai" + ], + [ + "Kanitscheider", + "Ingmar", + "", + "Rai" + ], + [ + "Keskar", + "Nitish Shirish", + "", + "Rai" + ], + [ + "Khan", + "Tabarak", + "", + "Rai" + ], + [ + "Kilpatrick", + "Logan", + "", + "Rai" + ], + [ + "Kim", + "Jong Wook", + "", + "Rai" + ], + [ + "Kim", + "Christina", + "", + "Rai" + ], + [ + "Kim", + "Yongjik", + "", + "Rai" + ], + [ + "Kirchner", + "Jan Hendrik", + "", + "Rai" + ], + [ + "Kiros", + "Jamie", + "", + "Rai" + ], + [ + "Knight", + "Matt", + "", + "Rai" + ], + [ + "Kokotajlo", + "Daniel", + "", + "Rai" + ], + [ + "Kondraciuk", + "\u0141ukasz", + "", + "Rai" + ], + [ + "Kondrich", + "Andrew", + "", + "Rai" + ], + [ + "Konstantinidis", + "Aris", + "", + "Rai" + ], + [ + "Kosic", + "Kyle", + "", + "Rai" + ], + [ + "Krueger", + "Gretchen", + "", + "Rai" + ], + [ + "Kuo", + "Vishal", + "", + "Rai" + ], + [ + "Lampe", + "Michael", + "", + "Rai" + ], + [ + "Lan", + "Ikai", + "", + "Rai" + ], + [ + "Lee", + "Teddy", + "", + "Rai" + ], + [ + "Leike", + "Jan", + "", + "Rai" + ], + [ + "Leung", + "Jade", + "", + "Rai" + ], + [ + "Levy", + "Daniel", + "", + "Rai" + ], + [ + "Li", + "Chak Ming", + "", + "Rai" + ], + [ + "Lim", + "Rachel", + "", + "Rai" + ], + [ + "Lin", + "Molly", + "", + "Rai" + ], + [ + "Lin", + "Stephanie", + "", + "Rai" + ], + [ + "Litwin", + 
"Mateusz", + "", + "Rai" + ], + [ + "Lopez", + "Theresa", + "", + "Rai" + ], + [ + "Lowe", + "Ryan", + "", + "Rai" + ], + [ + "Lue", + "Patricia", + "", + "Rai" + ], + [ + "Makanju", + "Anna", + "", + "Rai" + ], + [ + "Malfacini", + "Kim", + "", + "Rai" + ], + [ + "Manning", + "Sam", + "", + "Rai" + ], + [ + "Markov", + "Todor", + "", + "Rai" + ], + [ + "Markovski", + "Yaniv", + "", + "Rai" + ], + [ + "Martin", + "Bianca", + "", + "Rai" + ], + [ + "Mayer", + "Katie", + "", + "Rai" + ], + [ + "Mayne", + "Andrew", + "", + "Rai" + ], + [ + "McGrew", + "Bob", + "", + "Rai" + ], + [ + "McKinney", + "Scott Mayer", + "", + "Rai" + ], + [ + "McLeavey", + "Christine", + "", + "Rai" + ], + [ + "McMillan", + "Paul", + "", + "Rai" + ], + [ + "McNeil", + "Jake", + "", + "Rai" + ], + [ + "Medina", + "David", + "", + "Rai" + ], + [ + "Mehta", + "Aalok", + "", + "Rai" + ], + [ + "Menick", + "Jacob", + "", + "Rai" + ], + [ + "Metz", + "Luke", + "", + "Rai" + ], + [ + "Mishchenko", + "Andrey", + "", + "Rai" + ], + [ + "Mishkin", + "Pamela", + "", + "Rai" + ], + [ + "Monaco", + "Vinnie", + "", + "Rai" + ], + [ + "Morikawa", + "Evan", + "", + "Rai" + ], + [ + "Mossing", + "Daniel", + "", + "Rai" + ], + [ + "Mu", + "Tong", + "", + "Rai" + ], + [ + "Murati", + "Mira", + "", + "Rai" + ], + [ + "Murk", + "Oleg", + "", + "Rai" + ], + [ + "M\u00e9ly", + "David", + "", + "Rai" + ], + [ + "Nair", + "Ashvin", + "", + "Rai" + ], + [ + "Nakano", + "Reiichiro", + "", + "Rai" + ], + [ + "Nayak", + "Rajeev", + "", + "Rai" + ], + [ + "Neelakantan", + "Arvind", + "", + "Rai" + ], + [ + "Ngo", + "Richard", + "", + "Rai" + ], + [ + "Noh", + "Hyeonwoo", + "", + "Rai" + ], + [ + "Ouyang", + "Long", + "", + "Rai" + ], + [ + "O'Keefe", + "Cullen", + "", + "Rai" + ], + [ + "Pachocki", + "Jakub", + "", + "Rai" + ], + [ + "Paino", + "Alex", + "", + "Rai" + ], + [ + "Palermo", + "Joe", + "", + "Rai" + ], + [ + "Pantuliano", + "Ashley", + "", + "Rai" + ], + [ + "Parascandolo", + "Giambattista", + "", + "Rai" + ], + [ + "Parish", + "Joel", + "", + "Rai" + ], + [ + "Parparita", + "Emy", + "", + "Rai" + ], + [ + "Passos", + "Alex", + "", + "Rai" + ], + [ + "Pavlov", + "Mikhail", + "", + "Rai" + ], + [ + "Peng", + "Andrew", + "", + "Rai" + ], + [ + "Perelman", + "Adam", + "", + "Rai" + ], + [ + "Peres", + "Filipe de Avila Belbute", + "", + "Rai" + ], + [ + "Petrov", + "Michael", + "", + "Rai" + ], + [ + "Pinto", + "Henrique Ponde de Oliveira", + "", + "Rai" + ], + [ + "Michael", + "", + "", + "Rai" + ], + [ + "Pokorny", + "", + "" + ], + [ + "Pokrass", + "Michelle", + "" + ], + [ + "Pong", + "Vitchyr H.", + "" + ], + [ + "Powell", + "Tolly", + "" + ], + [ + "Power", + "Alethea", + "" + ], + [ + "Power", + "Boris", + "" + ], + [ + "Proehl", + "Elizabeth", + "" + ], + [ + "Puri", + "Raul", + "" + ], + [ + "Radford", + "Alec", + "" + ], + [ + "Rae", + "Jack", + "" + ], + [ + "Ramesh", + "Aditya", + "" + ], + [ + "Raymond", + "Cameron", + "" + ], + [ + "Real", + "Francis", + "" + ], + [ + "Rimbach", + "Kendra", + "" + ], + [ + "Ross", + "Carl", + "" + ], + [ + "Rotsted", + "Bob", + "" + ], + [ + "Roussez", + "Henri", + "" + ], + [ + "Ryder", + "Nick", + "" + ], + [ + "Saltarelli", + "Mario", + "" + ], + [ + "Sanders", + "Ted", + "" + ], + [ + "Santurkar", + "Shibani", + "" + ], + [ + "Sastry", + "Girish", + "" + ], + [ + "Schmidt", + "Heather", + "" + ], + [ + "Schnurr", + "David", + "" + ], + [ + "Schulman", + "John", + "" + ], + [ + "Selsam", + "Daniel", + "" + ], + [ + "Sheppard", + "Kyla", + "" + ], + [ + "Sherbakov", + "Toki", + "" + ], + 
[ + "Shieh", + "Jessica", + "" + ], + [ + "Shoker", + "Sarah", + "" + ], + [ + "Shyam", + "Pranav", + "" + ], + [ + "Sidor", + "Szymon", + "" + ], + [ + "Sigler", + "Eric", + "" + ], + [ + "Simens", + "Maddie", + "" + ], + [ + "Sitkin", + "Jordan", + "" + ], + [ + "Slama", + "Katarina", + "" + ], + [ + "Sohl", + "Ian", + "" + ], + [ + "Sokolowsky", + "Benjamin", + "" + ], + [ + "Song", + "Yang", + "" + ], + [ + "Staudacher", + "Natalie", + "" + ], + [ + "Such", + "Felipe Petroski", + "" + ], + [ + "Summers", + "Natalie", + "" + ], + [ + "Sutskever", + "Ilya", + "" + ], + [ + "Tang", + "Jie", + "" + ], + [ + "Tezak", + "Nikolas", + "" + ], + [ + "Thompson", + "Madeleine B.", + "" + ], + [ + "Tillet", + "Phil", + "" + ], + [ + "Tootoonchian", + "Amin", + "" + ], + [ + "Tseng", + "Elizabeth", + "" + ], + [ + "Tuggle", + "Preston", + "" + ], + [ + "Turley", + "Nick", + "" + ], + [ + "Tworek", + "Jerry", + "" + ], + [ + "Uribe", + "Juan Felipe Cer\u00f3n", + "" + ], + [ + "Vallone", + "Andrea", + "" + ], + [ + "Vijayvergiya", + "Arun", + "" + ], + [ + "Voss", + "Chelsea", + "" + ], + [ + "Wainwright", + "Carroll", + "" + ], + [ + "Wang", + "Justin Jay", + "" + ], + [ + "Wang", + "Alvin", + "" + ], + [ + "Wang", + "Ben", + "" + ], + [ + "Ward", + "Jonathan", + "" + ], + [ + "Wei", + "Jason", + "" + ], + [ + "Weinmann", + "CJ", + "" + ], + [ + "Welihinda", + "Akila", + "" + ], + [ + "Welinder", + "Peter", + "" + ], + [ + "Weng", + "Jiayi", + "" + ], + [ + "Weng", + "Lilian", + "" + ], + [ + "Wiethoff", + "Matt", + "" + ], + [ + "Willner", + "Dave", + "" + ], + [ + "Winter", + "Clemens", + "" + ], + [ + "Wolrich", + "Samuel", + "" + ], + [ + "Wong", + "Hannah", + "" + ], + [ + "Workman", + "Lauren", + "" + ], + [ + "Wu", + "Sherwin", + "" + ], + [ + "Wu", + "Jeff", + "" + ], + [ + "Wu", + "Michael", + "" + ], + [ + "Xiao", + "Kai", + "" + ], + [ + "Xu", + "Tao", + "" + ], + [ + "Yoo", + "Sarah", + "" + ], + [ + "Yu", + "Kevin", + "" + ], + [ + "Yuan", + "Qiming", + "" + ], + [ + "Zaremba", + "Wojciech", + "" + ], + [ + "Zellers", + "Rowan", + "" + ], + [ + "Zhang", + "Chong", + "" + ], + [ + "Zhang", + "Marvin", + "" + ], + [ + "Zhao", + "Shengjia", + "" + ], + [ + "Zheng", + "Tianhao", + "" + ], + [ + "Zhuang", + "Juntang", + "" + ], + [ + "Zhuk", + "William", + "" + ], + [ + "Zoph", + "Barret", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI" + ], + "citation_count":3386.0, + "inf_cite_count":495.0, + "publication_date":"2023-03-15" + }, + { + "id":"1606.01540", + "submitter":"John Schulman", + "authors":"Greg Brockman, Vicki Cheung, Ludwig Pettersson, Jonas Schneider, John\n Schulman, Jie Tang, Wojciech Zaremba", + "title":"OpenAI Gym", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" OpenAI Gym is a toolkit for reinforcement learning research. It includes a\ngrowing collection of benchmark problems that expose a common interface, and a\nwebsite where people can share their results and compare the performance of\nalgorithms. 
This whitepaper discusses the components of OpenAI Gym and the\ndesign decisions that went into the software.\n", + "versions":[ + { + "version":"v1", + "created":"Sun, 5 Jun 2016 17:54:48 GMT" + } + ], + "update_date":"2016-06-07", + "authors_parsed":[ + [ + "Brockman", + "Greg", + "" + ], + [ + "Cheung", + "Vicki", + "" + ], + [ + "Pettersson", + "Ludwig", + "" + ], + [ + "Schneider", + "Jonas", + "" + ], + [ + "Schulman", + "John", + "" + ], + [ + "Tang", + "Jie", + "" + ], + [ + "Zaremba", + "Wojciech", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI" + ], + "citation_count":4412.0, + "inf_cite_count":487.0, + "publication_date":"2016-06-05" + }, + { + "id":"1606.03476", + "submitter":"Jonathan Ho", + "authors":"Jonathan Ho, Stefano Ermon", + "title":"Generative Adversarial Imitation Learning", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Consider learning a policy from example expert behavior, without interaction\nwith the expert or access to reinforcement signal. One approach is to recover\nthe expert's cost function with inverse reinforcement learning, then extract a\npolicy from that cost function with reinforcement learning. This approach is\nindirect and can be slow. We propose a new general framework for directly\nextracting a policy from data, as if it were obtained by reinforcement learning\nfollowing inverse reinforcement learning. We show that a certain instantiation\nof our framework draws an analogy between imitation learning and generative\nadversarial networks, from which we derive a model-free imitation learning\nalgorithm that obtains significant performance gains over existing model-free\nmethods in imitating complex behaviors in large, high-dimensional environments.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 10 Jun 2016 20:51:29 GMT" + } + ], + "update_date":"2016-06-14", + "authors_parsed":[ + [ + "Ho", + "Jonathan", + "" + ], + [ + "Ermon", + "Stefano", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI" + ], + "citation_count":2472.0, + "inf_cite_count":485.0, + "publication_date":"2016-06-10" + }, + { + "id":"1612.00837", + "submitter":"Yash Goyal", + "authors":"Yash Goyal, Tejas Khot, Douglas Summers-Stay, Dhruv Batra, Devi Parikh", + "title":"Making the V in VQA Matter: Elevating the Role of Image Understanding in\n Visual Question Answering", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.CL cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Problems at the intersection of vision and language are of significant\nimportance both as challenging research questions and for the rich set of\napplications they enable. However, inherent structure in our world and bias in\nour language tend to be a simpler signal for learning than visual modalities,\nresulting in models that ignore visual information, leading to an inflated\nsense of their capability.\n We propose to counter these language priors for the task of Visual Question\nAnswering (VQA) and make vision (the V in VQA) matter! Specifically, we balance\nthe popular VQA dataset by collecting complementary images such that every\nquestion in our balanced dataset is associated with not just a single image,\nbut rather a pair of similar images that result in two different answers to the\nquestion. 
Our dataset is by construction more balanced than the original VQA\ndataset and has approximately twice the number of image-question pairs. Our\ncomplete balanced dataset is publicly available at www.visualqa.org as part of\nthe 2nd iteration of the Visual Question Answering Dataset and Challenge (VQA\nv2.0).\n We further benchmark a number of state-of-art VQA models on our balanced\ndataset. All models perform significantly worse on our balanced dataset,\nsuggesting that these models have indeed learned to exploit language priors.\nThis finding provides the first concrete empirical evidence for what seems to\nbe a qualitative sense among practitioners.\n Finally, our data collection protocol for identifying complementary images\nenables us to develop a novel interpretable model, which in addition to\nproviding an answer to the given (image, question) pair, also provides a\ncounter-example based explanation. Specifically, it identifies an image that is\nsimilar to the original image, but it believes has a different answer to the\nsame question. This can help in building trust for machines among their users.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 2 Dec 2016 20:57:07 GMT" + }, + { + "version":"v2", + "created":"Fri, 14 Apr 2017 18:20:13 GMT" + }, + { + "version":"v3", + "created":"Mon, 15 May 2017 17:58:49 GMT" + } + ], + "update_date":"2017-05-16", + "authors_parsed":[ + [ + "Goyal", + "Yash", + "" + ], + [ + "Khot", + "Tejas", + "" + ], + [ + "Summers-Stay", + "Douglas", + "" + ], + [ + "Batra", + "Dhruv", + "" + ], + [ + "Parikh", + "Devi", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.CL", + "cs.LG" + ], + "citation_count":2202.0, + "inf_cite_count":473.0, + "publication_date":"2016-12-02" + }, + { + "id":"2201.11903", + "submitter":"Jason Wei", + "authors":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter,\n Fei Xia, Ed Chi, Quoc Le, Denny Zhou", + "title":"Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI", + "license":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/", + "abstract":" We explore how generating a chain of thought -- a series of intermediate\nreasoning steps -- significantly improves the ability of large language models\nto perform complex reasoning. In particular, we show how such reasoning\nabilities emerge naturally in sufficiently large language models via a simple\nmethod called chain of thought prompting, where a few chain of thought\ndemonstrations are provided as exemplars in prompting. Experiments on three\nlarge language models show that chain of thought prompting improves performance\non a range of arithmetic, commonsense, and symbolic reasoning tasks. The\nempirical gains can be striking. 
For instance, prompting a 540B-parameter\nlanguage model with just eight chain of thought exemplars achieves state of the\nart accuracy on the GSM8K benchmark of math word problems, surpassing even\nfinetuned GPT-3 with a verifier.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 28 Jan 2022 02:33:07 GMT" + }, + { + "version":"v2", + "created":"Wed, 6 Apr 2022 03:51:50 GMT" + }, + { + "version":"v3", + "created":"Wed, 1 Jun 2022 00:10:30 GMT" + }, + { + "version":"v4", + "created":"Mon, 13 Jun 2022 21:44:34 GMT" + }, + { + "version":"v5", + "created":"Mon, 10 Oct 2022 20:21:17 GMT" + }, + { + "version":"v6", + "created":"Tue, 10 Jan 2023 23:07:57 GMT" + } + ], + "update_date":"2023-01-12", + "authors_parsed":[ + [ + "Wei", + "Jason", + "" + ], + [ + "Wang", + "Xuezhi", + "" + ], + [ + "Schuurmans", + "Dale", + "" + ], + [ + "Bosma", + "Maarten", + "" + ], + [ + "Ichter", + "Brian", + "" + ], + [ + "Xia", + "Fei", + "" + ], + [ + "Chi", + "Ed", + "" + ], + [ + "Le", + "Quoc", + "" + ], + [ + "Zhou", + "Denny", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI" + ], + "citation_count":3313.0, + "inf_cite_count":468.0, + "publication_date":"2022-01-28" + }, + { + "id":"2106.07447", + "submitter":"Wei-Ning Hsu", + "authors":"Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia,\n Ruslan Salakhutdinov, Abdelrahman Mohamed", + "title":"HuBERT: Self-Supervised Speech Representation Learning by Masked\n Prediction of Hidden Units", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI cs.LG eess.AS", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Self-supervised approaches for speech representation learning are challenged\nby three unique problems: (1) there are multiple sound units in each input\nutterance, (2) there is no lexicon of input sound units during the pre-training\nphase, and (3) sound units have variable lengths with no explicit segmentation.\nTo deal with these three problems, we propose the Hidden-Unit BERT (HuBERT)\napproach for self-supervised speech representation learning, which utilizes an\noffline clustering step to provide aligned target labels for a BERT-like\nprediction loss. A key ingredient of our approach is applying the prediction\nloss over the masked regions only, which forces the model to learn a combined\nacoustic and language model over the continuous inputs. HuBERT relies primarily\non the consistency of the unsupervised clustering step rather than the\nintrinsic quality of the assigned cluster labels. Starting with a simple\nk-means teacher of 100 clusters, and using two iterations of clustering, the\nHuBERT model either matches or improves upon the state-of-the-art wav2vec 2.0\nperformance on the Librispeech (960h) and Libri-light (60,000h) benchmarks with\n10min, 1h, 10h, 100h, and 960h fine-tuning subsets. 
Using a 1B parameter model,\nHuBERT shows up to 19% and 13% relative WER reduction on the more challenging\ndev-other and test-other evaluation subsets.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 14 Jun 2021 14:14:28 GMT" + } + ], + "update_date":"2021-06-15", + "authors_parsed":[ + [ + "Hsu", + "Wei-Ning", + "" + ], + [ + "Bolte", + "Benjamin", + "" + ], + [ + "Tsai", + "Yao-Hung Hubert", + "" + ], + [ + "Lakhotia", + "Kushal", + "" + ], + [ + "Salakhutdinov", + "Ruslan", + "" + ], + [ + "Mohamed", + "Abdelrahman", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI", + "cs.LG", + "eess.AS" + ], + "citation_count":1516.0, + "inf_cite_count":463.0, + "publication_date":"2021-06-14" + }, + { + "id":"1807.03039", + "submitter":"Prafulla Dhariwal", + "authors":"Diederik P. Kingma, Prafulla Dhariwal", + "title":"Glow: Generative Flow with Invertible 1x1 Convolutions", + "comments":"15 pages; fixed typo in abstract", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"stat.ML cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Flow-based generative models (Dinh et al., 2014) are conceptually attractive\ndue to tractability of the exact log-likelihood, tractability of exact\nlatent-variable inference, and parallelizability of both training and\nsynthesis. In this paper we propose Glow, a simple type of generative flow\nusing an invertible 1x1 convolution. Using our method we demonstrate a\nsignificant improvement in log-likelihood on standard benchmarks. Perhaps most\nstrikingly, we demonstrate that a generative model optimized towards the plain\nlog-likelihood objective is capable of efficient realistic-looking synthesis\nand manipulation of large images. The code for our model is available at\nhttps:\/\/github.com\/openai\/glow\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 9 Jul 2018 10:57:26 GMT" + }, + { + "version":"v2", + "created":"Tue, 10 Jul 2018 05:12:03 GMT" + } + ], + "update_date":"2018-07-11", + "authors_parsed":[ + [ + "Kingma", + "Diederik P.", + "" + ], + [ + "Dhariwal", + "Prafulla", + "" + ] + ], + "categories_split":[ + "stat.ML", + "cs.AI", + "cs.LG" + ], + "citation_count":2555.0, + "inf_cite_count":454.0, + "publication_date":"2018-07-09" + }, + { + "id":"1903.07291", + "submitter":"Ming-Yu Liu", + "authors":"Taesung Park, Ming-Yu Liu, Ting-Chun Wang, Jun-Yan Zhu", + "title":"Semantic Image Synthesis with Spatially-Adaptive Normalization", + "comments":"Accepted as a CVPR 2019 oral paper", + "journal-ref":"CVPR 2019", + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.GR cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We propose spatially-adaptive normalization, a simple but effective layer for\nsynthesizing photorealistic images given an input semantic layout. Previous\nmethods directly feed the semantic layout as input to the deep network, which\nis then processed through stacks of convolution, normalization, and\nnonlinearity layers. We show that this is suboptimal as the normalization\nlayers tend to ``wash away'' semantic information. To address the issue, we\npropose using the input layout for modulating the activations in normalization\nlayers through a spatially-adaptive, learned transformation. Experiments on\nseveral challenging datasets demonstrate the advantage of the proposed method\nover existing approaches, regarding both visual fidelity and alignment with\ninput layouts. 
Finally, our model allows user control over both semantic and\nstyle. Code is available at https:\/\/github.com\/NVlabs\/SPADE .\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 18 Mar 2019 08:12:23 GMT" + }, + { + "version":"v2", + "created":"Tue, 5 Nov 2019 15:41:27 GMT" + } + ], + "update_date":"2019-11-06", + "authors_parsed":[ + [ + "Park", + "Taesung", + "" + ], + [ + "Liu", + "Ming-Yu", + "" + ], + [ + "Wang", + "Ting-Chun", + "" + ], + [ + "Zhu", + "Jun-Yan", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.GR", + "cs.LG" + ], + "citation_count":2218.0, + "inf_cite_count":447.0, + "publication_date":"2019-03-18" + }, + { + "id":"1506.03340", + "submitter":"Karl Moritz Hermann", + "authors":"Karl Moritz Hermann, Tom\\'a\\v{s} Ko\\v{c}isk\\'y, Edward Grefenstette,\n Lasse Espeholt, Will Kay, Mustafa Suleyman and Phil Blunsom", + "title":"Teaching Machines to Read and Comprehend", + "comments":"Appears in: Advances in Neural Information Processing Systems 28\n (NIPS 2015). 14 pages, 13 figures", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI cs.NE", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Teaching machines to read natural language documents remains an elusive\nchallenge. Machine reading systems can be tested on their ability to answer\nquestions posed on the contents of documents that they have seen, but until now\nlarge scale training and test datasets have been missing for this type of\nevaluation. In this work we define a new methodology that resolves this\nbottleneck and provides large scale supervised reading comprehension data. This\nallows us to develop a class of attention based deep neural networks that learn\nto read real documents and answer complex questions with minimal prior\nknowledge of language structure.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 10 Jun 2015 14:54:39 GMT" + }, + { + "version":"v2", + "created":"Thu, 1 Oct 2015 15:04:49 GMT" + }, + { + "version":"v3", + "created":"Thu, 19 Nov 2015 15:43:23 GMT" + } + ], + "update_date":"2015-11-20", + "authors_parsed":[ + [ + "Hermann", + "Karl Moritz", + "" + ], + [ + "Ko\u010disk\u00fd", + "Tom\u00e1\u0161", + "" + ], + [ + "Grefenstette", + "Edward", + "" + ], + [ + "Espeholt", + "Lasse", + "" + ], + [ + "Kay", + "Will", + "" + ], + [ + "Suleyman", + "Mustafa", + "" + ], + [ + "Blunsom", + "Phil", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI", + "cs.NE" + ], + "citation_count":3204.0, + "inf_cite_count":440.0, + "publication_date":"2015-06-10" + }, + { + "id":"1803.03635", + "submitter":"Jonathan Frankle", + "authors":"Jonathan Frankle and Michael Carbin", + "title":"The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks", + "comments":"ICLR camera ready", + "journal-ref":"ICLR 2019", + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.NE", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Neural network pruning techniques can reduce the parameter counts of trained\nnetworks by over 90%, decreasing storage requirements and improving\ncomputational performance of inference without compromising accuracy. 
However,\ncontemporary experience is that the sparse architectures produced by pruning\nare difficult to train from the start, which would similarly improve training\nperformance.\n We find that a standard pruning technique naturally uncovers subnetworks\nwhose initializations made them capable of training effectively. Based on these\nresults, we articulate the \"lottery ticket hypothesis:\" dense,\nrandomly-initialized, feed-forward networks contain subnetworks (\"winning\ntickets\") that - when trained in isolation - reach test accuracy comparable to\nthe original network in a similar number of iterations. The winning tickets we\nfind have won the initialization lottery: their connections have initial\nweights that make training particularly effective.\n We present an algorithm to identify winning tickets and a series of\nexperiments that support the lottery ticket hypothesis and the importance of\nthese fortuitous initializations. We consistently find winning tickets that are\nless than 10-20% of the size of several fully-connected and convolutional\nfeed-forward architectures for MNIST and CIFAR10. Above this size, the winning\ntickets that we find learn faster than the original network and reach higher\ntest accuracy.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 9 Mar 2018 18:51:28 GMT" + }, + { + "version":"v2", + "created":"Mon, 23 Apr 2018 19:58:09 GMT" + }, + { + "version":"v3", + "created":"Sun, 20 May 2018 19:46:47 GMT" + }, + { + "version":"v4", + "created":"Tue, 27 Nov 2018 20:03:01 GMT" + }, + { + "version":"v5", + "created":"Mon, 4 Mar 2019 15:51:11 GMT" + } + ], + "update_date":"2019-03-05", + "authors_parsed":[ + [ + "Frankle", + "Jonathan", + "" + ], + [ + "Carbin", + "Michael", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.NE" + ], + "citation_count":2705.0, + "inf_cite_count":437.0, + "publication_date":"2018-03-09" + }, + { + "id":"1812.05905", + "submitter":"Tuomas Haarnoja", + "authors":"Tuomas Haarnoja, Aurick Zhou, Kristian Hartikainen, George Tucker,\n Sehoon Ha, Jie Tan, Vikash Kumar, Henry Zhu, Abhishek Gupta, Pieter Abbeel\n and Sergey Levine", + "title":"Soft Actor-Critic Algorithms and Applications", + "comments":"arXiv admin note: substantial text overlap with arXiv:1801.01290", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.RO stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Model-free deep reinforcement learning (RL) algorithms have been successfully\napplied to a range of challenging sequential decision making and control tasks.\nHowever, these methods typically suffer from two major challenges: high sample\ncomplexity and brittleness to hyperparameters. Both of these challenges limit\nthe applicability of such methods to real-world domains. In this paper, we\ndescribe Soft Actor-Critic (SAC), our recently introduced off-policy\nactor-critic algorithm based on the maximum entropy RL framework. In this\nframework, the actor aims to simultaneously maximize expected return and\nentropy. That is, to succeed at the task while acting as randomly as possible.\nWe extend SAC to incorporate a number of modifications that accelerate training\nand improve stability with respect to the hyperparameters, including a\nconstrained formulation that automatically tunes the temperature\nhyperparameter. 
We systematically evaluate SAC on a range of benchmark tasks,\nas well as real-world challenging tasks such as locomotion for a quadrupedal\nrobot and robotic manipulation with a dexterous hand. With these improvements,\nSAC achieves state-of-the-art performance, outperforming prior on-policy and\noff-policy methods in sample-efficiency and asymptotic performance.\nFurthermore, we demonstrate that, in contrast to other off-policy algorithms,\nour approach is very stable, achieving similar performance across different\nrandom seeds. These results suggest that SAC is a promising candidate for\nlearning in real-world robotics tasks.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 13 Dec 2018 04:44:29 GMT" + }, + { + "version":"v2", + "created":"Tue, 29 Jan 2019 12:10:47 GMT" + } + ], + "update_date":"2019-09-16", + "authors_parsed":[ + [ + "Haarnoja", + "Tuomas", + "" + ], + [ + "Zhou", + "Aurick", + "" + ], + [ + "Hartikainen", + "Kristian", + "" + ], + [ + "Tucker", + "George", + "" + ], + [ + "Ha", + "Sehoon", + "" + ], + [ + "Tan", + "Jie", + "" + ], + [ + "Kumar", + "Vikash", + "" + ], + [ + "Zhu", + "Henry", + "" + ], + [ + "Gupta", + "Abhishek", + "" + ], + [ + "Abbeel", + "Pieter", + "" + ], + [ + "Levine", + "Sergey", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.RO", + "stat.ML" + ], + "citation_count":1752.0, + "inf_cite_count":436.0, + "publication_date":"2018-12-13" + }, + { + "id":"1907.02893", + "submitter":"Martin Arjovsky", + "authors":"Martin Arjovsky, L\\'eon Bottou, Ishaan Gulrajani, David Lopez-Paz", + "title":"Invariant Risk Minimization", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"stat.ML cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We introduce Invariant Risk Minimization (IRM), a learning paradigm to\nestimate invariant correlations across multiple training distributions. To\nachieve this goal, IRM learns a data representation such that the optimal\nclassifier, on top of that data representation, matches for all training\ndistributions. Through theory and experiments, we show how the invariances\nlearned by IRM relate to the causal structures governing the data and enable\nout-of-distribution generalization.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 5 Jul 2019 15:26:26 GMT" + }, + { + "version":"v2", + "created":"Sun, 1 Sep 2019 09:17:10 GMT" + }, + { + "version":"v3", + "created":"Fri, 27 Mar 2020 19:07:58 GMT" + } + ], + "update_date":"2020-03-31", + "authors_parsed":[ + [ + "Arjovsky", + "Martin", + "" + ], + [ + "Bottou", + "L\u00e9on", + "" + ], + [ + "Gulrajani", + "Ishaan", + "" + ], + [ + "Lopez-Paz", + "David", + "" + ] + ], + "categories_split":[ + "stat.ML", + "cs.AI", + "cs.LG" + ], + "citation_count":1631.0, + "inf_cite_count":423.0, + "publication_date":"2019-07-05" + }, + { + "id":"1503.00075", + "submitter":"Kai Sheng Tai", + "authors":"Kai Sheng Tai, Richard Socher, Christopher D. 
Manning", + "title":"Improved Semantic Representations From Tree-Structured Long Short-Term\n Memory Networks", + "comments":"Accepted for publication at ACL 2015", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Because of their superior ability to preserve sequence information over time,\nLong Short-Term Memory (LSTM) networks, a type of recurrent neural network with\na more complex computational unit, have obtained strong results on a variety of\nsequence modeling tasks. The only underlying LSTM structure that has been\nexplored so far is a linear chain. However, natural language exhibits syntactic\nproperties that would naturally combine words to phrases. We introduce the\nTree-LSTM, a generalization of LSTMs to tree-structured network topologies.\nTree-LSTMs outperform all existing systems and strong LSTM baselines on two\ntasks: predicting the semantic relatedness of two sentences (SemEval 2014, Task\n1) and sentiment classification (Stanford Sentiment Treebank).\n", + "versions":[ + { + "version":"v1", + "created":"Sat, 28 Feb 2015 06:31:50 GMT" + }, + { + "version":"v2", + "created":"Thu, 5 Mar 2015 20:13:25 GMT" + }, + { + "version":"v3", + "created":"Sat, 30 May 2015 06:51:20 GMT" + } + ], + "update_date":"2015-06-02", + "authors_parsed":[ + [ + "Tai", + "Kai Sheng", + "" + ], + [ + "Socher", + "Richard", + "" + ], + [ + "Manning", + "Christopher D.", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI", + "cs.LG" + ], + "citation_count":2961.0, + "inf_cite_count":422.0, + "publication_date":"2015-02-27" + }, + { + "id":"1207.4708", + "submitter":"Marc G. Bellemare", + "authors":"Marc G. Bellemare, Yavar Naddaf, Joel Veness, Michael Bowling", + "title":"The Arcade Learning Environment: An Evaluation Platform for General\n Agents", + "comments":null, + "journal-ref":"Journal of Artificial Intelligence Research 47, pages 253-279", + "doi":"10.1613\/jair.3912", + "report-no":null, + "categories":"cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" In this article we introduce the Arcade Learning Environment (ALE): both a\nchallenge problem and a platform and methodology for evaluating the development\nof general, domain-independent AI technology. ALE provides an interface to\nhundreds of Atari 2600 game environments, each one different, interesting, and\ndesigned to be a challenge for human players. ALE presents significant research\nchallenges for reinforcement learning, model learning, model-based planning,\nimitation learning, transfer learning, and intrinsic motivation. Most\nimportantly, it provides a rigorous testbed for evaluating and comparing\napproaches to these problems. We illustrate the promise of ALE by developing\nand benchmarking domain-independent agents designed using well-established AI\ntechniques for both reinforcement learning and planning. In doing so, we also\npropose an evaluation methodology made possible by ALE, reporting empirical\nresults on over 55 different games. 
All of the software, including the\nbenchmark agents, is publicly available.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 19 Jul 2012 15:33:25 GMT" + }, + { + "version":"v2", + "created":"Fri, 21 Jun 2013 18:07:06 GMT" + } + ], + "update_date":"2013-06-24", + "authors_parsed":[ + [ + "Bellemare", + "Marc G.", + "" + ], + [ + "Naddaf", + "Yavar", + "" + ], + [ + "Veness", + "Joel", + "" + ], + [ + "Bowling", + "Michael", + "" + ] + ], + "categories_split":[ + "cs.AI" + ], + "citation_count":2655.0, + "inf_cite_count":417.0, + "publication_date":"2012-07-19" + }, + { + "id":"1609.08144", + "submitter":"Mike Schuster", + "authors":"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,\n Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, Jeff\n Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, {\\L}ukasz Kaiser,\n Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,\n George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason\n Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, Jeffrey\n Dean", + "title":"Google's Neural Machine Translation System: Bridging the Gap between\n Human and Machine Translation", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Neural Machine Translation (NMT) is an end-to-end learning approach for\nautomated translation, with the potential to overcome many of the weaknesses of\nconventional phrase-based translation systems. Unfortunately, NMT systems are\nknown to be computationally expensive both in training and in translation\ninference. Also, most NMT systems have difficulty with rare words. These issues\nhave hindered NMT's use in practical deployments and services, where both\naccuracy and speed are essential. In this work, we present GNMT, Google's\nNeural Machine Translation system, which attempts to address many of these\nissues. Our model consists of a deep LSTM network with 8 encoder and 8 decoder\nlayers using attention and residual connections. To improve parallelism and\ntherefore decrease training time, our attention mechanism connects the bottom\nlayer of the decoder to the top layer of the encoder. To accelerate the final\ntranslation speed, we employ low-precision arithmetic during inference\ncomputations. To improve handling of rare words, we divide words into a limited\nset of common sub-word units (\"wordpieces\") for both input and output. This\nmethod provides a good balance between the flexibility of \"character\"-delimited\nmodels and the efficiency of \"word\"-delimited models, naturally handles\ntranslation of rare words, and ultimately improves the overall accuracy of the\nsystem. Our beam search technique employs a length-normalization procedure and\nuses a coverage penalty, which encourages generation of an output sentence that\nis most likely to cover all the words in the source sentence. On the WMT'14\nEnglish-to-French and English-to-German benchmarks, GNMT achieves competitive\nresults to state-of-the-art. 
Using a human side-by-side evaluation on a set of\nisolated simple sentences, it reduces translation errors by an average of 60%\ncompared to Google's phrase-based production system.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 26 Sep 2016 19:59:55 GMT" + }, + { + "version":"v2", + "created":"Sat, 8 Oct 2016 19:10:41 GMT" + } + ], + "update_date":"2016-10-11", + "authors_parsed":[ + [ + "Wu", + "Yonghui", + "" + ], + [ + "Schuster", + "Mike", + "" + ], + [ + "Chen", + "Zhifeng", + "" + ], + [ + "Le", + "Quoc V.", + "" + ], + [ + "Norouzi", + "Mohammad", + "" + ], + [ + "Macherey", + "Wolfgang", + "" + ], + [ + "Krikun", + "Maxim", + "" + ], + [ + "Cao", + "Yuan", + "" + ], + [ + "Gao", + "Qin", + "" + ], + [ + "Macherey", + "Klaus", + "" + ], + [ + "Klingner", + "Jeff", + "" + ], + [ + "Shah", + "Apurva", + "" + ], + [ + "Johnson", + "Melvin", + "" + ], + [ + "Liu", + "Xiaobing", + "" + ], + [ + "Kaiser", + "\u0141ukasz", + "" + ], + [ + "Gouws", + "Stephan", + "" + ], + [ + "Kato", + "Yoshikiyo", + "" + ], + [ + "Kudo", + "Taku", + "" + ], + [ + "Kazawa", + "Hideto", + "" + ], + [ + "Stevens", + "Keith", + "" + ], + [ + "Kurian", + "George", + "" + ], + [ + "Patil", + "Nishant", + "" + ], + [ + "Wang", + "Wei", + "" + ], + [ + "Young", + "Cliff", + "" + ], + [ + "Smith", + "Jason", + "" + ], + [ + "Riesa", + "Jason", + "" + ], + [ + "Rudnick", + "Alex", + "" + ], + [ + "Vinyals", + "Oriol", + "" + ], + [ + "Corrado", + "Greg", + "" + ], + [ + "Hughes", + "Macduff", + "" + ], + [ + "Dean", + "Jeffrey", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI", + "cs.LG" + ], + "citation_count":6205.0, + "inf_cite_count":415.0, + "publication_date":"2016-09-26" + }, + { + "id":"cs\/9605103", + "submitter":null, + "authors":"L. P. Kaelbling, M. L. Littman, A. W. Moore", + "title":"Reinforcement Learning: A Survey", + "comments":"See http:\/\/www.jair.org\/ for any accompanying files", + "journal-ref":"Journal of Artificial Intelligence Research, Vol 4, (1996),\n 237-285", + "doi":null, + "report-no":null, + "categories":"cs.AI", + "license":null, + "abstract":" This paper surveys the field of reinforcement learning from a\ncomputer-science perspective. It is written to be accessible to researchers\nfamiliar with machine learning. Both the historical basis of the field and a\nbroad selection of current work are summarized. Reinforcement learning is the\nproblem faced by an agent that learns behavior through trial-and-error\ninteractions with a dynamic environment. The work described here has a\nresemblance to work in psychology, but differs considerably in the details and\nin the use of the word ``reinforcement.'' The paper discusses central issues of\nreinforcement learning, including trading off exploration and exploitation,\nestablishing the foundations of the field via Markov decision theory, learning\nfrom delayed reinforcement, constructing empirical models to accelerate\nlearning, making use of generalization and hierarchy, and coping with hidden\nstate. It concludes with a survey of some implemented systems and an assessment\nof the practical utility of current methods for reinforcement learning.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 1 May 1996 00:00:00 GMT" + } + ], + "update_date":"2014-11-17", + "authors_parsed":[ + [ + "Kaelbling", + "L. P.", + "" + ], + [ + "Littman", + "M. L.", + "" + ], + [ + "Moore", + "A. 
W.", + "" + ] + ], + "categories_split":[ + "cs.AI" + ], + "citation_count":8396.0, + "inf_cite_count":408.0, + "publication_date":"1996-04-30" + }, + { + "id":"1802.00420", + "submitter":"Anish Athalye", + "authors":"Anish Athalye, Nicholas Carlini, David Wagner", + "title":"Obfuscated Gradients Give a False Sense of Security: Circumventing\n Defenses to Adversarial Examples", + "comments":"ICML 2018. Source code at\n https:\/\/github.com\/anishathalye\/obfuscated-gradients", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.CR", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We identify obfuscated gradients, a kind of gradient masking, as a phenomenon\nthat leads to a false sense of security in defenses against adversarial\nexamples. While defenses that cause obfuscated gradients appear to defeat\niterative optimization-based attacks, we find defenses relying on this effect\ncan be circumvented. We describe characteristic behaviors of defenses\nexhibiting the effect, and for each of the three types of obfuscated gradients\nwe discover, we develop attack techniques to overcome it. In a case study,\nexamining non-certified white-box-secure defenses at ICLR 2018, we find\nobfuscated gradients are a common occurrence, with 7 of 9 defenses relying on\nobfuscated gradients. Our new attacks successfully circumvent 6 completely, and\n1 partially, in the original threat model each paper considers.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 1 Feb 2018 18:20:05 GMT" + }, + { + "version":"v2", + "created":"Thu, 15 Feb 2018 16:32:56 GMT" + }, + { + "version":"v3", + "created":"Thu, 7 Jun 2018 16:37:42 GMT" + }, + { + "version":"v4", + "created":"Tue, 31 Jul 2018 00:09:56 GMT" + } + ], + "update_date":"2018-08-01", + "authors_parsed":[ + [ + "Athalye", + "Anish", + "" + ], + [ + "Carlini", + "Nicholas", + "" + ], + [ + "Wagner", + "David", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.CR" + ], + "citation_count":2813.0, + "inf_cite_count":407.0, + "publication_date":"2018-02-01" + }, + { + "id":"1011.0686", + "submitter":"Stephane Ross", + "authors":"Stephane Ross, Geoffrey J. Gordon, J. Andrew Bagnell", + "title":"A Reduction of Imitation Learning and Structured Prediction to No-Regret\n Online Learning", + "comments":"Appearing in the 14th International Conference on Artificial\n Intelligence and Statistics (AISTATS 2011)", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Sequential prediction problems such as imitation learning, where future\nobservations depend on previous predictions (actions), violate the common\ni.i.d. assumptions made in statistical learning. This leads to poor performance\nin theory and often in practice. Some recent approaches provide stronger\nguarantees in this setting, but remain somewhat unsatisfactory as they train\neither non-stationary or stochastic policies and require a large number of\niterations. In this paper, we propose a new iterative algorithm, which trains a\nstationary deterministic policy, that can be seen as a no regret algorithm in\nan online learning setting. 
We show that any such no regret algorithm, combined\nwith additional reduction assumptions, must find a policy with good performance\nunder the distribution of observations it induces in such sequential settings.\nWe demonstrate that this new approach outperforms previous approaches on two\nchallenging imitation learning problems and a benchmark sequence labeling\nproblem.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 2 Nov 2010 17:55:55 GMT" + }, + { + "version":"v2", + "created":"Wed, 3 Nov 2010 15:59:19 GMT" + }, + { + "version":"v3", + "created":"Wed, 16 Mar 2011 18:51:21 GMT" + } + ], + "update_date":"2015-03-17", + "authors_parsed":[ + [ + "Ross", + "Stephane", + "" + ], + [ + "Gordon", + "Geoffrey J.", + "" + ], + [ + "Bagnell", + "J. Andrew", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "stat.ML" + ], + "citation_count":2623.0, + "inf_cite_count":406.0, + "publication_date":"2010-11-02" + }, + { + "id":"1604.07379", + "submitter":"Deepak Pathak", + "authors":"Deepak Pathak, Philipp Krahenbuhl, Jeff Donahue, Trevor Darrell,\n Alexei A. Efros", + "title":"Context Encoders: Feature Learning by Inpainting", + "comments":"New results on ImageNet Generation", + "journal-ref":"CVPR 2016", + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.GR cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We present an unsupervised visual feature learning algorithm driven by\ncontext-based pixel prediction. By analogy with auto-encoders, we propose\nContext Encoders -- a convolutional neural network trained to generate the\ncontents of an arbitrary image region conditioned on its surroundings. In order\nto succeed at this task, context encoders need to both understand the content\nof the entire image, as well as produce a plausible hypothesis for the missing\npart(s). When training context encoders, we have experimented with both a\nstandard pixel-wise reconstruction loss, as well as a reconstruction plus an\nadversarial loss. The latter produces much sharper results because it can\nbetter handle multiple modes in the output. We found that a context encoder\nlearns a representation that captures not just appearance but also the\nsemantics of visual structures. We quantitatively demonstrate the effectiveness\nof our learned features for CNN pre-training on classification, detection, and\nsegmentation tasks. Furthermore, context encoders can be used for semantic\ninpainting tasks, either stand-alone or as initialization for non-parametric\nmethods.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 25 Apr 2016 19:42:46 GMT" + }, + { + "version":"v2", + "created":"Mon, 21 Nov 2016 20:56:42 GMT" + } + ], + "update_date":"2016-11-22", + "authors_parsed":[ + [ + "Pathak", + "Deepak", + "" + ], + [ + "Krahenbuhl", + "Philipp", + "" + ], + [ + "Donahue", + "Jeff", + "" + ], + [ + "Darrell", + "Trevor", + "" + ], + [ + "Efros", + "Alexei A.", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.GR", + "cs.LG" + ], + "citation_count":4736.0, + "inf_cite_count":403.0, + "publication_date":"2016-04-25" + }, + { + "id":"1106.4561", + "submitter":"M. Fox", + "authors":"M. Fox, D. 
Long", + "title":"PDDL2.1: An Extension to PDDL for Expressing Temporal Planning Domains", + "comments":null, + "journal-ref":"Journal Of Artificial Intelligence Research, Volume 20, pages\n 61-124, 2003", + "doi":"10.1613\/jair.1129", + "report-no":null, + "categories":"cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" In recent years research in the planning community has moved increasingly\ntoward s application of planners to realistic problems involving both time and\nmany typ es of resources. For example, interest in planning demonstrated by the\nspace res earch community has inspired work in observation scheduling,\nplanetary rover ex ploration and spacecraft control domains. Other temporal and\nresource-intensive domains including logistics planning, plant control and\nmanufacturing have also helped to focus the community on the modelling and\nreasoning issues that must be confronted to make planning technology meet the\nchallenges of application. The International Planning Competitions have acted\nas an important motivating fo rce behind the progress that has been made in\nplanning since 1998. The third com petition (held in 2002) set the planning\ncommunity the challenge of handling tim e and numeric resources. This\nnecessitated the development of a modelling langua ge capable of expressing\ntemporal and numeric properties of planning domains. In this paper we describe\nthe language, PDDL2.1, that was used in the competition. We describe the syntax\nof the language, its formal semantics and the validation of concurrent plans.\nWe observe that PDDL2.1 has considerable modelling power --- exceeding the\ncapabilities of current planning technology --- and presents a number of\nimportant challenges to the research community.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 22 Jun 2011 20:20:10 GMT" + } + ], + "update_date":"2011-06-24", + "authors_parsed":[ + [ + "Fox", + "M.", + "" + ], + [ + "Long", + "D.", + "" + ] + ], + "categories_split":[ + "cs.AI" + ], + "citation_count":2031.0, + "inf_cite_count":402.0, + "publication_date":"2003-12-01" + }, + { + "id":"2304.02643", + "submitter":"Alexander Kirillov", + "authors":"Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe\n Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg,\n Wan-Yen Lo, Piotr Doll\\'ar, Ross Girshick", + "title":"Segment Anything", + "comments":"Project web-page: https:\/\/segment-anything.com", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We introduce the Segment Anything (SA) project: a new task, model, and\ndataset for image segmentation. Using our efficient model in a data collection\nloop, we built the largest segmentation dataset to date (by far), with over 1\nbillion masks on 11M licensed and privacy respecting images. The model is\ndesigned and trained to be promptable, so it can transfer zero-shot to new\nimage distributions and tasks. We evaluate its capabilities on numerous tasks\nand find that its zero-shot performance is impressive -- often competitive with\nor even superior to prior fully supervised results. 
We are releasing the\nSegment Anything Model (SAM) and corresponding dataset (SA-1B) of 1B masks and\n11M images at https:\/\/segment-anything.com to foster research into foundation\nmodels for computer vision.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 5 Apr 2023 17:59:46 GMT" + } + ], + "update_date":"2023-04-06", + "authors_parsed":[ + [ + "Kirillov", + "Alexander", + "" + ], + [ + "Mintun", + "Eric", + "" + ], + [ + "Ravi", + "Nikhila", + "" + ], + [ + "Mao", + "Hanzi", + "" + ], + [ + "Rolland", + "Chloe", + "" + ], + [ + "Gustafson", + "Laura", + "" + ], + [ + "Xiao", + "Tete", + "" + ], + [ + "Whitehead", + "Spencer", + "" + ], + [ + "Berg", + "Alexander C.", + "" + ], + [ + "Lo", + "Wan-Yen", + "" + ], + [ + "Doll\u00e1r", + "Piotr", + "" + ], + [ + "Girshick", + "Ross", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG" + ], + "citation_count":2031.0, + "inf_cite_count":401.0, + "publication_date":"2023-04-05" + }, + { + "id":"1802.01548", + "submitter":"Esteban Real", + "authors":"Esteban Real, Alok Aggarwal, Yanping Huang and Quoc V Le", + "title":"Regularized Evolution for Image Classifier Architecture Search", + "comments":"Accepted for publication at AAAI 2019, the Thirty-Third AAAI\n Conference on Artificial Intelligence", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.NE cs.AI cs.CV cs.DC", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" The effort devoted to hand-crafting neural network image classifiers has\nmotivated the use of architecture search to discover them automatically.\nAlthough evolutionary algorithms have been repeatedly applied to neural network\ntopologies, the image classifiers thus discovered have remained inferior to\nhuman-crafted ones. Here, we evolve an image classifier---AmoebaNet-A---that\nsurpasses hand-designs for the first time. To do this, we modify the tournament\nselection evolutionary algorithm by introducing an age property to favor the\nyounger genotypes. Matching size, AmoebaNet-A has comparable accuracy to\ncurrent state-of-the-art ImageNet models discovered with more complex\narchitecture-search methods. Scaled to larger size, AmoebaNet-A sets a new\nstate-of-the-art 83.9% \/ 96.6% top-5 ImageNet accuracy. In a controlled\ncomparison against a well known reinforcement learning algorithm, we give\nevidence that evolution can obtain results faster with the same hardware,\nespecially at the earlier stages of the search. This is relevant when fewer\ncompute resources are available. 
Evolution is, thus, a simple method to\neffectively discover high-quality architectures.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 5 Feb 2018 18:20:52 GMT" + }, + { + "version":"v2", + "created":"Tue, 6 Feb 2018 18:24:29 GMT" + }, + { + "version":"v3", + "created":"Thu, 1 Mar 2018 00:10:00 GMT" + }, + { + "version":"v4", + "created":"Mon, 25 Jun 2018 06:21:47 GMT" + }, + { + "version":"v5", + "created":"Thu, 4 Oct 2018 00:11:37 GMT" + }, + { + "version":"v6", + "created":"Fri, 26 Oct 2018 05:56:00 GMT" + }, + { + "version":"v7", + "created":"Sat, 16 Feb 2019 23:28:16 GMT" + } + ], + "update_date":"2019-02-19", + "authors_parsed":[ + [ + "Real", + "Esteban", + "" + ], + [ + "Aggarwal", + "Alok", + "" + ], + [ + "Huang", + "Yanping", + "" + ], + [ + "Le", + "Quoc V", + "" + ] + ], + "categories_split":[ + "cs.NE", + "cs.AI", + "cs.CV", + "cs.DC" + ], + "citation_count":2609.0, + "inf_cite_count":386.0, + "publication_date":"2018-02-05" + }, + { + "id":"1706.07269", + "submitter":"Tim Miller", + "authors":"Tim Miller", + "title":"Explanation in Artificial Intelligence: Insights from the Social\n Sciences", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" There has been a recent resurgence in the area of explainable artificial\nintelligence as researchers and practitioners seek to make their algorithms\nmore understandable. Much of this research is focused on explicitly explaining\ndecisions or actions to a human observer, and it should not be controversial to\nsay that looking at how humans explain to each other can serve as a useful\nstarting point for explanation in artificial intelligence. However, it is fair\nto say that most work in explainable artificial intelligence uses only the\nresearchers' intuition of what constitutes a `good' explanation. There exists\nvast and valuable bodies of research in philosophy, psychology, and cognitive\nscience of how people define, generate, select, evaluate, and present\nexplanations, which argues that people employ certain cognitive biases and\nsocial expectations towards the explanation process. This paper argues that the\nfield of explainable artificial intelligence should build on this existing\nresearch, and reviews relevant papers from philosophy, cognitive\npsychology\/science, and social psychology, which study these topics. 
It draws\nout some important findings, and discusses ways that these can be infused with\nwork on explainable artificial intelligence.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 22 Jun 2017 11:46:11 GMT" + }, + { + "version":"v2", + "created":"Thu, 24 May 2018 02:43:30 GMT" + }, + { + "version":"v3", + "created":"Wed, 15 Aug 2018 00:50:00 GMT" + } + ], + "update_date":"2018-08-16", + "authors_parsed":[ + [ + "Miller", + "Tim", + "" + ] + ], + "categories_split":[ + "cs.AI" + ], + "citation_count":3283.0, + "inf_cite_count":382.0, + "publication_date":"2017-06-22" + }, + { + "id":"1905.02249", + "submitter":"David Berthelot", + "authors":"David Berthelot, Nicholas Carlini, Ian Goodfellow, Nicolas Papernot,\n Avital Oliver, Colin Raffel", + "title":"MixMatch: A Holistic Approach to Semi-Supervised Learning", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.CV stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Semi-supervised learning has proven to be a powerful paradigm for leveraging\nunlabeled data to mitigate the reliance on large labeled datasets. In this\nwork, we unify the current dominant approaches for semi-supervised learning to\nproduce a new algorithm, MixMatch, that works by guessing low-entropy labels\nfor data-augmented unlabeled examples and mixing labeled and unlabeled data\nusing MixUp. We show that MixMatch obtains state-of-the-art results by a large\nmargin across many datasets and labeled data amounts. For example, on CIFAR-10\nwith 250 labels, we reduce error rate by a factor of 4 (from 38% to 11%) and by\na factor of 2 on STL-10. We also demonstrate how MixMatch can help achieve a\ndramatically better accuracy-privacy trade-off for differential privacy.\nFinally, we perform an ablation study to tease apart which components of\nMixMatch are most important for its success.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 6 May 2019 19:56:03 GMT" + }, + { + "version":"v2", + "created":"Wed, 23 Oct 2019 18:47:34 GMT" + } + ], + "update_date":"2019-10-25", + "authors_parsed":[ + [ + "Berthelot", + "David", + "" + ], + [ + "Carlini", + "Nicholas", + "" + ], + [ + "Goodfellow", + "Ian", + "" + ], + [ + "Papernot", + "Nicolas", + "" + ], + [ + "Oliver", + "Avital", + "" + ], + [ + "Raffel", + "Colin", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.CV", + "stat.ML" + ], + "citation_count":2397.0, + "inf_cite_count":380.0, + "publication_date":"2019-05-06" + }, + { + "id":"1706.08840", + "submitter":"David Lopez-Paz", + "authors":"David Lopez-Paz and Marc'Aurelio Ranzato", + "title":"Gradient Episodic Memory for Continual Learning", + "comments":"Published at NIPS 2017", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" One major obstacle towards AI is the poor ability of models to solve new\nproblems quicker, and without forgetting previously acquired knowledge. To\nbetter understand this issue, we study the problem of continual learning, where\nthe model observes, once and one by one, examples concerning a sequence of\ntasks. First, we propose a set of metrics to evaluate models learning over a\ncontinuum of data. These metrics characterize models not only by their test\naccuracy, but also in terms of their ability to transfer knowledge across\ntasks. 
Second, we propose a model for continual learning, called Gradient\nEpisodic Memory (GEM) that alleviates forgetting, while allowing beneficial\ntransfer of knowledge to previous tasks. Our experiments on variants of the\nMNIST and CIFAR-100 datasets demonstrate the strong performance of GEM when\ncompared to the state-of-the-art.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 26 Jun 2017 14:53:34 GMT" + }, + { + "version":"v2", + "created":"Wed, 28 Jun 2017 16:19:12 GMT" + }, + { + "version":"v3", + "created":"Wed, 12 Jul 2017 17:14:59 GMT" + }, + { + "version":"v4", + "created":"Wed, 2 Aug 2017 20:37:51 GMT" + }, + { + "version":"v5", + "created":"Sat, 4 Nov 2017 13:11:18 GMT" + }, + { + "version":"v6", + "created":"Tue, 13 Sep 2022 14:47:52 GMT" + } + ], + "update_date":"2022-09-14", + "authors_parsed":[ + [ + "Lopez-Paz", + "David", + "" + ], + [ + "Ranzato", + "Marc'Aurelio", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI" + ], + "citation_count":2015.0, + "inf_cite_count":378.0, + "publication_date":"2017-06-01" + }, + { + "id":"1003.0146", + "submitter":"Lihong Li", + "authors":"Lihong Li, Wei Chu, John Langford, Robert E. Schapire", + "title":"A Contextual-Bandit Approach to Personalized News Article Recommendation", + "comments":"10 pages, 5 figures", + "journal-ref":"Presented at the Nineteenth International Conference on World Wide\n Web (WWW 2010), Raleigh, NC, USA, 2010", + "doi":"10.1145\/1772690.1772758", + "report-no":null, + "categories":"cs.LG cs.AI cs.IR", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Personalized web services strive to adapt their services (advertisements,\nnews articles, etc) to individual users by making use of both content and user\ninformation. Despite a few recent advances, this problem remains challenging\nfor at least two reasons. First, web service is featured with dynamically\nchanging pools of content, rendering traditional collaborative filtering\nmethods inapplicable. Second, the scale of most web services of practical\ninterest calls for solutions that are both fast in learning and computation.\n In this work, we model personalized recommendation of news articles as a\ncontextual bandit problem, a principled approach in which a learning algorithm\nsequentially selects articles to serve users based on contextual information\nabout the users and articles, while simultaneously adapting its\narticle-selection strategy based on user-click feedback to maximize total user\nclicks.\n The contributions of this work are three-fold. First, we propose a new,\ngeneral contextual bandit algorithm that is computationally efficient and well\nmotivated from learning theory. Second, we argue that any bandit algorithm can\nbe reliably evaluated offline using previously recorded random traffic.\nFinally, using this offline evaluation method, we successfully applied our new\nalgorithm to a Yahoo! Front Page Today Module dataset containing over 33\nmillion events. 
Results showed a 12.5% click lift compared to a standard\ncontext-free bandit algorithm, and the advantage becomes even greater when data\ngets more scarce.\n", + "versions":[ + { + "version":"v1", + "created":"Sun, 28 Feb 2010 02:18:59 GMT" + }, + { + "version":"v2", + "created":"Thu, 1 Mar 2012 23:49:42 GMT" + } + ], + "update_date":"2012-03-05", + "authors_parsed":[ + [ + "Li", + "Lihong", + "" + ], + [ + "Chu", + "Wei", + "" + ], + [ + "Langford", + "John", + "" + ], + [ + "Schapire", + "Robert E.", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.IR" + ], + "citation_count":2605.0, + "inf_cite_count":376.0, + "publication_date":"2010-02-27" + }, + { + "id":"cs\/0102027", + "submitter":"Candida Ferreira", + "authors":"Candida Ferreira", + "title":"Gene Expression Programming: a New Adaptive Algorithm for Solving\n Problems", + "comments":"22 pages, 17 figures", + "journal-ref":"Complex Systems, 13(2): 87-129, 2001", + "doi":null, + "report-no":null, + "categories":"cs.AI cs.NE", + "license":null, + "abstract":" Gene expression programming, a genotype\/phenotype genetic algorithm (linear\nand ramified), is presented here for the first time as a new technique for the\ncreation of computer programs. Gene expression programming uses character\nlinear chromosomes composed of genes structurally organized in a head and a\ntail. The chromosomes function as a genome and are subjected to modification by\nmeans of mutation, transposition, root transposition, gene transposition, gene\nrecombination, and one- and two-point recombination. The chromosomes encode\nexpression trees which are the object of selection. The creation of these\nseparate entities (genome and expression tree) with distinct functions allows\nthe algorithm to perform with high efficiency that greatly surpasses existing\nadaptive techniques. The suite of problems chosen to illustrate the power and\nversatility of gene expression programming includes symbolic regression,\nsequence induction with and without constant creation, block stacking, cellular\nautomata rules for the density-classification problem, and two problems of\nboolean concept learning: the 11-multiplexer and the GP rule problem.\n", + "versions":[ + { + "version":"v1", + "created":"Sun, 25 Feb 2001 19:29:55 GMT" + }, + { + "version":"v2", + "created":"Tue, 17 Apr 2001 12:21:18 GMT" + }, + { + "version":"v3", + "created":"Sun, 30 Dec 2001 20:04:47 GMT" + } + ], + "update_date":"2007-05-23", + "authors_parsed":[ + [ + "Ferreira", + "Candida", + "" + ] + ], + "categories_split":[ + "cs.AI", + "cs.NE" + ], + "citation_count":2134.0, + "inf_cite_count":376.0, + "publication_date":"2001-02-25" + }, + { + "id":"1707.01495", + "submitter":"Marcin Andrychowicz", + "authors":"Marcin Andrychowicz, Filip Wolski, Alex Ray, Jonas Schneider, Rachel\n Fong, Peter Welinder, Bob McGrew, Josh Tobin, Pieter Abbeel, Wojciech Zaremba", + "title":"Hindsight Experience Replay", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.NE cs.RO", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Dealing with sparse rewards is one of the biggest challenges in Reinforcement\nLearning (RL). We present a novel technique called Hindsight Experience Replay\nwhich allows sample-efficient learning from rewards which are sparse and binary\nand therefore avoid the need for complicated reward engineering. 
It can be\ncombined with an arbitrary off-policy RL algorithm and may be seen as a form of\nimplicit curriculum.\n We demonstrate our approach on the task of manipulating objects with a\nrobotic arm. In particular, we run experiments on three different tasks:\npushing, sliding, and pick-and-place, in each case using only binary rewards\nindicating whether or not the task is completed. Our ablation studies show that\nHindsight Experience Replay is a crucial ingredient which makes training\npossible in these challenging environments. We show that our policies trained\non a physics simulation can be deployed on a physical robot and successfully\ncomplete the task.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 5 Jul 2017 17:55:53 GMT" + }, + { + "version":"v2", + "created":"Mon, 10 Jul 2017 18:35:33 GMT" + }, + { + "version":"v3", + "created":"Fri, 23 Feb 2018 10:04:20 GMT" + } + ], + "update_date":"2018-02-26", + "authors_parsed":[ + [ + "Andrychowicz", + "Marcin", + "" + ], + [ + "Wolski", + "Filip", + "" + ], + [ + "Ray", + "Alex", + "" + ], + [ + "Schneider", + "Jonas", + "" + ], + [ + "Fong", + "Rachel", + "" + ], + [ + "Welinder", + "Peter", + "" + ], + [ + "McGrew", + "Bob", + "" + ], + [ + "Tobin", + "Josh", + "" + ], + [ + "Abbeel", + "Pieter", + "" + ], + [ + "Zaremba", + "Wojciech", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.NE", + "cs.RO" + ], + "citation_count":1939.0, + "inf_cite_count":374.0, + "publication_date":"2017-07-05" + }, + { + "id":"0805.2368", + "submitter":"Alex Smola J", + "authors":"Arthur Gretton, Karsten Borgwardt, Malte J. Rasch, Bernhard Scholkopf,\n Alexander J. Smola", + "title":"A Kernel Method for the Two-Sample Problem", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We propose a framework for analyzing and comparing distributions, allowing us\nto design statistical tests to determine if two samples are drawn from\ndifferent distributions. Our test statistic is the largest difference in\nexpectations over functions in the unit ball of a reproducing kernel Hilbert\nspace (RKHS). We present two tests based on large deviation bounds for the test\nstatistic, while a third is based on the asymptotic distribution of this\nstatistic. The test statistic can be computed in quadratic time, although\nefficient linear time approximations are available. Several classical metrics\non distributions are recovered when the function space used to compute the\ndifference in expectations is allowed to be more general (eg. a Banach space).\nWe apply our two-sample tests to a variety of problems, including attribute\nmatching for databases using the Hungarian marriage method, where they perform\nstrongly. 
Excellent performance is also obtained when comparing distributions\nover graphs, for which these are the first such tests.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 15 May 2008 17:46:53 GMT" + } + ], + "update_date":"2008-05-16", + "authors_parsed":[ + [ + "Gretton", + "Arthur", + "" + ], + [ + "Borgwardt", + "Karsten", + "" + ], + [ + "Rasch", + "Malte J.", + "" + ], + [ + "Scholkopf", + "Bernhard", + "" + ], + [ + "Smola", + "Alexander J.", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI" + ], + "citation_count":2074.0, + "inf_cite_count":369.0, + "publication_date":"2006-12-04" + }, + { + "id":"1511.05493", + "submitter":"Yujia Li", + "authors":"Yujia Li, Daniel Tarlow, Marc Brockschmidt, Richard Zemel", + "title":"Gated Graph Sequence Neural Networks", + "comments":"Published as a conference paper in ICLR 2016. Fixed a typo", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.NE stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Graph-structured data appears frequently in domains including chemistry,\nnatural language semantics, social networks, and knowledge bases. In this work,\nwe study feature learning techniques for graph-structured inputs. Our starting\npoint is previous work on Graph Neural Networks (Scarselli et al., 2009), which\nwe modify to use gated recurrent units and modern optimization techniques and\nthen extend to output sequences. The result is a flexible and broadly useful\nclass of neural network models that has favorable inductive biases relative to\npurely sequence-based models (e.g., LSTMs) when the problem is\ngraph-structured. We demonstrate the capabilities on some simple AI (bAbI) and\ngraph algorithm learning tasks. We then show it achieves state-of-the-art\nperformance on a problem from program verification, in which subgraphs need to\nbe matched to abstract data structures.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 17 Nov 2015 18:10:12 GMT" + }, + { + "version":"v2", + "created":"Thu, 19 Nov 2015 22:03:02 GMT" + }, + { + "version":"v3", + "created":"Tue, 3 May 2016 21:55:01 GMT" + }, + { + "version":"v4", + "created":"Fri, 22 Sep 2017 21:36:00 GMT" + } + ], + "update_date":"2017-09-26", + "authors_parsed":[ + [ + "Li", + "Yujia", + "" + ], + [ + "Tarlow", + "Daniel", + "" + ], + [ + "Brockschmidt", + "Marc", + "" + ], + [ + "Zemel", + "Richard", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.NE", + "stat.ML" + ], + "citation_count":2895.0, + "inf_cite_count":359.0, + "publication_date":"2015-11-17" + }, + { + "id":"1302.4964", + "submitter":"George H. John", + "authors":"George H. John, Pat Langley", + "title":"Estimating Continuous Distributions in Bayesian Classifiers", + "comments":"Appears in Proceedings of the Eleventh Conference on Uncertainty in\n Artificial Intelligence (UAI1995)", + "journal-ref":null, + "doi":null, + "report-no":"UAI-P-1995-PG-338-345", + "categories":"cs.LG cs.AI stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" When modeling a probability distribution with a Bayesian network, we are\nfaced with the problem of how to handle continuous variables. Most previous\nwork has either solved the problem by discretizing, or assumed that the data\nare generated by a single Gaussian. In this paper we abandon the normality\nassumption and instead use statistical methods for nonparametric density\nestimation. 
For a naive Bayesian classifier, we present experimental results on\na variety of natural and artificial domains, comparing two methods of density\nestimation: assuming normality and modeling each conditional distribution with\na single Gaussian; and using nonparametric kernel density estimation. We\nobserve large reductions in error on several natural and artificial data sets,\nwhich suggests that kernel estimation is a useful tool for learning Bayesian\nmodels.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 20 Feb 2013 15:22:01 GMT" + } + ], + "update_date":"2013-02-21", + "authors_parsed":[ + [ + "John", + "George H.", + "" + ], + [ + "Langley", + "Pat", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "stat.ML" + ], + "citation_count":3571.0, + "inf_cite_count":349.0, + "publication_date":"1995-08-18" + }, + { + "id":"1609.07843", + "submitter":"Richard Socher", + "authors":"Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher", + "title":"Pointer Sentinel Mixture Models", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Recent neural network sequence models with softmax classifiers have achieved\ntheir best language modeling performance only with very large hidden states and\nlarge vocabularies. Even then they struggle to predict rare or unseen words\neven if the context makes the prediction unambiguous. We introduce the pointer\nsentinel mixture architecture for neural sequence models which has the ability\nto either reproduce a word from the recent context or produce a word from a\nstandard softmax classifier. Our pointer sentinel-LSTM model achieves state of\nthe art language modeling performance on the Penn Treebank (70.9 perplexity)\nwhile using far fewer parameters than a standard softmax LSTM. In order to\nevaluate how well language models can exploit longer contexts and deal with\nmore realistic vocabularies and larger corpora we also introduce the freely\navailable WikiText corpus.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 26 Sep 2016 04:06:13 GMT" + } + ], + "update_date":"2016-09-27", + "authors_parsed":[ + [ + "Merity", + "Stephen", + "" + ], + [ + "Xiong", + "Caiming", + "" + ], + [ + "Bradbury", + "James", + "" + ], + [ + "Socher", + "Richard", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI" + ], + "citation_count":1844.0, + "inf_cite_count":345.0, + "publication_date":"2016-09-26" + }, + { + "id":"2012.07436", + "submitter":"Haoyi Zhou", + "authors":"Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui\n Xiong, Wancai Zhang", + "title":"Informer: Beyond Efficient Transformer for Long Sequence Time-Series\n Forecasting", + "comments":"8 pages (main), 5 pages (appendix) and to be appeared in AAAI2021", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.IR", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Many real-world applications require the prediction of long sequence\ntime-series, such as electricity consumption planning. Long sequence\ntime-series forecasting (LSTF) demands a high prediction capacity of the model,\nwhich is the ability to capture precise long-range dependency coupling between\noutput and input efficiently. Recent studies have shown the potential of\nTransformer to increase the prediction capacity. 
However, there are several\nsevere issues with Transformer that prevent it from being directly applicable\nto LSTF, including quadratic time complexity, high memory usage, and inherent\nlimitation of the encoder-decoder architecture. To address these issues, we\ndesign an efficient transformer-based model for LSTF, named Informer, with\nthree distinctive characteristics: (i) a $ProbSparse$ self-attention mechanism,\nwhich achieves $O(L \\log L)$ in time complexity and memory usage, and has\ncomparable performance on sequences' dependency alignment. (ii) the\nself-attention distilling highlights dominating attention by halving cascading\nlayer input, and efficiently handles extreme long input sequences. (iii) the\ngenerative style decoder, while conceptually simple, predicts the long\ntime-series sequences at one forward operation rather than a step-by-step way,\nwhich drastically improves the inference speed of long-sequence predictions.\nExtensive experiments on four large-scale datasets demonstrate that Informer\nsignificantly outperforms existing methods and provides a new solution to the\nLSTF problem.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 14 Dec 2020 11:43:09 GMT" + }, + { + "version":"v2", + "created":"Thu, 17 Dec 2020 03:05:27 GMT" + }, + { + "version":"v3", + "created":"Sun, 28 Mar 2021 14:45:04 GMT" + } + ], + "update_date":"2021-03-30", + "authors_parsed":[ + [ + "Zhou", + "Haoyi", + "" + ], + [ + "Zhang", + "Shanghang", + "" + ], + [ + "Peng", + "Jieqi", + "" + ], + [ + "Zhang", + "Shuai", + "" + ], + [ + "Li", + "Jianxin", + "" + ], + [ + "Xiong", + "Hui", + "" + ], + [ + "Zhang", + "Wancai", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.IR" + ], + "citation_count":1793.0, + "inf_cite_count":335.0, + "publication_date":"2020-12-14" + }, + { + "id":"1302.6815", + "submitter":"David Heckerman", + "authors":"David Heckerman, Dan Geiger, David Maxwell Chickering", + "title":"Learning Bayesian Networks: The Combination of Knowledge and Statistical\n Data", + "comments":"Appears in Proceedings of the Tenth Conference on Uncertainty in\n Artificial Intelligence (UAI1994)", + "journal-ref":null, + "doi":null, + "report-no":"UAI-P-1994-PG-293-301", + "categories":"cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We describe algorithms for learning Bayesian networks from a combination of\nuser knowledge and statistical data. The algorithms have two components: a\nscoring metric and a search procedure. The scoring metric takes a network\nstructure, statistical data, and a user's prior knowledge, and returns a score\nproportional to the posterior probability of the network structure given the\ndata. The search procedure generates networks for evaluation by the scoring\nmetric. Our contributions are threefold. First, we identify two important\nproperties of metrics, which we call event equivalence and parameter\nmodularity. These properties have been mostly ignored, but when combined,\ngreatly simplify the encoding of a user's prior knowledge. In particular, a\nuser can express her knowledge-for the most part-as a single prior Bayesian\nnetwork for the domain. Second, we describe local search and annealing\nalgorithms to be used in conjunction with scoring metrics. In the special case\nwhere each node has at most one parent, we show that heuristic search can be\nreplaced with a polynomial algorithm to identify the networks with the highest\nscore. 
Third, we describe a methodology for evaluating Bayesian-network\nlearning algorithms. We apply this approach to a comparison of metrics and\nsearch procedures.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 27 Feb 2013 14:16:50 GMT" + }, + { + "version":"v2", + "created":"Sat, 16 May 2015 23:46:48 GMT" + } + ], + "update_date":"2015-05-19", + "authors_parsed":[ + [ + "Heckerman", + "David", + "" + ], + [ + "Geiger", + "Dan", + "" + ], + [ + "Chickering", + "David Maxwell", + "" + ] + ], + "categories_split":[ + "cs.AI" + ], + "citation_count":3888.0, + "inf_cite_count":334.0, + "publication_date":"1994-07-29" + }, + { + "id":"1206.7051", + "submitter":"David Blei", + "authors":"Matt Hoffman, David M. Blei, Chong Wang, John Paisley", + "title":"Stochastic Variational Inference", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"stat.ML cs.AI stat.CO stat.ME", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We develop stochastic variational inference, a scalable algorithm for\napproximating posterior distributions. We develop this technique for a large\nclass of probabilistic models and we demonstrate it with two probabilistic\ntopic models, latent Dirichlet allocation and the hierarchical Dirichlet\nprocess topic model. Using stochastic variational inference, we analyze several\nlarge collections of documents: 300K articles from Nature, 1.8M articles from\nThe New York Times, and 3.8M articles from Wikipedia. Stochastic inference can\neasily handle data sets of this size and outperforms traditional variational\ninference, which can only handle a smaller subset. (We also show that the\nBayesian nonparametric topic model outperforms its parametric counterpart.)\nStochastic variational inference lets us apply complex Bayesian models to\nmassive data sets.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 29 Jun 2012 15:23:11 GMT" + }, + { + "version":"v2", + "created":"Thu, 18 Apr 2013 15:40:02 GMT" + }, + { + "version":"v3", + "created":"Mon, 22 Apr 2013 20:23:40 GMT" + } + ], + "update_date":"2013-04-24", + "authors_parsed":[ + [ + "Hoffman", + "Matt", + "" + ], + [ + "Blei", + "David M.", + "" + ], + [ + "Wang", + "Chong", + "" + ], + [ + "Paisley", + "John", + "" + ] + ], + "categories_split":[ + "stat.ML", + "cs.AI", + "stat.CO", + "stat.ME" + ], + "citation_count":2395.0, + "inf_cite_count":327.0, + "publication_date":"2012-06-29" + }, + { + "id":"1703.00848", + "submitter":"Ming-Yu Liu", + "authors":"Ming-Yu Liu and Thomas Breuel and Jan Kautz", + "title":"Unsupervised Image-to-Image Translation Networks", + "comments":"NIPS 2017, 11 pages, 6 figures", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Unsupervised image-to-image translation aims at learning a joint distribution\nof images in different domains by using images from the marginal distributions\nin individual domains. Since there exists an infinite set of joint\ndistributions that can arrive the given marginal distributions, one could infer\nnothing about the joint distribution from the marginal distributions without\nadditional assumptions. To address the problem, we make a shared-latent space\nassumption and propose an unsupervised image-to-image translation framework\nbased on Coupled GANs. 
We compare the proposed framework with competing\napproaches and present high quality image translation results on various\nchallenging unsupervised image translation tasks, including street scene image\ntranslation, animal image translation, and face image translation. We also\napply the proposed framework to domain adaptation and achieve state-of-the-art\nperformance on benchmark datasets. Code and additional results are available in\nhttps:\/\/github.com\/mingyuliutw\/unit .\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 2 Mar 2017 16:29:30 GMT" + }, + { + "version":"v2", + "created":"Tue, 3 Oct 2017 17:55:21 GMT" + }, + { + "version":"v3", + "created":"Fri, 6 Oct 2017 03:14:21 GMT" + }, + { + "version":"v4", + "created":"Mon, 9 Oct 2017 18:14:27 GMT" + }, + { + "version":"v5", + "created":"Thu, 15 Feb 2018 15:33:48 GMT" + }, + { + "version":"v6", + "created":"Mon, 23 Jul 2018 03:39:28 GMT" + } + ], + "update_date":"2018-07-24", + "authors_parsed":[ + [ + "Liu", + "Ming-Yu", + "" + ], + [ + "Breuel", + "Thomas", + "" + ], + [ + "Kautz", + "Jan", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI" + ], + "citation_count":2468.0, + "inf_cite_count":323.0, + "publication_date":"2017-03-02" + }, + { + "id":"2302.05543", + "submitter":"Lvmin Zhang", + "authors":"Lvmin Zhang and Anyi Rao and Maneesh Agrawala", + "title":"Adding Conditional Control to Text-to-Image Diffusion Models", + "comments":"Codes and Supplementary Material:\n https:\/\/github.com\/lllyasviel\/ControlNet", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.GR cs.HC cs.MM", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We present ControlNet, a neural network architecture to add spatial\nconditioning controls to large, pretrained text-to-image diffusion models.\nControlNet locks the production-ready large diffusion models, and reuses their\ndeep and robust encoding layers pretrained with billions of images as a strong\nbackbone to learn a diverse set of conditional controls. The neural\narchitecture is connected with \"zero convolutions\" (zero-initialized\nconvolution layers) that progressively grow the parameters from zero and ensure\nthat no harmful noise could affect the finetuning. We test various conditioning\ncontrols, eg, edges, depth, segmentation, human pose, etc, with Stable\nDiffusion, using single or multiple conditions, with or without prompts. We\nshow that the training of ControlNets is robust with small (<50k) and large\n(>1m) datasets. Extensive results show that ControlNet may facilitate wider\napplications to control image diffusion models.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 10 Feb 2023 23:12:37 GMT" + }, + { + "version":"v2", + "created":"Sat, 2 Sep 2023 11:39:28 GMT" + }, + { + "version":"v3", + "created":"Sun, 26 Nov 2023 22:26:12 GMT" + } + ], + "update_date":"2023-11-28", + "authors_parsed":[ + [ + "Zhang", + "Lvmin", + "" + ], + [ + "Rao", + "Anyi", + "" + ], + [ + "Agrawala", + "Maneesh", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.GR", + "cs.HC", + "cs.MM" + ], + "citation_count":1450.0, + "inf_cite_count":318.0, + "publication_date":"2023-02-10" + }, + { + "id":"1612.00563", + "submitter":"Steven Rennie", + "authors":"Steven J. 
Rennie, Etienne Marcheret, Youssef Mroueh, Jarret Ross and\n Vaibhava Goel", + "title":"Self-critical Sequence Training for Image Captioning", + "comments":"CVPR 2017 + additional analysis + fixed baseline results, 16 pages", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.CV", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Recently it has been shown that policy-gradient methods for reinforcement\nlearning can be utilized to train deep end-to-end systems directly on\nnon-differentiable metrics for the task at hand. In this paper we consider the\nproblem of optimizing image captioning systems using reinforcement learning,\nand show that by carefully optimizing our systems using the test metrics of the\nMSCOCO task, significant gains in performance can be realized. Our systems are\nbuilt using a new optimization approach that we call self-critical sequence\ntraining (SCST). SCST is a form of the popular REINFORCE algorithm that, rather\nthan estimating a \"baseline\" to normalize the rewards and reduce variance,\nutilizes the output of its own test-time inference algorithm to normalize the\nrewards it experiences. Using this approach, estimating the reward signal (as\nactor-critic methods must do) and estimating normalization (as REINFORCE\nalgorithms typically do) is avoided, while at the same time harmonizing the\nmodel with respect to its test-time inference procedure. Empirically we find\nthat directly optimizing the CIDEr metric with SCST and greedy decoding at\ntest-time is highly effective. Our results on the MSCOCO evaluation sever\nestablish a new state-of-the-art on the task, improving the best result in\nterms of CIDEr from 104.9 to 114.7.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 2 Dec 2016 04:37:22 GMT" + }, + { + "version":"v2", + "created":"Thu, 16 Nov 2017 02:38:37 GMT" + } + ], + "update_date":"2017-11-17", + "authors_parsed":[ + [ + "Rennie", + "Steven J.", + "" + ], + [ + "Marcheret", + "Etienne", + "" + ], + [ + "Mroueh", + "Youssef", + "" + ], + [ + "Ross", + "Jarret", + "" + ], + [ + "Goel", + "Vaibhava", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.CV" + ], + "citation_count":1687.0, + "inf_cite_count":318.0, + "publication_date":"2016-12-02" + }, + { + "id":"1607.06520", + "submitter":"Tolga Bolukbasi", + "authors":"Tolga Bolukbasi, Kai-Wei Chang, James Zou, Venkatesh Saligrama, Adam\n Kalai", + "title":"Man is to Computer Programmer as Woman is to Homemaker? Debiasing Word\n Embeddings", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI cs.LG stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" The blind application of machine learning runs the risk of amplifying biases\npresent in data. Such a danger is facing us with word embedding, a popular\nframework to represent text data as vectors which has been used in many machine\nlearning and natural language processing tasks. We show that even word\nembeddings trained on Google News articles exhibit female\/male gender\nstereotypes to a disturbing extent. This raises concerns because their\nwidespread use, as we describe, often tends to amplify these biases.\nGeometrically, gender bias is first shown to be captured by a direction in the\nword embedding. Second, gender neutral words are shown to be linearly separable\nfrom gender definition words in the word embedding. 
Using these properties, we\nprovide a methodology for modifying an embedding to remove gender stereotypes,\nsuch as the association between between the words receptionist and female,\nwhile maintaining desired associations such as between the words queen and\nfemale. We define metrics to quantify both direct and indirect gender biases in\nembeddings, and develop algorithms to \"debias\" the embedding. Using\ncrowd-worker evaluation as well as standard benchmarks, we empirically\ndemonstrate that our algorithms significantly reduce gender bias in embeddings\nwhile preserving the its useful properties such as the ability to cluster\nrelated concepts and to solve analogy tasks. The resulting embeddings can be\nused in applications without amplifying gender bias.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 21 Jul 2016 22:26:20 GMT" + } + ], + "update_date":"2016-07-25", + "authors_parsed":[ + [ + "Bolukbasi", + "Tolga", + "" + ], + [ + "Chang", + "Kai-Wei", + "" + ], + [ + "Zou", + "James", + "" + ], + [ + "Saligrama", + "Venkatesh", + "" + ], + [ + "Kalai", + "Adam", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI", + "cs.LG", + "stat.ML" + ], + "citation_count":2520.0, + "inf_cite_count":315.0, + "publication_date":"2016-07-21" + }, + { + "id":"1610.08401", + "submitter":"Seyed-Mohsen Moosavi-Dezfooli", + "authors":"Seyed-Mohsen Moosavi-Dezfooli, Alhussein Fawzi, Omar Fawzi, Pascal\n Frossard", + "title":"Universal adversarial perturbations", + "comments":"Accepted at IEEE Conference on Computer Vision and Pattern\n Recognition (CVPR), 2017", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Given a state-of-the-art deep neural network classifier, we show the\nexistence of a universal (image-agnostic) and very small perturbation vector\nthat causes natural images to be misclassified with high probability. We\npropose a systematic algorithm for computing universal perturbations, and show\nthat state-of-the-art deep neural networks are highly vulnerable to such\nperturbations, albeit being quasi-imperceptible to the human eye. We further\nempirically analyze these universal perturbations and show, in particular, that\nthey generalize very well across neural networks. The surprising existence of\nuniversal perturbations reveals important geometric correlations among the\nhigh-dimensional decision boundary of classifiers. 
It further outlines\npotential security breaches with the existence of single directions in the\ninput space that adversaries can possibly exploit to break a classifier on most\nnatural images.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 26 Oct 2016 16:30:45 GMT" + }, + { + "version":"v2", + "created":"Thu, 17 Nov 2016 07:15:00 GMT" + }, + { + "version":"v3", + "created":"Thu, 9 Mar 2017 17:01:25 GMT" + } + ], + "update_date":"2017-03-10", + "authors_parsed":[ + [ + "Moosavi-Dezfooli", + "Seyed-Mohsen", + "" + ], + [ + "Fawzi", + "Alhussein", + "" + ], + [ + "Fawzi", + "Omar", + "" + ], + [ + "Frossard", + "Pascal", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG", + "stat.ML" + ], + "citation_count":2253.0, + "inf_cite_count":313.0, + "publication_date":"2016-10-26" + }, + { + "id":"1703.04730", + "submitter":"Pang Wei Koh", + "authors":"Pang Wei Koh and Percy Liang", + "title":"Understanding Black-box Predictions via Influence Functions", + "comments":"International Conference on Machine Learning, 2017. (This version\n adds more historical references and fixes typos.)", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"stat.ML cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" How can we explain the predictions of a black-box model? In this paper, we\nuse influence functions -- a classic technique from robust statistics -- to\ntrace a model's prediction through the learning algorithm and back to its\ntraining data, thereby identifying training points most responsible for a given\nprediction. To scale up influence functions to modern machine learning\nsettings, we develop a simple, efficient implementation that requires only\noracle access to gradients and Hessian-vector products. We show that even on\nnon-convex and non-differentiable models where the theory breaks down,\napproximations to influence functions can still provide valuable information.\nOn linear models and convolutional neural networks, we demonstrate that\ninfluence functions are useful for multiple purposes: understanding model\nbehavior, debugging models, detecting dataset errors, and even creating\nvisually-indistinguishable training-set attacks.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 14 Mar 2017 21:07:01 GMT" + }, + { + "version":"v2", + "created":"Mon, 10 Jul 2017 02:31:54 GMT" + }, + { + "version":"v3", + "created":"Tue, 29 Dec 2020 22:40:43 GMT" + } + ], + "update_date":"2021-01-01", + "authors_parsed":[ + [ + "Koh", + "Pang Wei", + "" + ], + [ + "Liang", + "Percy", + "" + ] + ], + "categories_split":[ + "stat.ML", + "cs.AI", + "cs.LG" + ], + "citation_count":2285.0, + "inf_cite_count":311.0, + "publication_date":"2017-03-14" + }, + { + "id":"1901.07031", + "submitter":"Jeremy Irvin", + "authors":"Jeremy Irvin, Pranav Rajpurkar, Michael Ko, Yifan Yu, Silviana\n Ciurea-Ilcus, Chris Chute, Henrik Marklund, Behzad Haghgoo, Robyn Ball, Katie\n Shpanskaya, Jayne Seekins, David A. Mong, Safwan S. Halabi, Jesse K.\n Sandberg, Ricky Jones, David B. Larson, Curtis P. Langlotz, Bhavik N. Patel,\n Matthew P. Lungren, Andrew Y. 
Ng", + "title":"CheXpert: A Large Chest Radiograph Dataset with Uncertainty Labels and\n Expert Comparison", + "comments":"Published in AAAI 2019", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG eess.IV", + "license":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/", + "abstract":" Large, labeled datasets have driven deep learning methods to achieve\nexpert-level performance on a variety of medical imaging tasks. We present\nCheXpert, a large dataset that contains 224,316 chest radiographs of 65,240\npatients. We design a labeler to automatically detect the presence of 14\nobservations in radiology reports, capturing uncertainties inherent in\nradiograph interpretation. We investigate different approaches to using the\nuncertainty labels for training convolutional neural networks that output the\nprobability of these observations given the available frontal and lateral\nradiographs. On a validation set of 200 chest radiographic studies which were\nmanually annotated by 3 board-certified radiologists, we find that different\nuncertainty approaches are useful for different pathologies. We then evaluate\nour best model on a test set composed of 500 chest radiographic studies\nannotated by a consensus of 5 board-certified radiologists, and compare the\nperformance of our model to that of 3 additional radiologists in the detection\nof 5 selected pathologies. On Cardiomegaly, Edema, and Pleural Effusion, the\nmodel ROC and PR curves lie above all 3 radiologist operating points. We\nrelease the dataset to the public as a standard benchmark to evaluate\nperformance of chest radiograph interpretation models.\n The dataset is freely available at\nhttps:\/\/stanfordmlgroup.github.io\/competitions\/chexpert .\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 21 Jan 2019 18:41:59 GMT" + } + ], + "update_date":"2019-01-23", + "authors_parsed":[ + [ + "Irvin", + "Jeremy", + "" + ], + [ + "Rajpurkar", + "Pranav", + "" + ], + [ + "Ko", + "Michael", + "" + ], + [ + "Yu", + "Yifan", + "" + ], + [ + "Ciurea-Ilcus", + "Silviana", + "" + ], + [ + "Chute", + "Chris", + "" + ], + [ + "Marklund", + "Henrik", + "" + ], + [ + "Haghgoo", + "Behzad", + "" + ], + [ + "Ball", + "Robyn", + "" + ], + [ + "Shpanskaya", + "Katie", + "" + ], + [ + "Seekins", + "Jayne", + "" + ], + [ + "Mong", + "David A.", + "" + ], + [ + "Halabi", + "Safwan S.", + "" + ], + [ + "Sandberg", + "Jesse K.", + "" + ], + [ + "Jones", + "Ricky", + "" + ], + [ + "Larson", + "David B.", + "" + ], + [ + "Langlotz", + "Curtis P.", + "" + ], + [ + "Patel", + "Bhavik N.", + "" + ], + [ + "Lungren", + "Matthew P.", + "" + ], + [ + "Ng", + "Andrew Y.", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG", + "eess.IV" + ], + "citation_count":1895.0, + "inf_cite_count":311.0, + "publication_date":"2019-01-21" + }, + { + "id":"1708.06519", + "submitter":"Zhuang Liu", + "authors":"Zhuang Liu and Jianguo Li and Zhiqiang Shen and Gao Huang and Shoumeng\n Yan and Changshui Zhang", + "title":"Learning Efficient Convolutional Networks through Network Slimming", + "comments":"Accepted by ICCV 2017", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" The deployment of deep convolutional neural networks (CNNs) in many real\nworld applications is largely hindered by their high computational cost. 
In\nthis paper, we propose a novel learning scheme for CNNs to simultaneously 1)\nreduce the model size; 2) decrease the run-time memory footprint; and 3) lower\nthe number of computing operations, without compromising accuracy. This is\nachieved by enforcing channel-level sparsity in the network in a simple but\neffective way. Different from many existing approaches, the proposed method\ndirectly applies to modern CNN architectures, introduces minimum overhead to\nthe training process, and requires no special software\/hardware accelerators\nfor the resulting models. We call our approach network slimming, which takes\nwide and large networks as input models, but during training insignificant\nchannels are automatically identified and pruned afterwards, yielding thin and\ncompact models with comparable accuracy. We empirically demonstrate the\neffectiveness of our approach with several state-of-the-art CNN models,\nincluding VGGNet, ResNet and DenseNet, on various image classification\ndatasets. For VGGNet, a multi-pass version of network slimming gives a 20x\nreduction in model size and a 5x reduction in computing operations.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 22 Aug 2017 07:35:26 GMT" + } + ], + "update_date":"2017-08-23", + "authors_parsed":[ + [ + "Liu", + "Zhuang", + "" + ], + [ + "Li", + "Jianguo", + "" + ], + [ + "Shen", + "Zhiqiang", + "" + ], + [ + "Huang", + "Gao", + "" + ], + [ + "Yan", + "Shoumeng", + "" + ], + [ + "Zhang", + "Changshui", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG" + ], + "citation_count":2002.0, + "inf_cite_count":308.0, + "publication_date":"2017-08-22" + }, + { + "id":"2304.08485", + "submitter":"Haotian Liu", + "authors":"Haotian Liu, Chunyuan Li, Qingyang Wu, Yong Jae Lee", + "title":"Visual Instruction Tuning", + "comments":"NeurIPS 2023 Oral; project page: https:\/\/llava-vl.github.io\/", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.CL cs.LG", + "license":"http:\/\/creativecommons.org\/licenses\/by\/4.0\/", + "abstract":" Instruction tuning large language models (LLMs) using machine-generated\ninstruction-following data has improved zero-shot capabilities on new tasks,\nbut the idea is less explored in the multimodal field. In this paper, we\npresent the first attempt to use language-only GPT-4 to generate multimodal\nlanguage-image instruction-following data. By instruction tuning on such\ngenerated data, we introduce LLaVA: Large Language and Vision Assistant, an\nend-to-end trained large multimodal model that connects a vision encoder and\nLLM for general-purpose visual and language understanding.Our early experiments\nshow that LLaVA demonstrates impressive multimodel chat abilities, sometimes\nexhibiting the behaviors of multimodal GPT-4 on unseen images\/instructions, and\nyields a 85.1% relative score compared with GPT-4 on a synthetic multimodal\ninstruction-following dataset. When fine-tuned on Science QA, the synergy of\nLLaVA and GPT-4 achieves a new state-of-the-art accuracy of 92.53%. 
We make\nGPT-4 generated visual instruction tuning data, our model and code base\npublicly available.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 17 Apr 2023 17:59:25 GMT" + }, + { + "version":"v2", + "created":"Mon, 11 Dec 2023 17:46:14 GMT" + } + ], + "update_date":"2023-12-14", + "authors_parsed":[ + [ + "Liu", + "Haotian", + "" + ], + [ + "Li", + "Chunyuan", + "" + ], + [ + "Wu", + "Qingyang", + "" + ], + [ + "Lee", + "Yong Jae", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.CL", + "cs.LG" + ], + "citation_count":966.0, + "inf_cite_count":308.0, + "publication_date":"2023-04-17" + }, + { + "id":"1109.6051", + "submitter":"M. Helmert", + "authors":"M. Helmert", + "title":"The Fast Downward Planning System", + "comments":null, + "journal-ref":"Journal Of Artificial Intelligence Research, Volume 26, pages\n 191-246, 2006", + "doi":"10.1613\/jair.1705", + "report-no":null, + "categories":"cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Fast Downward is a classical planning system based on heuristic search. It\ncan deal with general deterministic planning problems encoded in the\npropositional fragment of PDDL2.2, including advanced features like ADL\nconditions and effects and derived predicates (axioms). Like other well-known\nplanners such as HSP and FF, Fast Downward is a progression planner, searching\nthe space of world states of a planning task in the forward direction. However,\nunlike other PDDL planning systems, Fast Downward does not use the\npropositional PDDL representation of a planning task directly. Instead, the\ninput is first translated into an alternative representation called\nmulti-valued planning tasks, which makes many of the implicit constraints of a\npropositional planning task explicit. Exploiting this alternative\nrepresentation, Fast Downward uses hierarchical decompositions of planning\ntasks for computing its heuristic function, called the causal graph heuristic,\nwhich is very different from traditional HSP-like heuristics based on ignoring\nnegative interactions of operators.\n In this article, we give a full account of Fast Downwards approach to solving\nmulti-valued planning tasks. We extend our earlier discussion of the causal\ngraph heuristic to tasks involving axioms and conditional effects and present\nsome novel techniques for search control that are used within Fast Downwards\nbest-first search algorithm: preferred operators transfer the idea of helpful\nactions from local search to global best-first search, deferred evaluation of\nheuristic functions mitigates the negative effect of large branching factors on\nsearch performance, and multi-heuristic best-first search combines several\nheuristic evaluation functions within a single search algorithm in an\northogonal way. We also describe efficient data structures for fast state\nexpansion (successor generators and axiom evaluators) and present a new\nnon-heuristic search algorithm called focused iterative-broadening search,\nwhich utilizes the information encoded in causal graphs in a novel way.\n Fast Downward has proven remarkably successful: It won the \"classical (i.e.,\npropositional, non-optimising) track of the 4th International Planning\nCompetition at ICAPS 2004, following in the footsteps of planners such as FF\nand LPG. 
Our experiments show that it also performs very well on the benchmarks\nof the earlier planning competitions and provide some insights about the\nusefulness of the new search enhancements.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 27 Sep 2011 22:04:43 GMT" + } + ], + "update_date":"2011-09-29", + "authors_parsed":[ + [ + "Helmert", + "M.", + "" + ] + ], + "categories_split":[ + "cs.AI" + ], + "citation_count":1702.0, + "inf_cite_count":297.0, + "publication_date":"2006-05-01" + }, + { + "id":"2105.01601", + "submitter":"Ilya Tolstikhin", + "authors":"Ilya Tolstikhin and Neil Houlsby and Alexander Kolesnikov and Lucas\n Beyer and Xiaohua Zhai and Thomas Unterthiner and Jessica Yung and Andreas\n Steiner and Daniel Keysers and Jakob Uszkoreit and Mario Lucic and Alexey\n Dosovitskiy", + "title":"MLP-Mixer: An all-MLP Architecture for Vision", + "comments":"v2: Fixed parameter counts in Table 1. v3: Added results on JFT-3B in\n Figure 2(right); Added Section 3.4 on the input permutations. v4: Updated the\n x label in Figure 2(right)", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Convolutional Neural Networks (CNNs) are the go-to model for computer vision.\nRecently, attention-based networks, such as the Vision Transformer, have also\nbecome popular. In this paper we show that while convolutions and attention are\nboth sufficient for good performance, neither of them are necessary. We present\nMLP-Mixer, an architecture based exclusively on multi-layer perceptrons (MLPs).\nMLP-Mixer contains two types of layers: one with MLPs applied independently to\nimage patches (i.e. \"mixing\" the per-location features), and one with MLPs\napplied across patches (i.e. \"mixing\" spatial information). When trained on\nlarge datasets, or with modern regularization schemes, MLP-Mixer attains\ncompetitive scores on image classification benchmarks, with pre-training and\ninference cost comparable to state-of-the-art models. 
We hope that these\nresults spark further research beyond the realms of well established CNNs and\nTransformers.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 4 May 2021 16:17:21 GMT" + }, + { + "version":"v2", + "created":"Mon, 17 May 2021 12:48:26 GMT" + }, + { + "version":"v3", + "created":"Thu, 10 Jun 2021 09:50:52 GMT" + }, + { + "version":"v4", + "created":"Fri, 11 Jun 2021 09:36:50 GMT" + } + ], + "update_date":"2021-06-14", + "authors_parsed":[ + [ + "Tolstikhin", + "Ilya", + "" + ], + [ + "Houlsby", + "Neil", + "" + ], + [ + "Kolesnikov", + "Alexander", + "" + ], + [ + "Beyer", + "Lucas", + "" + ], + [ + "Zhai", + "Xiaohua", + "" + ], + [ + "Unterthiner", + "Thomas", + "" + ], + [ + "Yung", + "Jessica", + "" + ], + [ + "Steiner", + "Andreas", + "" + ], + [ + "Keysers", + "Daniel", + "" + ], + [ + "Uszkoreit", + "Jakob", + "" + ], + [ + "Lucic", + "Mario", + "" + ], + [ + "Dosovitskiy", + "Alexey", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG" + ], + "citation_count":1709.0, + "inf_cite_count":294.0, + "publication_date":"2021-05-04" + }, + { + "id":"1810.12894", + "submitter":"Yuri Burda", + "authors":"Yuri Burda, Harrison Edwards, Amos Storkey, Oleg Klimov", + "title":"Exploration by Random Network Distillation", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We introduce an exploration bonus for deep reinforcement learning methods\nthat is easy to implement and adds minimal overhead to the computation\nperformed. The bonus is the error of a neural network predicting features of\nthe observations given by a fixed randomly initialized neural network. We also\nintroduce a method to flexibly combine intrinsic and extrinsic rewards. We find\nthat the random network distillation (RND) bonus combined with this increased\nflexibility enables significant progress on several hard exploration Atari\ngames. In particular we establish state of the art performance on Montezuma's\nRevenge, a game famously difficult for deep reinforcement learning methods. To\nthe best of our knowledge, this is the first method that achieves better than\naverage human performance on this game without using demonstrations or having\naccess to the underlying state of the game, and occasionally completes the\nfirst level.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 30 Oct 2018 17:44:42 GMT" + } + ], + "update_date":"2018-10-31", + "authors_parsed":[ + [ + "Burda", + "Yuri", + "" + ], + [ + "Edwards", + "Harrison", + "" + ], + [ + "Storkey", + "Amos", + "" + ], + [ + "Klimov", + "Oleg", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "stat.ML" + ], + "citation_count":1027.0, + "inf_cite_count":294.0, + "publication_date":"2018-09-27" + }, + { + "id":"1607.01719", + "submitter":"Baochen Sun", + "authors":"Baochen Sun, Kate Saenko", + "title":"Deep CORAL: Correlation Alignment for Deep Domain Adaptation", + "comments":"Extended Abstract", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG cs.NE", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Deep neural networks are able to learn powerful representations from large\nquantities of labeled input data, however they cannot always generalize well\nacross changes in input distributions. 
Domain adaptation algorithms have been\nproposed to compensate for the degradation in performance due to domain shift.\nIn this paper, we address the case when the target domain is unlabeled,\nrequiring unsupervised adaptation. CORAL is a \"frustratingly easy\" unsupervised\ndomain adaptation method that aligns the second-order statistics of the source\nand target distributions with a linear transformation. Here, we extend CORAL to\nlearn a nonlinear transformation that aligns correlations of layer activations\nin deep neural networks (Deep CORAL). Experiments on standard benchmark\ndatasets show state-of-the-art performance.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 6 Jul 2016 17:35:55 GMT" + } + ], + "update_date":"2016-07-07", + "authors_parsed":[ + [ + "Sun", + "Baochen", + "" + ], + [ + "Saenko", + "Kate", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG", + "cs.NE" + ], + "citation_count":2430.0, + "inf_cite_count":293.0, + "publication_date":"2016-07-06" + }, + { + "id":"1710.02298", + "submitter":"Matteo Hessel", + "authors":"Matteo Hessel, Joseph Modayil, Hado van Hasselt, Tom Schaul, Georg\n Ostrovski, Will Dabney, Dan Horgan, Bilal Piot, Mohammad Azar, David Silver", + "title":"Rainbow: Combining Improvements in Deep Reinforcement Learning", + "comments":"Under review as a conference paper at AAAI 2018", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" The deep reinforcement learning community has made several independent\nimprovements to the DQN algorithm. However, it is unclear which of these\nextensions are complementary and can be fruitfully combined. This paper\nexamines six extensions to the DQN algorithm and empirically studies their\ncombination. Our experiments show that the combination provides\nstate-of-the-art performance on the Atari 2600 benchmark, both in terms of data\nefficiency and final performance. We also provide results from a detailed\nablation study that shows the contribution of each component to overall\nperformance.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 6 Oct 2017 07:45:46 GMT" + } + ], + "update_date":"2017-10-09", + "authors_parsed":[ + [ + "Hessel", + "Matteo", + "" + ], + [ + "Modayil", + "Joseph", + "" + ], + [ + "van Hasselt", + "Hado", + "" + ], + [ + "Schaul", + "Tom", + "" + ], + [ + "Ostrovski", + "Georg", + "" + ], + [ + "Dabney", + "Will", + "" + ], + [ + "Horgan", + "Dan", + "" + ], + [ + "Piot", + "Bilal", + "" + ], + [ + "Azar", + "Mohammad", + "" + ], + [ + "Silver", + "David", + "" + ] + ], + "categories_split":[ + "cs.AI", + "cs.LG" + ], + "citation_count":1852.0, + "inf_cite_count":291.0, + "publication_date":"2017-10-06" + }, + { + "id":"2103.03230", + "submitter":"Jure Zbontar", + "authors":"Jure Zbontar, Li Jing, Ishan Misra, Yann LeCun, St\\'ephane Deny", + "title":"Barlow Twins: Self-Supervised Learning via Redundancy Reduction", + "comments":"13 pages, 6 figures, to appear at ICML 2021", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG q-bio.NC", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Self-supervised learning (SSL) is rapidly closing the gap with supervised\nmethods on large computer vision benchmarks. 
A successful approach to SSL is to\nlearn embeddings which are invariant to distortions of the input sample.\nHowever, a recurring issue with this approach is the existence of trivial\nconstant solutions. Most current methods avoid such solutions by careful\nimplementation details. We propose an objective function that naturally avoids\ncollapse by measuring the cross-correlation matrix between the outputs of two\nidentical networks fed with distorted versions of a sample, and making it as\nclose to the identity matrix as possible. This causes the embedding vectors of\ndistorted versions of a sample to be similar, while minimizing the redundancy\nbetween the components of these vectors. The method is called Barlow Twins,\nowing to neuroscientist H. Barlow's redundancy-reduction principle applied to a\npair of identical networks. Barlow Twins does not require large batches nor\nasymmetry between the network twins such as a predictor network, gradient\nstopping, or a moving average on the weight updates. Intriguingly it benefits\nfrom very high-dimensional output vectors. Barlow Twins outperforms previous\nmethods on ImageNet for semi-supervised classification in the low-data regime,\nand is on par with current state of the art for ImageNet classification with a\nlinear classifier head, and for transfer tasks of classification and object\ndetection.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 4 Mar 2021 18:55:09 GMT" + }, + { + "version":"v2", + "created":"Mon, 3 May 2021 09:36:29 GMT" + }, + { + "version":"v3", + "created":"Mon, 14 Jun 2021 14:09:43 GMT" + } + ], + "update_date":"2021-06-15", + "authors_parsed":[ + [ + "Zbontar", + "Jure", + "" + ], + [ + "Jing", + "Li", + "" + ], + [ + "Misra", + "Ishan", + "" + ], + [ + "LeCun", + "Yann", + "" + ], + [ + "Deny", + "St\u00e9phane", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG", + "q-bio.NC" + ], + "citation_count":1621.0, + "inf_cite_count":286.0, + "publication_date":"2021-03-04" + }, + { + "id":"cs\/9501101", + "submitter":null, + "authors":"T. G. Dietterich, G. Bakiri", + "title":"Solving Multiclass Learning Problems via Error-Correcting Output Codes", + "comments":"See http:\/\/www.jair.org\/ for any accompanying files", + "journal-ref":"Journal of Artificial Intelligence Research, Vol 2, (1995),\n 263-286", + "doi":null, + "report-no":null, + "categories":"cs.AI", + "license":null, + "abstract":" Multiclass learning problems involve finding a definition for an unknown\nfunction f(x) whose range is a discrete set containing k > 2 values (i.e., k\n``classes''). The definition is acquired by studying collections of training\nexamples of the form [x_i, f (x_i)]. Existing approaches to multiclass learning\nproblems include direct application of multiclass algorithms such as the\ndecision-tree algorithms C4.5 and CART, application of binary concept learning\nalgorithms to learn individual binary functions for each of the k classes, and\napplication of binary concept learning algorithms with distributed output\nrepresentations. This paper compares these three approaches to a new technique\nin which error-correcting codes are employed as a distributed output\nrepresentation. We show that these output representations improve the\ngeneralization performance of both C4.5 and backpropagation on a wide range of\nmulticlass learning tasks. 
We also demonstrate that this approach is robust\nwith respect to changes in the size of the training sample, the assignment of\ndistributed representations to particular classes, and the application of\noverfitting avoidance techniques such as decision-tree pruning. Finally, we\nshow that---like the other methods---the error-correcting code technique can\nprovide reliable class probability estimates. Taken together, these results\ndemonstrate that error-correcting output codes provide a general-purpose method\nfor improving the performance of inductive learning programs on multiclass\nproblems.\n", + "versions":[ + { + "version":"v1", + "created":"Sun, 1 Jan 1995 00:00:00 GMT" + } + ], + "update_date":"2014-11-17", + "authors_parsed":[ + [ + "Dietterich", + "T. G.", + "" + ], + [ + "Bakiri", + "G.", + "" + ] + ], + "categories_split":[ + "cs.AI" + ], + "citation_count":2979.0, + "inf_cite_count":286.0, + "publication_date":"1994-08-01" + }, + { + "id":"1705.05363", + "submitter":"Deepak Pathak", + "authors":"Deepak Pathak, Pulkit Agrawal, Alexei A. Efros, Trevor Darrell", + "title":"Curiosity-driven Exploration by Self-supervised Prediction", + "comments":"In ICML 2017. Website at https:\/\/pathak22.github.io\/noreward-rl\/", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.CV cs.RO stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" In many real-world scenarios, rewards extrinsic to the agent are extremely\nsparse, or absent altogether. In such cases, curiosity can serve as an\nintrinsic reward signal to enable the agent to explore its environment and\nlearn skills that might be useful later in its life. We formulate curiosity as\nthe error in an agent's ability to predict the consequence of its own actions\nin a visual feature space learned by a self-supervised inverse dynamics model.\nOur formulation scales to high-dimensional continuous state spaces like images,\nbypasses the difficulties of directly predicting pixels, and, critically,\nignores the aspects of the environment that cannot affect the agent. The\nproposed approach is evaluated in two environments: VizDoom and Super Mario\nBros. Three broad settings are investigated: 1) sparse extrinsic reward, where\ncuriosity allows for far fewer interactions with the environment to reach the\ngoal; 2) exploration with no extrinsic reward, where curiosity pushes the agent\nto explore more efficiently; and 3) generalization to unseen scenarios (e.g.\nnew levels of the same game) where the knowledge gained from earlier experience\nhelps the agent explore new places much faster than starting from scratch. 
Demo\nvideo and code available at https:\/\/pathak22.github.io\/noreward-rl\/\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 15 May 2017 17:56:22 GMT" + } + ], + "update_date":"2017-05-16", + "authors_parsed":[ + [ + "Pathak", + "Deepak", + "" + ], + [ + "Agrawal", + "Pulkit", + "" + ], + [ + "Efros", + "Alexei A.", + "" + ], + [ + "Darrell", + "Trevor", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.CV", + "cs.RO", + "stat.ML" + ], + "citation_count":2036.0, + "inf_cite_count":285.0, + "publication_date":"2017-05-15" + }, + { + "id":"1301.2300", + "submitter":"Judea Pearl", + "authors":"Judea Pearl", + "title":"Direct and Indirect Effects", + "comments":"Appears in Proceedings of the Seventeenth Conference on Uncertainty\n in Artificial Intelligence (UAI2001)", + "journal-ref":null, + "doi":null, + "report-no":"UAI-P-2001-PG-411-420", + "categories":"cs.AI stat.ME", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" The direct effect of one eventon another can be defined and measured\nbyholding constant all intermediate variables between the two.Indirect effects\npresent conceptual andpractical difficulties (in nonlinear models), because\nthey cannot be isolated by holding certain variablesconstant. This paper shows\na way of defining any path-specific effectthat does not invoke blocking the\nremainingpaths.This permits the assessment of a more naturaltype of direct and\nindirect effects, one thatis applicable in both linear and nonlinear models.\nThe paper establishesconditions under which such assessments can be estimated\nconsistentlyfrom experimental and nonexperimental data,and thus extends\npath-analytic techniques tononlinear and nonparametric models.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 10 Jan 2013 16:25:47 GMT" + } + ], + "update_date":"2013-01-14", + "authors_parsed":[ + [ + "Pearl", + "Judea", + "" + ] + ], + "categories_split":[ + "cs.AI", + "stat.ME" + ], + "citation_count":1998.0, + "inf_cite_count":284.0, + "publication_date":"2001-08-02" + }, + { + "id":"1801.07791", + "submitter":"Yangyan Li", + "authors":"Yangyan Li, Rui Bu, Mingchao Sun, Wei Wu, Xinhan Di, Baoquan Chen", + "title":"PointCNN: Convolution On $\\mathcal{X}$-Transformed Points", + "comments":"To be published in NIPS 2018, code available at\n https:\/\/github.com\/yangyanli\/PointCNN", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.GR", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We present a simple and general framework for feature learning from point\nclouds. The key to the success of CNNs is the convolution operator that is\ncapable of leveraging spatially-local correlation in data represented densely\nin grids (e.g. images). However, point clouds are irregular and unordered, thus\ndirectly convolving kernels against features associated with the points, will\nresult in desertion of shape information and variance to point ordering. To\naddress these problems, we propose to learn an $\\mathcal{X}$-transformation\nfrom the input points, to simultaneously promote two causes. The first is the\nweighting of the input features associated with the points, and the second is\nthe permutation of the points into a latent and potentially canonical order.\nElement-wise product and sum operations of the typical convolution operator are\nsubsequently applied on the $\\mathcal{X}$-transformed features. 
The proposed\nmethod is a generalization of typical CNNs to feature learning from point\nclouds, thus we call it PointCNN. Experiments show that PointCNN achieves on\npar or better performance than state-of-the-art methods on multiple challenging\nbenchmark datasets and tasks.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 23 Jan 2018 22:07:21 GMT" + }, + { + "version":"v2", + "created":"Thu, 25 Jan 2018 11:45:08 GMT" + }, + { + "version":"v3", + "created":"Fri, 26 Jan 2018 02:11:12 GMT" + }, + { + "version":"v4", + "created":"Thu, 25 Oct 2018 01:33:31 GMT" + }, + { + "version":"v5", + "created":"Mon, 5 Nov 2018 09:31:45 GMT" + } + ], + "update_date":"2018-11-06", + "authors_parsed":[ + [ + "Li", + "Yangyan", + "" + ], + [ + "Bu", + "Rui", + "" + ], + [ + "Sun", + "Mingchao", + "" + ], + [ + "Wu", + "Wei", + "" + ], + [ + "Di", + "Xinhan", + "" + ], + [ + "Chen", + "Baoquan", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.GR" + ], + "citation_count":1952.0, + "inf_cite_count":277.0, + "publication_date":"2018-01-23" + }, + { + "id":"2010.13902", + "submitter":"Yuning You", + "authors":"Yuning You, Tianlong Chen, Yongduo Sui, Ting Chen, Zhangyang Wang,\n Yang Shen", + "title":"Graph Contrastive Learning with Augmentations", + "comments":"Supplementary materials are available at\n https:\/\/yyou1996.github.io\/files\/neurips2020_graphcl_supplement.pdf. NeurIPS\n 2020", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Generalizable, transferrable, and robust representation learning on\ngraph-structured data remains a challenge for current graph neural networks\n(GNNs). Unlike what has been developed for convolutional neural networks (CNNs)\nfor image data, self-supervised learning and pre-training are less explored for\nGNNs. In this paper, we propose a graph contrastive learning (GraphCL)\nframework for learning unsupervised representations of graph data. We first\ndesign four types of graph augmentations to incorporate various priors. We then\nsystematically study the impact of various combinations of graph augmentations\non multiple datasets, in four different settings: semi-supervised,\nunsupervised, and transfer learning as well as adversarial attacks. The results\nshow that, even without tuning augmentation extents nor using sophisticated GNN\narchitectures, our GraphCL framework can produce graph representations of\nsimilar or better generalizability, transferrability, and robustness compared\nto state-of-the-art methods. We also investigate the impact of parameterized\ngraph augmentation extents and patterns, and observe further performance gains\nin preliminary experiments. 
Our codes are available at\nhttps:\/\/github.com\/Shen-Lab\/GraphCL.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 22 Oct 2020 20:13:43 GMT" + }, + { + "version":"v2", + "created":"Wed, 11 Nov 2020 15:16:33 GMT" + }, + { + "version":"v3", + "created":"Sat, 3 Apr 2021 15:34:00 GMT" + } + ], + "update_date":"2021-04-06", + "authors_parsed":[ + [ + "You", + "Yuning", + "" + ], + [ + "Chen", + "Tianlong", + "" + ], + [ + "Sui", + "Yongduo", + "" + ], + [ + "Chen", + "Ting", + "" + ], + [ + "Wang", + "Zhangyang", + "" + ], + [ + "Shen", + "Yang", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI" + ], + "citation_count":1232.0, + "inf_cite_count":275.0, + "publication_date":"2020-10-22" + }, + { + "id":"1505.05770", + "submitter":"Danilo Jimenez Rezende", + "authors":"Danilo Jimenez Rezende and Shakir Mohamed", + "title":"Variational Inference with Normalizing Flows", + "comments":"Proceedings of the 32nd International Conference on Machine Learning", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"stat.ML cs.AI cs.LG stat.CO stat.ME", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" The choice of approximate posterior distribution is one of the core problems\nin variational inference. Most applications of variational inference employ\nsimple families of posterior approximations in order to allow for efficient\ninference, focusing on mean-field or other simple structured approximations.\nThis restriction has a significant impact on the quality of inferences made\nusing variational methods. We introduce a new approach for specifying flexible,\narbitrarily complex and scalable approximate posterior distributions. Our\napproximations are distributions constructed through a normalizing flow,\nwhereby a simple initial density is transformed into a more complex one by\napplying a sequence of invertible transformations until a desired level of\ncomplexity is attained. We use this view of normalizing flows to develop\ncategories of finite and infinitesimal flows and provide a unified view of\napproaches for constructing rich posterior approximations. 
We demonstrate that\nthe theoretical advantages of having posteriors that better match the true\nposterior, combined with the scalability of amortized variational approaches,\nprovides a clear improvement in performance and applicability of variational\ninference.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 21 May 2015 15:36:37 GMT" + }, + { + "version":"v2", + "created":"Fri, 22 May 2015 09:13:28 GMT" + }, + { + "version":"v3", + "created":"Tue, 26 May 2015 15:46:33 GMT" + }, + { + "version":"v4", + "created":"Mon, 22 Jun 2015 18:36:32 GMT" + }, + { + "version":"v5", + "created":"Mon, 13 Jun 2016 08:46:44 GMT" + }, + { + "version":"v6", + "created":"Tue, 14 Jun 2016 09:01:36 GMT" + } + ], + "update_date":"2016-06-15", + "authors_parsed":[ + [ + "Rezende", + "Danilo Jimenez", + "" + ], + [ + "Mohamed", + "Shakir", + "" + ] + ], + "categories_split":[ + "stat.ML", + "cs.AI", + "cs.LG", + "stat.CO", + "stat.ME" + ], + "citation_count":3335.0, + "inf_cite_count":274.0, + "publication_date":"2015-05-21" + }, + { + "id":"2009.03300", + "submitter":"Dan Hendrycks", + "authors":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika,\n Dawn Song, Jacob Steinhardt", + "title":"Measuring Massive Multitask Language Understanding", + "comments":"ICLR 2021; the test and code is available at\n https:\/\/github.com\/hendrycks\/test", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CY cs.AI cs.CL cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" We propose a new test to measure a text model's multitask accuracy. The test\ncovers 57 tasks including elementary mathematics, US history, computer science,\nlaw, and more. To attain high accuracy on this test, models must possess\nextensive world knowledge and problem solving ability. We find that while most\nrecent models have near random-chance accuracy, the very largest GPT-3 model\nimproves over random chance by almost 20 percentage points on average. However,\non every one of the 57 tasks, the best models still need substantial\nimprovements before they can reach expert-level accuracy. Models also have\nlopsided performance and frequently do not know when they are wrong. Worse,\nthey still have near-random accuracy on some socially important subjects such\nas morality and law. By comprehensively evaluating the breadth and depth of a\nmodel's academic and professional understanding, our test can be used to\nanalyze models across many tasks and to identify important shortcomings.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 7 Sep 2020 17:59:25 GMT" + }, + { + "version":"v2", + "created":"Mon, 21 Sep 2020 05:06:57 GMT" + }, + { + "version":"v3", + "created":"Tue, 12 Jan 2021 18:57:11 GMT" + } + ], + "update_date":"2021-01-13", + "authors_parsed":[ + [ + "Hendrycks", + "Dan", + "" + ], + [ + "Burns", + "Collin", + "" + ], + [ + "Basart", + "Steven", + "" + ], + [ + "Zou", + "Andy", + "" + ], + [ + "Mazeika", + "Mantas", + "" + ], + [ + "Song", + "Dawn", + "" + ], + [ + "Steinhardt", + "Jacob", + "" + ] + ], + "categories_split":[ + "cs.CY", + "cs.AI", + "cs.CL", + "cs.LG" + ], + "citation_count":1169.0, + "inf_cite_count":274.0, + "publication_date":"2020-09-07" + }, + { + "id":"1509.00685", + "submitter":"Alexander M. Rush", + "authors":"Alexander M. 
Rush, Sumit Chopra and Jason Weston", + "title":"A Neural Attention Model for Abstractive Sentence Summarization", + "comments":"Proceedings of EMNLP 2015", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Summarization based on text extraction is inherently limited, but\ngeneration-style abstractive methods have proven challenging to build. In this\nwork, we propose a fully data-driven approach to abstractive sentence\nsummarization. Our method utilizes a local attention-based model that generates\neach word of the summary conditioned on the input sentence. While the model is\nstructurally simple, it can easily be trained end-to-end and scales to a large\namount of training data. The model shows significant performance gains on the\nDUC-2004 shared task compared with several strong baselines.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 2 Sep 2015 13:20:40 GMT" + }, + { + "version":"v2", + "created":"Thu, 3 Sep 2015 19:55:45 GMT" + } + ], + "update_date":"2015-09-04", + "authors_parsed":[ + [ + "Rush", + "Alexander M.", + "" + ], + [ + "Chopra", + "Sumit", + "" + ], + [ + "Weston", + "Jason", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI" + ], + "citation_count":2544.0, + "inf_cite_count":274.0, + "publication_date":"2015-09-02" + }, + { + "id":"1809.05053", + "submitter":"Alexis Conneau", + "authors":"Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel\n R. Bowman, Holger Schwenk, Veselin Stoyanov", + "title":"XNLI: Evaluating Cross-lingual Sentence Representations", + "comments":"EMNLP 2018", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CL cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" State-of-the-art natural language processing systems rely on supervision in\nthe form of annotated data to learn competent models. These models are\ngenerally trained on data in a single language (usually English), and cannot be\ndirectly used beyond that language. Since collecting data in every language is\nnot realistic, there has been a growing interest in cross-lingual language\nunderstanding (XLU) and low-resource cross-language transfer. In this work, we\nconstruct an evaluation set for XLU by extending the development and test sets\nof the Multi-Genre Natural Language Inference Corpus (MultiNLI) to 15\nlanguages, including low-resource languages such as Swahili and Urdu. We hope\nthat our dataset, dubbed XNLI, will catalyze research in cross-lingual sentence\nunderstanding by providing an informative standard evaluation task. In\naddition, we provide several baselines for multilingual sentence understanding,\nincluding two based on machine translation systems, and two that use parallel\ndata to train aligned multilingual bag-of-words and LSTM encoders. 
We find that\nXNLI represents a practical and challenging evaluation suite, and that directly\ntranslating the test data yields the best performance among available\nbaselines.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 13 Sep 2018 16:39:53 GMT" + } + ], + "update_date":"2018-09-14", + "authors_parsed":[ + [ + "Conneau", + "Alexis", + "" + ], + [ + "Lample", + "Guillaume", + "" + ], + [ + "Rinott", + "Ruty", + "" + ], + [ + "Williams", + "Adina", + "" + ], + [ + "Bowman", + "Samuel R.", + "" + ], + [ + "Schwenk", + "Holger", + "" + ], + [ + "Stoyanov", + "Veselin", + "" + ] + ], + "categories_split":[ + "cs.CL", + "cs.AI", + "cs.LG" + ], + "citation_count":1050.0, + "inf_cite_count":274.0, + "publication_date":"2018-09-13" + }, + { + "id":"1812.02900", + "submitter":"Scott Fujimoto", + "authors":"Scott Fujimoto, David Meger, Doina Precup", + "title":"Off-Policy Deep Reinforcement Learning without Exploration", + "comments":"ICML 2019", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Many practical applications of reinforcement learning constrain agents to\nlearn from a fixed batch of data which has already been gathered, without\noffering further possibility for data collection. In this paper, we demonstrate\nthat due to errors introduced by extrapolation, standard off-policy deep\nreinforcement learning algorithms, such as DQN and DDPG, are incapable of\nlearning with data uncorrelated to the distribution under the current policy,\nmaking them ineffective for this fixed batch setting. We introduce a novel\nclass of off-policy algorithms, batch-constrained reinforcement learning, which\nrestricts the action space in order to force the agent towards behaving close\nto on-policy with respect to a subset of the given data. We present the first\ncontinuous control deep reinforcement learning algorithm which can learn\neffectively from arbitrary, fixed batch data, and empirically demonstrate the\nquality of its behavior in several tasks.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 7 Dec 2018 04:03:25 GMT" + }, + { + "version":"v2", + "created":"Tue, 29 Jan 2019 19:58:23 GMT" + }, + { + "version":"v3", + "created":"Sat, 10 Aug 2019 03:36:31 GMT" + } + ], + "update_date":"2019-08-13", + "authors_parsed":[ + [ + "Fujimoto", + "Scott", + "" + ], + [ + "Meger", + "David", + "" + ], + [ + "Precup", + "Doina", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "stat.ML" + ], + "citation_count":1141.0, + "inf_cite_count":273.0, + "publication_date":"2018-12-07" + }, + { + "id":"1811.12231", + "submitter":"Robert Geirhos", + "authors":"Robert Geirhos, Patricia Rubisch, Claudio Michaelis, Matthias Bethge,\n Felix A. Wichmann, Wieland Brendel", + "title":"ImageNet-trained CNNs are biased towards texture; increasing shape bias\n improves accuracy and robustness", + "comments":"Accepted at ICLR 2019 (oral)", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG q-bio.NC stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Convolutional Neural Networks (CNNs) are commonly thought to recognise\nobjects by learning increasingly complex representations of object shapes. Some\nrecent studies suggest a more important role of image textures. 
We here put\nthese conflicting hypotheses to a quantitative test by evaluating CNNs and\nhuman observers on images with a texture-shape cue conflict. We show that\nImageNet-trained CNNs are strongly biased towards recognising textures rather\nthan shapes, which is in stark contrast to human behavioural evidence and\nreveals fundamentally different classification strategies. We then demonstrate\nthat the same standard architecture (ResNet-50) that learns a texture-based\nrepresentation on ImageNet is able to learn a shape-based representation\ninstead when trained on \"Stylized-ImageNet\", a stylized version of ImageNet.\nThis provides a much better fit for human behavioural performance in our\nwell-controlled psychophysical lab setting (nine experiments totalling 48,560\npsychophysical trials across 97 observers) and comes with a number of\nunexpected emergent benefits such as improved object detection performance and\npreviously unseen robustness towards a wide range of image distortions,\nhighlighting advantages of a shape-based representation.\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 29 Nov 2018 15:04:05 GMT" + }, + { + "version":"v2", + "created":"Mon, 14 Jan 2019 13:59:09 GMT" + }, + { + "version":"v3", + "created":"Wed, 9 Nov 2022 23:15:15 GMT" + } + ], + "update_date":"2022-11-11", + "authors_parsed":[ + [ + "Geirhos", + "Robert", + "" + ], + [ + "Rubisch", + "Patricia", + "" + ], + [ + "Michaelis", + "Claudio", + "" + ], + [ + "Bethge", + "Matthias", + "" + ], + [ + "Wichmann", + "Felix A.", + "" + ], + [ + "Brendel", + "Wieland", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG", + "q-bio.NC", + "stat.ML" + ], + "citation_count":2213.0, + "inf_cite_count":273.0, + "publication_date":"2018-09-27" + }, + { + "id":"1904.12848", + "submitter":"Qizhe Xie", + "authors":"Qizhe Xie, Zihang Dai, Eduard Hovy, Minh-Thang Luong, Quoc V. Le", + "title":"Unsupervised Data Augmentation for Consistency Training", + "comments":"NeurIPS 2020", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI cs.CL cs.CV stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Semi-supervised learning lately has shown much promise in improving deep\nlearning models when labeled data is scarce. Common among recent approaches is\nthe use of consistency training on a large amount of unlabeled data to\nconstrain model predictions to be invariant to input noise. In this work, we\npresent a new perspective on how to effectively noise unlabeled examples and\nargue that the quality of noising, specifically those produced by advanced data\naugmentation methods, plays a crucial role in semi-supervised learning. By\nsubstituting simple noising operations with advanced data augmentation methods\nsuch as RandAugment and back-translation, our method brings substantial\nimprovements across six language and three vision tasks under the same\nconsistency training framework. On the IMDb text classification dataset, with\nonly 20 labeled examples, our method achieves an error rate of 4.20,\noutperforming the state-of-the-art model trained on 25,000 labeled examples. On\na standard semi-supervised learning benchmark, CIFAR-10, our method outperforms\nall previous approaches and achieves an error rate of 5.43 with only 250\nexamples. 
Our method also combines well with transfer learning, e.g., when\nfinetuning from BERT, and yields improvements in high-data regime, such as\nImageNet, whether when there is only 10% labeled data or when a full labeled\nset with 1.3M extra unlabeled examples is used. Code is available at\nhttps:\/\/github.com\/google-research\/uda.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 29 Apr 2019 17:56:59 GMT" + }, + { + "version":"v2", + "created":"Wed, 10 Jul 2019 17:53:48 GMT" + }, + { + "version":"v3", + "created":"Thu, 26 Sep 2019 15:32:11 GMT" + }, + { + "version":"v4", + "created":"Mon, 30 Sep 2019 15:40:40 GMT" + }, + { + "version":"v5", + "created":"Thu, 25 Jun 2020 17:58:43 GMT" + }, + { + "version":"v6", + "created":"Thu, 5 Nov 2020 15:11:02 GMT" + } + ], + "update_date":"2020-11-06", + "authors_parsed":[ + [ + "Xie", + "Qizhe", + "" + ], + [ + "Dai", + "Zihang", + "" + ], + [ + "Hovy", + "Eduard", + "" + ], + [ + "Luong", + "Minh-Thang", + "" + ], + [ + "Le", + "Quoc V.", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "cs.CL", + "cs.CV", + "stat.ML" + ], + "citation_count":1848.0, + "inf_cite_count":268.0, + "publication_date":"2019-04-29" + }, + { + "id":"2112.01527", + "submitter":"Bowen Cheng", + "authors":"Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander\n Kirillov and Rohit Girdhar", + "title":"Masked-attention Mask Transformer for Universal Image Segmentation", + "comments":"CVPR 2022. Project page\/code\/models:\n https:\/\/bowenc0221.github.io\/mask2former", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI cs.LG", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Image segmentation is about grouping pixels with different semantics, e.g.,\ncategory or instance membership, where each choice of semantics defines a task.\nWhile only the semantics of each task differ, current research focuses on\ndesigning specialized architectures for each task. We present Masked-attention\nMask Transformer (Mask2Former), a new architecture capable of addressing any\nimage segmentation task (panoptic, instance or semantic). Its key components\ninclude masked attention, which extracts localized features by constraining\ncross-attention within predicted mask regions. In addition to reducing the\nresearch effort by at least three times, it outperforms the best specialized\narchitectures by a significant margin on four popular datasets. 
Most notably,\nMask2Former sets a new state-of-the-art for panoptic segmentation (57.8 PQ on\nCOCO), instance segmentation (50.1 AP on COCO) and semantic segmentation (57.7\nmIoU on ADE20K).\n", + "versions":[ + { + "version":"v1", + "created":"Thu, 2 Dec 2021 18:59:58 GMT" + }, + { + "version":"v2", + "created":"Fri, 10 Dec 2021 18:52:09 GMT" + }, + { + "version":"v3", + "created":"Wed, 15 Jun 2022 20:58:09 GMT" + } + ], + "update_date":"2022-06-17", + "authors_parsed":[ + [ + "Cheng", + "Bowen", + "" + ], + [ + "Misra", + "Ishan", + "" + ], + [ + "Schwing", + "Alexander G.", + "" + ], + [ + "Kirillov", + "Alexander", + "" + ], + [ + "Girdhar", + "Rohit", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "cs.LG" + ], + "citation_count":1071.0, + "inf_cite_count":266.0, + "publication_date":"2021-12-02" + }, + { + "id":"1612.03242", + "submitter":"Han Zhang", + "authors":"Han Zhang, Tao Xu, Hongsheng Li, Shaoting Zhang, Xiaogang Wang,\n Xiaolei Huang, Dimitris Metaxas", + "title":"StackGAN: Text to Photo-realistic Image Synthesis with Stacked\n Generative Adversarial Networks", + "comments":"ICCV 2017 Oral Presentation", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.CV cs.AI stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Synthesizing high-quality images from text descriptions is a challenging\nproblem in computer vision and has many practical applications. Samples\ngenerated by existing text-to-image approaches can roughly reflect the meaning\nof the given descriptions, but they fail to contain necessary details and vivid\nobject parts. In this paper, we propose Stacked Generative Adversarial Networks\n(StackGAN) to generate 256x256 photo-realistic images conditioned on text\ndescriptions. We decompose the hard problem into more manageable sub-problems\nthrough a sketch-refinement process. The Stage-I GAN sketches the primitive\nshape and colors of the object based on the given text description, yielding\nStage-I low-resolution images. The Stage-II GAN takes Stage-I results and text\ndescriptions as inputs, and generates high-resolution images with\nphoto-realistic details. It is able to rectify defects in Stage-I results and\nadd compelling details with the refinement process. To improve the diversity of\nthe synthesized images and stabilize the training of the conditional-GAN, we\nintroduce a novel Conditioning Augmentation technique that encourages\nsmoothness in the latent conditioning manifold. 
Extensive experiments and\ncomparisons with state-of-the-arts on benchmark datasets demonstrate that the\nproposed method achieves significant improvements on generating photo-realistic\nimages conditioned on text descriptions.\n", + "versions":[ + { + "version":"v1", + "created":"Sat, 10 Dec 2016 03:11:37 GMT" + }, + { + "version":"v2", + "created":"Sat, 5 Aug 2017 02:18:21 GMT" + } + ], + "update_date":"2017-08-08", + "authors_parsed":[ + [ + "Zhang", + "Han", + "" + ], + [ + "Xu", + "Tao", + "" + ], + [ + "Li", + "Hongsheng", + "" + ], + [ + "Zhang", + "Shaoting", + "" + ], + [ + "Wang", + "Xiaogang", + "" + ], + [ + "Huang", + "Xiaolei", + "" + ], + [ + "Metaxas", + "Dimitris", + "" + ] + ], + "categories_split":[ + "cs.CV", + "cs.AI", + "stat.ML" + ], + "citation_count":2471.0, + "inf_cite_count":263.0, + "publication_date":"2016-12-10" + }, + { + "id":"1609.05473", + "submitter":"Lantao Yu", + "authors":"Lantao Yu, Weinan Zhang, Jun Wang, Yong Yu", + "title":"SeqGAN: Sequence Generative Adversarial Nets with Policy Gradient", + "comments":"The Thirty-First AAAI Conference on Artificial Intelligence (AAAI\n 2017)", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI", + "license":"http:\/\/creativecommons.org\/licenses\/by-sa\/4.0\/", + "abstract":" As a new way of training generative models, Generative Adversarial Nets (GAN)\nthat uses a discriminative model to guide the training of the generative model\nhas enjoyed considerable success in generating real-valued data. However, it\nhas limitations when the goal is for generating sequences of discrete tokens. A\nmajor reason lies in that the discrete outputs from the generative model make\nit difficult to pass the gradient update from the discriminative model to the\ngenerative model. Also, the discriminative model can only assess a complete\nsequence, while for a partially generated sequence, it is non-trivial to\nbalance its current score and the future one once the entire sequence has been\ngenerated. In this paper, we propose a sequence generation framework, called\nSeqGAN, to solve the problems. Modeling the data generator as a stochastic\npolicy in reinforcement learning (RL), SeqGAN bypasses the generator\ndifferentiation problem by directly performing gradient policy update. The RL\nreward signal comes from the GAN discriminator judged on a complete sequence,\nand is passed back to the intermediate state-action steps using Monte Carlo\nsearch. 
Extensive experiments on synthetic data and real-world tasks\ndemonstrate significant improvements over strong baselines.\n", + "versions":[ + { + "version":"v1", + "created":"Sun, 18 Sep 2016 11:42:23 GMT" + }, + { + "version":"v2", + "created":"Tue, 20 Sep 2016 09:44:18 GMT" + }, + { + "version":"v3", + "created":"Sun, 25 Sep 2016 13:06:24 GMT" + }, + { + "version":"v4", + "created":"Mon, 24 Oct 2016 13:19:26 GMT" + }, + { + "version":"v5", + "created":"Fri, 9 Dec 2016 14:37:13 GMT" + }, + { + "version":"v6", + "created":"Fri, 25 Aug 2017 16:22:57 GMT" + } + ], + "update_date":"2017-08-28", + "authors_parsed":[ + [ + "Yu", + "Lantao", + "" + ], + [ + "Zhang", + "Weinan", + "" + ], + [ + "Wang", + "Jun", + "" + ], + [ + "Yu", + "Yong", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI" + ], + "citation_count":2140.0, + "inf_cite_count":263.0, + "publication_date":"2016-09-18" + }, + { + "id":"1711.00399", + "submitter":"Brent Mittelstadt", + "authors":"Sandra Wachter, Brent Mittelstadt, Chris Russell", + "title":"Counterfactual Explanations without Opening the Black Box: Automated\n Decisions and the GDPR", + "comments":null, + "journal-ref":"Harvard Journal of Law & Technology, 2018", + "doi":null, + "report-no":null, + "categories":"cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" There has been much discussion of the right to explanation in the EU General\nData Protection Regulation, and its existence, merits, and disadvantages.\nImplementing a right to explanation that opens the black box of algorithmic\ndecision-making faces major legal and technical barriers. Explaining the\nfunctionality of complex algorithmic decision-making systems and their\nrationale in specific cases is a technically challenging problem. Some\nexplanations may offer little meaningful information to data subjects, raising\nquestions around their value. Explanations of automated decisions need not\nhinge on the general public understanding how algorithmic systems function.\nEven though such interpretability is of great importance and should be pursued,\nexplanations can, in principle, be offered without opening the black box.\nLooking at explanations as a means to help a data subject act rather than\nmerely understand, one could gauge the scope and content of explanations\naccording to the specific goal or action they are intended to support. From the\nperspective of individuals affected by automated decision-making, we propose\nthree aims for explanations: (1) to inform and help the individual understand\nwhy a particular decision was reached, (2) to provide grounds to contest the\ndecision if the outcome is undesired, and (3) to understand what would need to\nchange in order to receive a desired result in the future, based on the current\ndecision-making model. We assess how each of these goals finds support in the\nGDPR. We suggest data controllers should offer a particular type of\nexplanation, unconditional counterfactual explanations, to support these three\naims. 
These counterfactual explanations describe the smallest change to the\nworld that can be made to obtain a desirable outcome, or to arrive at the\nclosest possible world, without needing to explain the internal logic of the\nsystem.\n", + "versions":[ + { + "version":"v1", + "created":"Wed, 1 Nov 2017 15:39:23 GMT" + }, + { + "version":"v2", + "created":"Mon, 25 Dec 2017 12:26:47 GMT" + }, + { + "version":"v3", + "created":"Wed, 21 Mar 2018 11:43:46 GMT" + } + ], + "update_date":"2018-03-22", + "authors_parsed":[ + [ + "Wachter", + "Sandra", + "" + ], + [ + "Mittelstadt", + "Brent", + "" + ], + [ + "Russell", + "Chris", + "" + ] + ], + "categories_split":[ + "cs.AI" + ], + "citation_count":1785.0, + "inf_cite_count":259.0, + "publication_date":"2017-10-06" + }, + { + "id":"1910.10045", + "submitter":"Javier Del Ser Dr.", + "authors":"Alejandro Barredo Arrieta, Natalia D\\'iaz-Rodr\\'iguez, Javier Del Ser,\n Adrien Bennetot, Siham Tabik, Alberto Barbado, Salvador Garc\\'ia, Sergio\n Gil-L\\'opez, Daniel Molina, Richard Benjamins, Raja Chatila, Francisco\n Herrera", + "title":"Explainable Artificial Intelligence (XAI): Concepts, Taxonomies,\n Opportunities and Challenges toward Responsible AI", + "comments":"67 pages, 13 figures, accepted for its publication in Information\n Fusion", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.AI cs.LG cs.NE", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" In the last years, Artificial Intelligence (AI) has achieved a notable\nmomentum that may deliver the best of expectations over many application\nsectors across the field. For this to occur, the entire community stands in\nfront of the barrier of explainability, an inherent problem of AI techniques\nbrought by sub-symbolism (e.g. ensembles or Deep Neural Networks) that were not\npresent in the last hype of AI. Paradigms underlying this problem fall within\nthe so-called eXplainable AI (XAI) field, which is acknowledged as a crucial\nfeature for the practical deployment of AI models. This overview examines the\nexisting literature in the field of XAI, including a prospect toward what is\nyet to be reached. We summarize previous efforts to define explainability in\nMachine Learning, establishing a novel definition that covers prior conceptual\npropositions with a major focus on the audience for which explainability is\nsought. We then propose and discuss about a taxonomy of recent contributions\nrelated to the explainability of different Machine Learning models, including\nthose aimed at Deep Learning methods for which a second taxonomy is built. This\nliterature analysis serves as the background for a series of challenges faced\nby XAI, such as the crossroads between data fusion and explainability. Our\nprospects lead toward the concept of Responsible Artificial Intelligence,\nnamely, a methodology for the large-scale implementation of AI methods in real\norganizations with fairness, model explainability and accountability at its\ncore. 
Our ultimate goal is to provide newcomers to XAI with a reference\nmaterial in order to stimulate future research advances, but also to encourage\nexperts and professionals from other disciplines to embrace the benefits of AI\nin their activity sectors, without any prior bias for its lack of\ninterpretability.\n", + "versions":[ + { + "version":"v1", + "created":"Tue, 22 Oct 2019 15:27:30 GMT" + }, + { + "version":"v2", + "created":"Thu, 26 Dec 2019 08:09:25 GMT" + } + ], + "update_date":"2019-12-30", + "authors_parsed":[ + [ + "Arrieta", + "Alejandro Barredo", + "" + ], + [ + "D\u00edaz-Rodr\u00edguez", + "Natalia", + "" + ], + [ + "Del Ser", + "Javier", + "" + ], + [ + "Bennetot", + "Adrien", + "" + ], + [ + "Tabik", + "Siham", + "" + ], + [ + "Barbado", + "Alberto", + "" + ], + [ + "Garc\u00eda", + "Salvador", + "" + ], + [ + "Gil-L\u00f3pez", + "Sergio", + "" + ], + [ + "Molina", + "Daniel", + "" + ], + [ + "Benjamins", + "Richard", + "" + ], + [ + "Chatila", + "Raja", + "" + ], + [ + "Herrera", + "Francisco", + "" + ] + ], + "categories_split":[ + "cs.AI", + "cs.LG", + "cs.NE" + ], + "citation_count":4104.0, + "inf_cite_count":259.0, + "publication_date":"2019-10-22" + }, + { + "id":"2002.00269", + "submitter":"David Heckerman", + "authors":"David Heckerman", + "title":"A Tutorial on Learning With Bayesian Networks", + "comments":"Added a note on averaging causal models", + "journal-ref":"Original version published in Learning in Graphical Models, M.\n Jordan, ed., MIT Press, Cambridge, MA, 1999", + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" A Bayesian network is a graphical model that encodes probabilistic\nrelationships among variables of interest. When used in conjunction with\nstatistical techniques, the graphical model has several advantages for data\nanalysis. One, because the model encodes dependencies among all variables, it\nreadily handles situations where some data entries are missing. Two, a Bayesian\nnetwork can be used to learn causal relationships, and hence can be used to\ngain understanding about a problem domain and to predict the consequences of\nintervention. Three, because the model has both a causal and probabilistic\nsemantics, it is an ideal representation for combining prior knowledge (which\noften comes in causal form) and data. Four, Bayesian statistical methods in\nconjunction with Bayesian networks offer an efficient and principled approach\nfor avoiding the overfitting of data. In this paper, we discuss methods for\nconstructing Bayesian networks from prior knowledge and summarize Bayesian\nstatistical methods for using data to improve these models. With regard to the\nlatter task, we describe methods for learning both the parameters and structure\nof a Bayesian network, including techniques for learning with incomplete data.\nIn addition, we relate Bayesian-network methods for learning to techniques for\nsupervised and unsupervised learning. 
We illustrate the graphical-modeling\napproach using a real-world case study.\n", + "versions":[ + { + "version":"v1", + "created":"Sat, 1 Feb 2020 20:03:21 GMT" + }, + { + "version":"v2", + "created":"Mon, 8 Mar 2021 22:18:01 GMT" + }, + { + "version":"v3", + "created":"Mon, 10 Jan 2022 14:26:03 GMT" + } + ], + "update_date":"2022-01-11", + "authors_parsed":[ + [ + "Heckerman", + "David", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "stat.ML" + ], + "citation_count":3440.0, + "inf_cite_count":256.0, + "publication_date":"1999-02-01" + }, + { + "id":"1806.01261", + "submitter":"Peter Battaglia", + "authors":"Peter W. Battaglia, Jessica B. Hamrick, Victor Bapst, Alvaro\n Sanchez-Gonzalez, Vinicius Zambaldi, Mateusz Malinowski, Andrea Tacchetti,\n David Raposo, Adam Santoro, Ryan Faulkner, Caglar Gulcehre, Francis Song,\n Andrew Ballard, Justin Gilmer, George Dahl, Ashish Vaswani, Kelsey Allen,\n Charles Nash, Victoria Langston, Chris Dyer, Nicolas Heess, Daan Wierstra,\n Pushmeet Kohli, Matt Botvinick, Oriol Vinyals, Yujia Li, Razvan Pascanu", + "title":"Relational inductive biases, deep learning, and graph networks", + "comments":null, + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.LG cs.AI stat.ML", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" Artificial intelligence (AI) has undergone a renaissance recently, making\nmajor progress in key domains such as vision, language, control, and\ndecision-making. This has been due, in part, to cheap data and cheap compute\nresources, which have fit the natural strengths of deep learning. However, many\ndefining characteristics of human intelligence, which developed under much\ndifferent pressures, remain out of reach for current approaches. In particular,\ngeneralizing beyond one's experiences--a hallmark of human intelligence from\ninfancy--remains a formidable challenge for modern AI.\n The following is part position paper, part review, and part unification. We\nargue that combinatorial generalization must be a top priority for AI to\nachieve human-like abilities, and that structured representations and\ncomputations are key to realizing this objective. Just as biology uses nature\nand nurture cooperatively, we reject the false choice between\n\"hand-engineering\" and \"end-to-end\" learning, and instead advocate for an\napproach which benefits from their complementary strengths. We explore how\nusing relational inductive biases within deep learning architectures can\nfacilitate learning about entities, relations, and rules for composing them. We\npresent a new building block for the AI toolkit with a strong relational\ninductive bias--the graph network--which generalizes and extends various\napproaches for neural networks that operate on graphs, and provides a\nstraightforward interface for manipulating structured knowledge and producing\nstructured behaviors. We discuss how graph networks can support relational\nreasoning and combinatorial generalization, laying the foundation for more\nsophisticated, interpretable, and flexible patterns of reasoning. 
As a\ncompanion to this paper, we have released an open-source software library for\nbuilding graph networks, with demonstrations of how to use them in practice.\n", + "versions":[ + { + "version":"v1", + "created":"Mon, 4 Jun 2018 17:58:18 GMT" + }, + { + "version":"v2", + "created":"Mon, 11 Jun 2018 13:33:54 GMT" + }, + { + "version":"v3", + "created":"Wed, 17 Oct 2018 17:51:36 GMT" + } + ], + "update_date":"2018-10-18", + "authors_parsed":[ + [ + "Battaglia", + "Peter W.", + "" + ], + [ + "Hamrick", + "Jessica B.", + "" + ], + [ + "Bapst", + "Victor", + "" + ], + [ + "Sanchez-Gonzalez", + "Alvaro", + "" + ], + [ + "Zambaldi", + "Vinicius", + "" + ], + [ + "Malinowski", + "Mateusz", + "" + ], + [ + "Tacchetti", + "Andrea", + "" + ], + [ + "Raposo", + "David", + "" + ], + [ + "Santoro", + "Adam", + "" + ], + [ + "Faulkner", + "Ryan", + "" + ], + [ + "Gulcehre", + "Caglar", + "" + ], + [ + "Song", + "Francis", + "" + ], + [ + "Ballard", + "Andrew", + "" + ], + [ + "Gilmer", + "Justin", + "" + ], + [ + "Dahl", + "George", + "" + ], + [ + "Vaswani", + "Ashish", + "" + ], + [ + "Allen", + "Kelsey", + "" + ], + [ + "Nash", + "Charles", + "" + ], + [ + "Langston", + "Victoria", + "" + ], + [ + "Dyer", + "Chris", + "" + ], + [ + "Heess", + "Nicolas", + "" + ], + [ + "Wierstra", + "Daan", + "" + ], + [ + "Kohli", + "Pushmeet", + "" + ], + [ + "Botvinick", + "Matt", + "" + ], + [ + "Vinyals", + "Oriol", + "" + ], + [ + "Li", + "Yujia", + "" + ], + [ + "Pascanu", + "Razvan", + "" + ] + ], + "categories_split":[ + "cs.LG", + "cs.AI", + "stat.ML" + ], + "citation_count":2643.0, + "inf_cite_count":254.0, + "publication_date":"2018-06-04" + }, + { + "id":"1605.08695", + "submitter":"Derek Murray", + "authors":"Mart\\'in Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis,\n Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael\n Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G.\n Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin\n Wicke, Yuan Yu and Xiaoqiang Zheng", + "title":"TensorFlow: A system for large-scale machine learning", + "comments":"18 pages, 9 figures; v2 has a spelling correction in the metadata", + "journal-ref":null, + "doi":null, + "report-no":null, + "categories":"cs.DC cs.AI", + "license":"http:\/\/arxiv.org\/licenses\/nonexclusive-distrib\/1.0\/", + "abstract":" TensorFlow is a machine learning system that operates at large scale and in\nheterogeneous environments. TensorFlow uses dataflow graphs to represent\ncomputation, shared state, and the operations that mutate that state. It maps\nthe nodes of a dataflow graph across many machines in a cluster, and within a\nmachine across multiple computational devices, including multicore CPUs,\ngeneral-purpose GPUs, and custom designed ASICs known as Tensor Processing\nUnits (TPUs). This architecture gives flexibility to the application developer:\nwhereas in previous \"parameter server\" designs the management of shared state\nis built into the system, TensorFlow enables developers to experiment with\nnovel optimizations and training algorithms. TensorFlow supports a variety of\napplications, with particularly strong support for training and inference on\ndeep neural networks. Several Google services use TensorFlow in production, we\nhave released it as an open-source project, and it has become widely used for\nmachine learning research. 
In this paper, we describe the TensorFlow dataflow\nmodel in contrast to existing systems, and demonstrate the compelling\nperformance that TensorFlow achieves for several real-world applications.\n", + "versions":[ + { + "version":"v1", + "created":"Fri, 27 May 2016 15:49:50 GMT" + }, + { + "version":"v2", + "created":"Tue, 31 May 2016 19:46:10 GMT" + } + ], + "update_date":"2016-06-01", + "authors_parsed":[ + [ + "Abadi", + "Mart\u00edn", + "" + ], + [ + "Barham", + "Paul", + "" + ], + [ + "Chen", + "Jianmin", + "" + ], + [ + "Chen", + "Zhifeng", + "" + ], + [ + "Davis", + "Andy", + "" + ], + [ + "Dean", + "Jeffrey", + "" + ], + [ + "Devin", + "Matthieu", + "" + ], + [ + "Ghemawat", + "Sanjay", + "" + ], + [ + "Irving", + "Geoffrey", + "" + ], + [ + "Isard", + "Michael", + "" + ], + [ + "Kudlur", + "Manjunath", + "" + ], + [ + "Levenberg", + "Josh", + "" + ], + [ + "Monga", + "Rajat", + "" + ], + [ + "Moore", + "Sherry", + "" + ], + [ + "Murray", + "Derek G.", + "" + ], + [ + "Steiner", + "Benoit", + "" + ], + [ + "Tucker", + "Paul", + "" + ], + [ + "Vasudevan", + "Vijay", + "" + ], + [ + "Warden", + "Pete", + "" + ], + [ + "Wicke", + "Martin", + "" + ], + [ + "Yu", + "Yuan", + "" + ], + [ + "Zheng", + "Xiaoqiang", + "" + ] + ], + "categories_split":[ + "cs.DC", + "cs.AI" + ], + "citation_count":null, + "inf_cite_count":null, + "publication_date":null + } +] \ No newline at end of file diff --git a/backend/backend/utils/scripts/seed_db.py b/backend/backend/utils/scripts/seed_db.py new file mode 100644 index 00000000..8a95d34f --- /dev/null +++ b/backend/backend/utils/scripts/seed_db.py @@ -0,0 +1,30 @@ +from utils.api.workspace import create_workspace +from utils.api.paper import upload_paper_with_metadata +from models.dto_models import Paper +import json +import os +import uuid + +def seed_db(): + create_workspace(user_id="user1", workspace_name="workspace1", workspace_description="my first workspace") + + # upload_paper_with_metadata(Paper(title="My First Paper", id="123", abstract="This is the abstract of my first paper", arxiv_id="2106.15928", cite_count=10, inf_cite_count=5, publication_date="2021-06-28", pdf_blob="https://arthasai.s3.us-east-2.amazonaws.com/arxiv_pdfs/2106.15928.pdf", raw_markdown="# test")) + + print(os.getcwd()) + papers = json.load(open(os.path.join(os.getcwd(), "utils/scripts/paper_metadata.json"), "r")) + + for paper in papers: + paper_uuid = str(uuid.uuid4()) + parsed_paper: Paper = { + "cite_count": paper["citation_count"], + "inf_cite_count": paper["inf_cite_count"], + "publication_date": paper["publication_date"] if "publication_date" in paper and paper["publication_date"] is not None else "2021-06-28", + "pdf_blob": f"https://arxiv.org/pdf/{paper['id']}.pdf", + "raw_markdown": "# test", + "title": paper["title"], + "abstract": paper["abstract"], + "arxiv_id": paper["id"], + "id": paper_uuid + } + + upload_paper_with_metadata(Paper(**parsed_paper)) diff --git a/backend/poetry.lock b/backend/poetry.lock index 47aae08b..1a6ac408 100644 --- a/backend/poetry.lock +++ b/backend/poetry.lock @@ -1,4 +1,8 @@ +<<<<<<< HEAD +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +======= # This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
+>>>>>>> development [[package]] name = "accelerate" @@ -173,6 +177,8 @@ test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>= trio = ["trio (<0.22)"] [[package]] +<<<<<<< HEAD +======= name = "appnope" version = "0.1.4" description = "Disable App Nap on macOS >= 10.9" @@ -202,6 +208,7 @@ astroid = ["astroid (>=1,<2)", "astroid (>=2,<4)"] test = ["astroid (>=1,<2)", "astroid (>=2,<4)", "pytest"] [[package]] +>>>>>>> development name = "async-timeout" version = "4.0.3" description = "Timeout context manager for asyncio programs" @@ -233,17 +240,30 @@ tests-no-zope = ["attrs[tests-mypy]", "cloudpickle", "hypothesis", "pympler", "p [[package]] name = "boto3" +<<<<<<< HEAD +version = "1.34.81" +======= version = "1.34.82" +>>>>>>> development description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" files = [ +<<<<<<< HEAD + {file = "boto3-1.34.81-py3-none-any.whl", hash = "sha256:18224d206a8a775bcaa562d22ed3d07854934699190e12b52fcde87aac76a80e"}, + {file = "boto3-1.34.81.tar.gz", hash = "sha256:004dad209d37b3d2df88f41da13b7ad702a751904a335fac095897ff7a19f82b"}, +] + +[package.dependencies] +botocore = ">=1.34.81,<1.35.0" +======= {file = "boto3-1.34.82-py3-none-any.whl", hash = "sha256:6e0ee12e87b37fa81133e9308d0957fce4200c1ff37c96346538dba5e857da18"}, {file = "boto3-1.34.82.tar.gz", hash = "sha256:fcdb84936b04d5f78c8c8667b65bf5b9803cf39fd25bb7fe57ba237074e36171"}, ] [package.dependencies] botocore = ">=1.34.82,<1.35.0" +>>>>>>> development jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -252,13 +272,22 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" +<<<<<<< HEAD +version = "1.34.81" +======= version = "1.34.82" +>>>>>>> development description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">=3.8" files = [ +<<<<<<< HEAD + {file = "botocore-1.34.81-py3-none-any.whl", hash = "sha256:85f6fd7c5715eeef7a236c50947de00f57d72e7439daed1125491014b70fab01"}, + {file = "botocore-1.34.81.tar.gz", hash = "sha256:f79bf122566cc1f09d71cc9ac9fcf52d47ba48b761cbc3f064017b36a3c40eb8"}, +======= {file = "botocore-1.34.82-py3-none-any.whl", hash = "sha256:8f839e9a88e7ac7185e406be4cf9926673374e8a6ecc295302f56f7e3c618692"}, {file = "botocore-1.34.82.tar.gz", hash = "sha256:2fd14676152f9d64541099090cc64973fdf8232744256454de443583e35e497d"}, +>>>>>>> development ] [package.dependencies] @@ -284,6 +313,8 @@ files = [ ] [[package]] +<<<<<<< HEAD +======= name = "cffi" version = "1.16.0" description = "Foreign Function Interface for Python calling C code." @@ -348,6 +379,7 @@ files = [ pycparser = "*" [[package]] +>>>>>>> development name = "charset-normalizer" version = "3.3.2" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." @@ -472,6 +504,8 @@ files = [ ] [[package]] +<<<<<<< HEAD +======= name = "comm" version = "0.2.2" description = "Jupyter Python Comm implementation, for usage in ipykernel, xeus-python etc." 
@@ -531,6 +565,7 @@ files = [ ] [[package]] +>>>>>>> development name = "distro" version = "1.9.0" description = "Distro - an OS platform information API" @@ -581,6 +616,8 @@ files = [ test = ["pytest (>=6)"] [[package]] +<<<<<<< HEAD +======= name = "executing" version = "2.0.1" description = "Get the currently executing AST node of a frame, and other information" @@ -595,6 +632,7 @@ files = [ tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] [[package]] +>>>>>>> development name = "faiss-cpu" version = "1.8.0" description = "A library for efficient similarity search and clustering of dense vectors." @@ -890,6 +928,8 @@ files = [ ] [[package]] +<<<<<<< HEAD +======= name = "importlib-metadata" version = "7.1.0" description = "Read metadata from Python packages" @@ -998,6 +1038,7 @@ qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] testing = ["Django", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] [[package]] +>>>>>>> development name = "jinja2" version = "3.1.3" description = "A very fast and expressive template engine." @@ -1032,6 +1073,13 @@ description = "Lightweight pipelining with Python functions" optional = false python-versions = ">=3.8" files = [ +<<<<<<< HEAD + {file = "joblib-1.4.0-py3-none-any.whl", hash = "sha256:42942470d4062537be4d54c83511186da1fc14ba354961a2114da91efa9a4ed7"}, + {file = "joblib-1.4.0.tar.gz", hash = "sha256:1eb0dc091919cd384490de890cb5dfd538410a6d4b3b54eef09fb8c50b409b1c"}, +] + +[[package]] +======= {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, ] @@ -1080,6 +1128,7 @@ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx-autodoc-typehints", "sphin test = ["ipykernel", "pre-commit", "pytest (<8)", "pytest-cov", "pytest-timeout"] [[package]] +>>>>>>> development name = "llvmlite" version = "0.42.0" description = "lightweight wrapper around basic LLVM functionality" @@ -1203,6 +1252,8 @@ files = [ ] [[package]] +<<<<<<< HEAD +======= name = "matplotlib-inline" version = "0.1.6" description = "Inline Matplotlib backend for Jupyter" @@ -1217,6 +1268,7 @@ files = [ traitlets = "*" [[package]] +>>>>>>> development name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" @@ -1644,6 +1696,8 @@ files = [ ] [[package]] +<<<<<<< HEAD +======= name = "pandas" version = "2.2.1" description = "Powerful data structures for data analysis, time series, and statistics" @@ -1746,6 +1800,7 @@ files = [ ptyprocess = ">=0.5" [[package]] +>>>>>>> development name = "pillow" version = "10.3.0" description = "Python Imaging Library (Fork)" @@ -1832,6 +1887,8 @@ typing = ["typing-extensions"] xmp = ["defusedxml"] [[package]] +<<<<<<< HEAD +======= name = "platformdirs" version = "4.2.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." @@ -1861,6 +1918,7 @@ files = [ wcwidth = "*" [[package]] +>>>>>>> development name = "psutil" version = "5.9.8" description = "Cross-platform lib for process and system monitoring in Python." 
@@ -1887,6 +1945,8 @@ files = [ [package.extras] test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +<<<<<<< HEAD +======= [[package]] name = "ptyprocess" @@ -1923,6 +1983,7 @@ files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] +>>>>>>> development [[package]] name = "pydantic" @@ -2126,6 +2187,8 @@ files = [ ] [[package]] +<<<<<<< HEAD +======= name = "pywin32" version = "306" description = "Python for Window Extensions" @@ -2149,6 +2212,7 @@ files = [ ] [[package]] +>>>>>>> development name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" @@ -2209,6 +2273,8 @@ files = [ ] [[package]] +<<<<<<< HEAD +======= name = "pyzmq" version = "25.1.2" description = "Python bindings for 0MQ" @@ -2314,6 +2380,7 @@ files = [ cffi = {version = "*", markers = "implementation_name == \"pypy\""} [[package]] +>>>>>>> development name = "regex" version = "2023.12.25" description = "Alternative regular expression module, to replace re." @@ -2742,6 +2809,8 @@ files = [ ] [[package]] +<<<<<<< HEAD +======= name = "stack-data" version = "0.6.3" description = "Extract data from python stack frames and tracebacks for informative displays" @@ -2761,6 +2830,7 @@ pure-eval = "*" tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] [[package]] +>>>>>>> development name = "starlette" version = "0.37.2" description = "The little ASGI library that shines." @@ -2832,6 +2902,54 @@ files = [ ] [[package]] +<<<<<<< HEAD +name = "tiktoken" +version = "0.5.1" +description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tiktoken-0.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2b0bae3fd56de1c0a5874fb6577667a3c75bf231a6cef599338820210c16e40a"}, + {file = "tiktoken-0.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e529578d017045e2f0ed12d2e00e7e99f780f477234da4aae799ec4afca89f37"}, + {file = "tiktoken-0.5.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edd2ffbb789712d83fee19ab009949f998a35c51ad9f9beb39109357416344ff"}, + {file = "tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4c73d47bdc1a3f1f66ffa019af0386c48effdc6e8797e5e76875f6388ff72e9"}, + {file = "tiktoken-0.5.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:46b8554b9f351561b1989157c6bb54462056f3d44e43aa4e671367c5d62535fc"}, + {file = "tiktoken-0.5.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:92ed3bbf71a175a6a4e5fbfcdb2c422bdd72d9b20407e00f435cf22a68b4ea9b"}, + {file = "tiktoken-0.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:714efb2f4a082635d9f5afe0bf7e62989b72b65ac52f004eb7ac939f506c03a4"}, + {file = "tiktoken-0.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a10488d1d1a5f9c9d2b2052fdb4cf807bba545818cb1ef724a7f5d44d9f7c3d4"}, + {file = "tiktoken-0.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8079ac065572fe0e7c696dbd63e1fdc12ce4cdca9933935d038689d4732451df"}, + {file = "tiktoken-0.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ef730db4097f5b13df8d960f7fdda2744fe21d203ea2bb80c120bb58661b155"}, + {file = "tiktoken-0.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:426e7def5f3f23645dada816be119fa61e587dfb4755de250e136b47a045c365"}, + {file = 
"tiktoken-0.5.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:323cec0031358bc09aa965c2c5c1f9f59baf76e5b17e62dcc06d1bb9bc3a3c7c"}, + {file = "tiktoken-0.5.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5abd9436f02e2c8eda5cce2ff8015ce91f33e782a7423de2a1859f772928f714"}, + {file = "tiktoken-0.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:1fe99953b63aabc0c9536fbc91c3c9000d78e4755edc28cc2e10825372046a2d"}, + {file = "tiktoken-0.5.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:dcdc630461927718b317e6f8be7707bd0fc768cee1fdc78ddaa1e93f4dc6b2b1"}, + {file = "tiktoken-0.5.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1f2b3b253e22322b7f53a111e1f6d7ecfa199b4f08f3efdeb0480f4033b5cdc6"}, + {file = "tiktoken-0.5.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43ce0199f315776dec3ea7bf86f35df86d24b6fcde1babd3e53c38f17352442f"}, + {file = "tiktoken-0.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a84657c083d458593c0235926b5c993eec0b586a2508d6a2020556e5347c2f0d"}, + {file = "tiktoken-0.5.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c008375c0f3d97c36e81725308699116cd5804fdac0f9b7afc732056329d2790"}, + {file = "tiktoken-0.5.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:779c4dea5edd1d3178734d144d32231e0b814976bec1ec09636d1003ffe4725f"}, + {file = "tiktoken-0.5.1-cp38-cp38-win_amd64.whl", hash = "sha256:b5dcfcf9bfb798e86fbce76d40a1d5d9e3f92131aecfa3d1e5c9ea1a20f1ef1a"}, + {file = "tiktoken-0.5.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b180a22db0bbcc447f691ffc3cf7a580e9e0587d87379e35e58b826ebf5bc7b"}, + {file = "tiktoken-0.5.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2b756a65d98b7cf760617a6b68762a23ab8b6ef79922be5afdb00f5e8a9f4e76"}, + {file = "tiktoken-0.5.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba9873c253ca1f670e662192a0afcb72b41e0ba3e730f16c665099e12f4dac2d"}, + {file = "tiktoken-0.5.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74c90d2be0b4c1a2b3f7dde95cd976757817d4df080d6af0ee8d461568c2e2ad"}, + {file = "tiktoken-0.5.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:709a5220891f2b56caad8327fab86281787704931ed484d9548f65598dea9ce4"}, + {file = "tiktoken-0.5.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5d5a187ff9c786fae6aadd49f47f019ff19e99071dc5b0fe91bfecc94d37c686"}, + {file = "tiktoken-0.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:e21840043dbe2e280e99ad41951c00eff8ee3b63daf57cd4c1508a3fd8583ea2"}, + {file = "tiktoken-0.5.1.tar.gz", hash = "sha256:27e773564232004f4f810fd1f85236673ec3a56ed7f1206fc9ed8670ebedb97a"}, +] + +[package.dependencies] +regex = ">=2022.1.18" +requests = ">=2.26.0" + +[package.extras] +blobfile = ["blobfile (>=2)"] + +[[package]] +======= +>>>>>>> development name = "together" version = "1.0.1" description = "Python client for Together's Cloud Platform!" @@ -3040,6 +3158,8 @@ opt-einsum = ["opt-einsum (>=3.3)"] optree = ["optree (>=0.9.1)"] [[package]] +<<<<<<< HEAD +======= name = "tornado" version = "6.4" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." 
@@ -3060,6 +3180,7 @@ files = [ ] [[package]] +>>>>>>> development name = "tqdm" version = "4.66.2" description = "Fast, Extensible Progress Meter" @@ -3080,6 +3201,8 @@ slack = ["slack-sdk"] telegram = ["requests"] [[package]] +<<<<<<< HEAD +======= name = "traitlets" version = "5.14.2" description = "Traitlets Python configuration system" @@ -3095,6 +3218,7 @@ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.1)", "pytest-mock", "pytest-mypy-testing"] [[package]] +>>>>>>> development name = "transformers" version = "4.39.3" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" @@ -3214,6 +3338,8 @@ files = [ ] [[package]] +<<<<<<< HEAD +======= name = "tzdata" version = "2024.1" description = "Provider of IANA time zone data" @@ -3225,6 +3351,7 @@ files = [ ] [[package]] +>>>>>>> development name = "umap-learn" version = "0.5.5" description = "Uniform Manifold Approximation and Projection" @@ -3283,6 +3410,8 @@ typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] [[package]] +<<<<<<< HEAD +======= name = "wcwidth" version = "0.2.13" description = "Measures the displayed width of unicode strings in a terminal" @@ -3294,6 +3423,7 @@ files = [ ] [[package]] +>>>>>>> development name = "yarl" version = "1.9.4" description = "Yet another URL library" @@ -3396,6 +3526,12 @@ files = [ idna = ">=2.0" multidict = ">=4.0" +<<<<<<< HEAD +[metadata] +lock-version = "2.0" +python-versions = "^3.9" +content-hash = "50719ec1c0bba387d736fdeb55abc8f2ea0bc1c92526097a6df92e074e09e7c2" +======= [[package]] name = "zipp" version = "3.18.1" @@ -3415,3 +3551,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p lock-version = "2.0" python-versions = "^3.9" content-hash = "3af8f413db5db85b6d22ad2a542577a2db5c983d949c0a014bd251ff59521171" +>>>>>>> development diff --git a/backend/pyproject.toml b/backend/pyproject.toml index af389a3e..628ab3d5 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -29,11 +29,14 @@ einops = "*" accelerate = "*" together = "^1.0.1" pydantic-settings = "^2.2.1" +<<<<<<< HEAD +======= pandas = "^2.2.1" [tool.poetry.group.dev.dependencies] ipykernel = "^6.29.4" scipy = "^1.13.0" +>>>>>>> development [build-system] requires = ["poetry-core"]
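
Note: as a reference for reviewers, below is a minimal standalone sketch (not part of the diff) of how one entry from the paper_metadata.json array added above maps into the Paper DTO, mirroring the field mapping in the new utils/scripts/seed_db.py. The zero fallbacks for null citation_count / inf_cite_count values (the final TensorFlow entry has nulls for both) are an assumption for illustration; the committed seed script only applies a fallback to publication_date.

# Hypothetical helper mirroring the field mapping in utils/scripts/seed_db.py.
# Assumptions: zero fallbacks for null citation fields, and the same working
# directory as seed_db() so the relative JSON path resolves.
import json
import os
import uuid


def parse_metadata_entry(paper: dict) -> dict:
    return {
        # Counts are stored as floats in the JSON (e.g. 2544.0); cast to int.
        "cite_count": int(paper["citation_count"]) if paper.get("citation_count") is not None else 0,  # assumed fallback
        "inf_cite_count": int(paper["inf_cite_count"]) if paper.get("inf_cite_count") is not None else 0,  # assumed fallback
        "publication_date": paper.get("publication_date") or "2021-06-28",  # same default as seed_db.py
        "pdf_blob": f"https://arxiv.org/pdf/{paper['id']}.pdf",
        "raw_markdown": "# test",
        "title": paper["title"],
        "abstract": paper["abstract"],
        "arxiv_id": paper["id"],
        "id": str(uuid.uuid4()),
    }


if __name__ == "__main__":
    with open(os.path.join(os.getcwd(), "utils/scripts/paper_metadata.json")) as f:
        entries = json.load(f)
    # Print the first mapped entry's title as a smoke test of the mapping.
    print(parse_metadata_entry(entries[0])["title"])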