-
Notifications
You must be signed in to change notification settings - Fork 4
/
bucket.js
173 lines (144 loc) · 4.47 KB
/
bucket.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
const request = require("request-promise-native");
const xml2js = require("xml2js");
const idx = require("idx");
const lodash = require("lodash");
const { promisify } = require("es6-promisify");
const logger = require("./util/logger");
/**
* Lookup available data dumps on the S3 bucket
* @module bucket
*/
// Promise-returning wrapper around xml2js's callback-style parseString
const parseString = promisify(xml2js.parseString);
// Base URL of the S3 bucket; listing queries and file URLs are built from it
const BUCKET_URL = "https://discogs-data.s3-us-west-2.amazonaws.com";
// Default prefix used when listing the bucket (see createS3QueryUrl)
const S3B_ROOT_DIR = "data/";
/**
 * Build the download URL of one gzipped XML data dump.
 * The dump lives in a directory named after the first four characters of
 * the version (its year).
 * @param version {string} The exact version name, eg '20180101'
 * @param collection {string} The type of data. Can be either "artists",
 * "labels", "masters" or "releases"
 * @returns {string} The absolute URL of the `.xml.gz` dump file
 */
function getDumpURL(version, collection) {
  const yearDir = version.substring(0, 4);
  const fileName = `discogs_${version}_${collection}.xml.gz`;
  return `https://discogs-data.s3-us-west-2.amazonaws.com/data/${yearDir}/${fileName}`;
}
/**
 * Build the URL of the checksum file that accompanies a data dump version.
 * The file lives in a directory named after the first four characters of
 * the version (its year).
 * @param version {string} The exact version name, eg '20180101'
 * @returns {string} The absolute URL of the `CHECKSUM.txt` file
 */
function getChecksumURL(version) {
  const yearDir = version.substring(0, 4);
  const fileName = `discogs_${version}_CHECKSUM.txt`;
  return `https://discogs-data.s3-us-west-2.amazonaws.com/data/${yearDir}/${fileName}`;
}
/**
 * Build the bucket listing URL for a given prefix.
 * The listing always uses "/" as delimiter. Values are inserted as-is,
 * without URL-encoding.
 * @param prefix {string} Key prefix to list; defaults to the bucket's root
 * data directory. It is normalized to end with exactly one "/". A falsy
 * value (eg an empty string) omits the prefix parameter altogether.
 * @param marker {string} Optional listing marker, appended only when truthy
 * @returns {string} The listing URL
 */
function createS3QueryUrl(prefix = S3B_ROOT_DIR, marker) {
  const queryParts = ["delimiter=/"];
  if (prefix) {
    const withoutTrailingSlash = prefix.replace(/\/$/, "");
    queryParts.push(`prefix=${withoutTrailingSlash}/`);
  }
  if (marker) {
    queryParts.push(`marker=${marker}`);
  }
  return `${BUCKET_URL}?${queryParts.join("&")}`;
}
/**
 * Request a directory listing from the bucket and parse the XML response.
 * @param yearPrefix {string} Optional prefix handed to createS3QueryUrl;
 * when omitted that function's default prefix applies
 * @returns {Promise<Object>} The listing as parsed by xml2js
 * @throws {Error} When the HTTP request fails or the response cannot be
 * parsed as XML
 */
async function requestListing(yearPrefix) {
  const listingUrl = createS3QueryUrl(yearPrefix);

  let responseBody;
  try {
    responseBody = await request({ url: listingUrl });
  } catch (requestError) {
    throw new Error(`Failed to request ${listingUrl}:\n${requestError}`);
  }

  try {
    return await parseString(responseBody);
  } catch (parseError) {
    throw new Error(`Error parsing directory XML: ${parseError}`);
  }
}
/**
 * Fetch the set of years available on the Discogs data S3 bucket together
 * with their paths on the bucket, sorted newest year first.
 * Prefixes whose name does not parse as a year are left out.
 * @returns {Promise<Array<{path:string, year:number}>>}
 * @throws {Error} When the listing contains no common prefixes
 */
async function fetchYearListings() {
  logger.status("Fetching year listings...", true);
  const parsed = await requestListing();
  const prefixes = idx(parsed, _ => _.ListBucketResult.CommonPrefixes);
  if (!prefixes) {
    throw new Error("Could not find prefixes on S3 directory listings");
  }
  const years = prefixes
    .map(({ Prefix: [Prefix] }) => ({
      path: Prefix,
      // "data/2016/" -> 2016
      year: parseInt(Prefix.replace(/^data\//, "").replace(/\/$/, ""), 10)
    }))
    // A NaN year would make the comparator below inconsistent and could
    // put a non-year directory at the front of the list
    .filter(({ year }) => !Number.isNaN(year));
  years.sort((a, b) => b.year - a.year);
  return years;
}
/**
 * Fetch the paths of the gzipped XML dumps stored under one year prefix.
 * A single listing request is issued; entries whose key does not end in
 * ".xml.gz" are left out.
 * @param yearPrefix {string} The year prefix of the file. For example:
 * "data/2016/"
 * @returns {Promise<Array<string>>} An array of paths
 * @throws {Error} When the listing contains no file entries
 */
async function fetchFileListing(yearPrefix) {
  logger.status("Fetching file listings...", true);
  const listing = await requestListing(yearPrefix);
  const contents = idx(listing, _ => _.ListBucketResult.Contents);
  if (!contents) {
    throw new Error("Could not find files on S3 listings");
  }
  // xml2js wraps element values in arrays, hence the unwrapping of Key
  const keys = contents.map(entry => {
    const [key] = entry.Key;
    return key;
  });
  const isDump = key => key.endsWith(".xml.gz");
  return keys.filter(isDump);
}
/**
 * Parse a list of file paths (as returned by fetchFileListing) into a
 * nested lookup: first by the numeric version embedded in the file name
 * (eg 20180101, exposed on each entry as `year`), then by data type.
 * @param filenames {Array<string>}
 * @returns {Object} An object keyed by version number. Each value is an
 * object keyed by type whose values are `{url, path, year, type}` records.
 * @throws {Error} When a path does not look like `discogs_<digits>_<type>.xml`
 */
function parseFileNames(filenames) {
  const filePattern = /discogs_([\d]+)_([^.]+)\.xml/;

  const describeFile = path => {
    const parts = path.match(filePattern);
    if (!parts) {
      throw new Error("Unable to parse filenames from S3 listing");
    }
    const [, version, type] = parts;
    return {
      url: `${BUCKET_URL}/${path}`,
      path,
      year: parseInt(version, 10),
      type
    };
  };

  const entries = filenames.map(path => describeFile(path));
  const byVersion = lodash.groupBy(entries, entry => entry.year);
  return lodash.mapValues(byVersion, group =>
    lodash.keyBy(group, entry => entry.type)
  );
}
/**
 * Gets the name of the latest version available in the S3 bucket.
 * Years are inspected newest first; if the newest year holds no data dumps
 * yet, the previous years are tried in turn.
 * @returns {Promise<string>} A promise that resolves with the version name,
 * eg '20180101'
 * @throws {Error} When no data dump can be found in any listed year
 */
async function getLatestVersion() {
  const years = await fetchYearListings();
  // Sequential on purpose: older years are only requested when the newer
  // ones turn out to contain no dumps
  for (const { path } of years) {
    const files = await fetchFileListing(path);
    const versionNumbers = Object.keys(parseFileNames(files)).map(v =>
      parseInt(v, 10)
    );
    if (versionNumbers.length > 0) {
      return Math.max(...versionNumbers).toString();
    }
  }
  throw new Error("No data dumps found on the S3 bucket");
}
// Public API of the bucket module; createS3QueryUrl and requestListing stay
// internal to this file
module.exports = {
fetchYearListings,
getDumpURL,
getChecksumURL,
getLatestVersion,
fetchFileListing,
parseFileNames
};