-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocess.py
56 lines (45 loc) · 1.83 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from datetime import datetime
import pandas as pd
import os
def convert_dates(date):
    """
    Turn a Unix-epoch timestamp expressed in milliseconds into a datetime.

    :param date: milliseconds elapsed since the Unix epoch
    :returns: naive datetime object; no timezone is passed to
        ``datetime.fromtimestamp``, so the value is in the machine's
        local time
    """
    millis_per_second = 1000
    epoch_seconds = date / millis_per_second
    return datetime.fromtimestamp(epoch_seconds)
def read_trade_data(filepath):
    """
    Read csv containing crypto trade data at <filepath>, convert time fields
    and select relevant columns.

    The csv must provide 'date', 'exchange', 'symbol', 'price', 'amount' and
    'sell' columns; 'date' is passed through convert_dates (Unix epoch
    milliseconds).

    :param filepath: path to csv file
    :returns: (exchange, symbol, preprocessed Pandas dataframe); the
        dataframe is indexed by the converted 'date' in ascending order and
        holds the 'price', 'amount' and 'sell' columns
    :raises AssertionError: if the file does not contain exactly one distinct
        exchange and exactly one distinct symbol
    """
    trades = pd.read_csv(filepath)

    # convert dates from unix timestamps and order the trades chronologically
    trades['date'] = trades['date'].apply(convert_dates)
    trades = trades.set_index('date').sort_index(ascending=True)

    # check data relates to a single exchange and coin pair. The error is
    # raised explicitly rather than through an `assert` statement so the
    # check still runs under `python -O`; AssertionError is kept so existing
    # callers see the same exception type as before.
    for column, label in (('exchange', 'exchanges'), ('symbol', 'symbols')):
        distinct = trades[column].nunique()
        if distinct == 0:
            # no rows, or a column holding only missing values (nunique
            # ignores them) -- "Multiple ..." would be misleading here
            raise AssertionError(f'No {label} present')
        if distinct > 1:
            raise AssertionError(f'Multiple {label} present')

    # select relevant columns
    trade_features = trades[['price', 'amount', 'sell']]
    exchange = trades['exchange'].iloc[0]
    symbol = trades['symbol'].iloc[0]
    return exchange, symbol, trade_features
def write_processed(exchange, symbol, data, loc=None):
    """
    Save processed features for one exchange and coin pair as a parquet
    file called '<exchange>_<symbol>_trades.parquet'.

    :param exchange: str exchange name
    :param symbol: str symbol name
    :param data: dataframe to save (its ``to_parquet`` method is used)
    :param loc: directory in which to save the file; the current working
        directory is used when this is None
    """
    basename = f'{exchange}_{symbol}_trades.parquet'

    # fall back to the working directory only when no location was supplied
    if loc is None:
        target_dir = os.getcwd()
    else:
        target_dir = loc

    destination = os.path.join(target_dir, basename)
    data.to_parquet(destination)
if __name__ == "__main__":
    # Script entry point: preprocess the sample trades csv and write the
    # result as parquet into the current working directory (the default
    # location used by write_processed).
    source_csv = 'Bitfinex_BTCEUR_trades_2018_02_02.csv'
    write_processed(*read_trade_data(source_csv))