# TimeSeriesForecastingNN/curateData.py at main · ElisonSherton/TimeSeriesForecastingNN · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from load_data import stockTickerDataset

def curateData(pth, price_col, date_col, n_steps):
    """Build lagged input/output pairs from a CSV holding a price series.

    For every retained row, the input is the window of the ``n_steps``
    previous prices and the output is that row's own price.

    Args:
        pth (str): Path to the csv file (forwarded as-is to ``pd.read_csv``).
        price_col (str): Name of the column that holds the price to forecast.
        date_col (str): Name of the column that holds the date values.
        n_steps (int): Number of steps/lags on which a prediction is based.

    Returns:
        tuple: ``(ips, ops, dates)`` where
            * ``ips`` is a list of tuples, each holding the ``n_steps`` lagged
              prices ordered from the largest lag to the smallest
              (``lag_n, ..., lag_1``),
            * ``ops`` is a list with the ``price_col`` value of each row,
            * ``dates`` is a list with the ``date_col`` value of each row.
    """
    frame = pd.read_csv(pth)

    # One shifted copy of the price column per lag: lag_1 ... lag_n
    lag_cols = [f"lag_{k}" for k in range(1, n_steps + 1)]
    for k, col in enumerate(lag_cols, start=1):
        frame[col] = frame[price_col].shift(periods=k)

    # Keep date, price and lags; the first n_steps rows are skipped because
    # their lag columns contain NaN after shifting.
    # NOTE(review): the slice also drops the final row of the file -- confirm
    # that this is intentional.
    window = frame[[date_col, price_col] + lag_cols].iloc[n_steps:-1, :]

    # Dates that the input/output pairs below correspond to
    dates = list(window[date_col])

    # Each row tuple is (index, date, price, lag_1, ..., lag_n): the trailing
    # n_steps fields are the lags and the field just before them is the price.
    rows = list(window.itertuples())
    ips = [tuple(reversed(row[-n_steps:])) for row in rows]
    ops = [row[-(n_steps + 1)] for row in rows]

    return (ips, ops, dates)

def standardizeData(X, SS=None, train=False):
    """Standardize input features, fitting a new scaler or reusing a given one.

    Args:
        X: The input values to scale (anything accepted by the scaler's
            ``fit_transform`` / ``transform``).
        SS (object, optional): An already-fitted scaler exposing
            ``transform``. Required when ``train`` is False; ignored when
            ``train`` is True, because a fresh ``StandardScaler`` is fitted
            instead. Defaults to None.
        train (bool, optional): If True, fit a new ``StandardScaler`` on ``X``.
            If False, scale ``X`` with the scaler passed in ``SS``.
            Defaults to False.

    Returns:
        tuple: ``(new_X, scaler)``. ``scaler`` is the newly fitted
        ``StandardScaler`` when ``train`` is True and ``None`` otherwise.

    Raises:
        ValueError: If ``train`` is False and no scaler was supplied.
    """
    if train:
        scaler = StandardScaler()
        return (scaler.fit_transform(X), scaler)

    # Without this guard the default arguments fail with an opaque
    # "'NoneType' object has no attribute 'transform'" error.
    if SS is None:
        raise ValueError(
            "standardizeData: a fitted scaler must be passed via SS when train is False"
        )

    return (SS.transform(X), None)

def getDL(x, y, params):
    """Wrap inputs and targets in a dataset and return a pytorch dataloader.

    Args:
        x (list): Inputs list, passed to ``stockTickerDataset``.
        y (list): Target variable list, passed to ``stockTickerDataset``.
        params (dict): Keyword arguments forwarded to
            ``torch.utils.data.DataLoader`` (e.g. batch size).

    Returns:
        torch.utils.data.DataLoader: Loader over the ``stockTickerDataset``
        built from ``x`` and ``y``.
    """
    dataset = stockTickerDataset(x, y)
    return torch.utils.data.DataLoader(dataset, **params)

def get_preds(generator, model):
    """Run the model over every batch of a dataloader and gather the results.

    Args:
        generator (object): An iterable (e.g. a pytorch dataloader) yielding
            ``(inputs, labels)`` batches on which we want to predict.
        model (object): A model exposing ``predict``; it is called once per
            batch.

    Returns:
        tuple: ``(predictions, labels)`` -- the per-batch outputs of
        ``model.predict`` and the per-batch labels, each concatenated with
        ``torch.cat``.

    Note:
        If ``generator`` yields no batches there is nothing to concatenate and
        ``torch.cat`` raises.
    """
    all_preds = []
    all_labels = []
    for xb, yb in generator:
        # The model is fed the batch with an extra leading dimension of size 1.
        # Input batches are not retained: only predictions and labels are returned.
        all_preds.append(model.predict(xb.unsqueeze(0)))
        all_labels.append(yb)
    return (torch.cat(all_preds), torch.cat(all_labels))