-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
62 lines (47 loc) · 1.29 KB
/
preprocessing.py
File metadata and controls
62 lines (47 loc) · 1.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd
import numpy as np
def load_data(
    path: str,
    dropna: bool = True
) -> tuple[pd.DataFrame, list[str]]:
    """Load the .csv at the given path and select its numeric feature columns.

    Args:
        path: Location of the .csv file; handed directly to ``pd.read_csv``.
        dropna: When True, drop every row that has a missing value in at
            least one of the selected feature columns. Missing values in
            other columns do not cause a row to be dropped.

    Returns:
        A ``(df, features)`` tuple. ``df`` is the loaded dataframe with all
        of its columns; ``features`` lists the names of the numeric columns
        that are not in the exclusion list defined in the body.

    Note:
        As a side effect, this sets the global pandas option
        ``display.float_format`` so floats are displayed with one decimal.
    """
    # index_col=False asks pandas not to use the first column as the index.
    df = pd.read_csv(path, index_col=False)

    # Columns that are never used as features, even when they are numeric.
    excluded = {
        'Index',
        'Hogwarts House',
        'Arithmancy',
        'Care of Magical Creatures',
        'Potions',
        'Flying',
    }
    features = [
        column
        for column in df.select_dtypes('number').columns
        if column not in excluded
    ]

    if dropna:
        # Only feature columns decide whether a row is incomplete.
        df = df.dropna(subset=features)

    # Global display setting: show floats with 1 decimal.
    pd.options.display.float_format = '{:.1f}'.format
    return df, features
def train_test_split(
    x: np.ndarray,
    y: np.ndarray,
    test_size: float,
    *,
    random_state: 'int | None' = None
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Shuffle x and y together and split them into train and test parts.

    Args:
        x: Samples, indexed along the first axis.
        y: Targets; must have the same length as ``x``.
        test_size: Fraction of the samples, between 0 and 1 inclusive, to
            place in the test part. The test count is ``int(n * test_size)``,
            i.e. it is rounded down.
        random_state: Optional seed for a reproducible shuffle. When left as
            None, NumPy's global random state is used, so a prior call to
            ``np.random.seed`` still controls the result.

    Returns:
        ``(X_train, X_test, y_train, y_test)``, where rows of the x parts
        stay aligned with the corresponding entries of the y parts.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length, or if ``test_size``
            is outside the range [0, 1].
    """
    if len(x) != len(y):
        raise ValueError('x and y have to be of same size.')
    if not (0 <= test_size <= 1):
        raise ValueError('test_size must be between 0 and 1.')

    n = len(x)
    n_test = int(n * test_size)
    n_train = n - n_test

    if random_state is None:
        # Legacy path: draw from the global NumPy random state.
        indices = np.arange(n)
        np.random.shuffle(indices)
    else:
        # Dedicated generator so the split does not touch global state.
        indices = np.random.default_rng(random_state).permutation(n)

    # Apply the same permutation to both arrays to keep pairs aligned.
    x_shuffle = x[indices]
    y_shuffle = y[indices]

    X_train, X_test = x_shuffle[:n_train], x_shuffle[n_train:]
    y_train, y_test = y_shuffle[:n_train], y_shuffle[n_train:]
    return X_train, X_test, y_train, y_test