# Complex_Phenogeno/Functions.py (master branch) - NCBI-Hackathons/Complex_Phenogeno on GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def PhenVar():
	'''
	Interactively asks the user for an rsID and optionally opens PhenVar for it.

	Two prompts are shown: whether a browser should be opened (only the answer
	"yes", in any letter case, opens one) and the numeric part of the rsID.
	The rsID is returned as the string the user typed, whether or not a
	browser window was opened.
	'''
	print("\nAs a first step, you can confirm that your query rsID returns results from literature that are specific to your complex disease of interest.\n")
	# Both questions are always asked, in this order, before anything is opened.
	answer = input("Do you want to open a browser to check your rsID with PhenVar? (Yes/No)")
	rsid = input("Please enter a query rsID number (do not include the letters 'rs'): ")

	if answer.lower() != 'yes':
		return rsid

	import webbrowser
	results_page = ''.join([
		'https://phenvar.colorado.edu/results/?rsids=',
		rsid,
		'&visualization=png-wordcloud&visualization=js-graph&normalization_type=default',
	])
	webbrowser.open_new(results_page)
	print("A new web browser window has opened that is showing the PhenVar results for rs{}".format(rsid))
	return rsid

def rev_dict(dictionary):
	'''
	Inverts a key -> value mapping into a value -> [keys] mapping.

	Each distinct value of the input becomes a key of the result, and maps to
	the list of all input keys (converted with str()) that carried that value,
	in the input's iteration order.
	'''
	grouped = {}
	# Create one empty bucket per distinct value first.
	for value in set(dictionary.values()):
		grouped[value] = []
	for key, value in dictionary.items():
		grouped[value].append(str(key))
	return grouped

def ExtractRsID(Prefix):
	'''
	Runs plink twice for the given file prefix.

	Requires plink 1.9 (https://www.cog-genomics.org/plink2) to be reachable
	through PATH.

	First call : reads ./../files/<Prefix>.vcf.gz, keeps only the variants
	             listed in ./../Outputs/rsids.txt (--extract) and writes the
	             --recode output under <Prefix> in the current directory.
	Second call: reads that <Prefix> fileset (--file) and writes the --recodeAD
	             output, again under <Prefix>.

	The exit status of plink is not inspected; nothing is returned.
	'''
	from os import system
	print("Processing {}".format(Prefix))
	extract_command = 'plink --vcf ./../files/{}.vcf.gz --recode --extract ./../Outputs/rsids.txt --out {}'.format(Prefix, Prefix)
	recode_command = 'plink --file {} --recodeAD --out {}'.format(Prefix, Prefix)
	# Order matters: the second command consumes what the first one wrote.
	for command in (extract_command, recode_command):
		system(command)

def importRaw(Prefix):
	'''
	Loads <Prefix>.raw (space separated, as created by ExtractRsID) and cleans it up.

	The first column is always kept. Of the remaining columns, only those
	whose name contains 'rs' and does not contain 'HET' survive; all others
	are dropped. The cleaned table is returned.
	'''
	import pandas as pd
	table = pd.read_table(Prefix + '.raw', sep=' ')

	def is_wanted(column):
		# Keep rsID genotype columns, discard their '..._HET' companions.
		return 'rs' in column and 'HET' not in column

	unwanted = [column for column in table.columns[1:] if not is_wanted(column)]
	table.drop(columns=unwanted, inplace=True)
	return table

def clin_data(name, pct_train, regressor):

    ''' Reads ./../files/<name>.csv and randomly splits its rows into train and test sets.

    name      = file name (without the .csv extension) inside ./../files/
    pct_train = fraction of rows expected to land in the train set; every row
                independently draws a uniform number in [0, 1) and goes to
                train when that number is <= pct_train, so the realised split
                is only approximately pct_train
    regressor = name of the column holding the output (Y)

    The first CSV column is used as the row index (index_col = 0).

    Returns a dictionary with the keys
        'train_Y', 'test_Y' : the regressor column of each split
        'train_X', 'test_X' : all remaining columns of each split
        'dataset'           : the complete table as read from disk
    '''

    import pandas as pd
    import numpy as np

    data = pd.read_csv('./../files/' + name + '.csv', sep=",", index_col=0)
    is_train = np.random.uniform(0, 1, len(data)) <= pct_train

    # Select rows by POSITION. The mask is positional, while the index holds
    # whatever the first CSV column contained (IDs, strings, ...), so matching
    # row numbers against index labels would silently lose rows.
    train_data = data.iloc[is_train]
    test_data = data.iloc[~is_train]

    return {
        'train_Y': train_data[regressor],
        'test_Y': test_data[regressor],
        'train_X': train_data.drop(regressor, axis=1),
        'test_X': test_data.drop(regressor, axis=1),
        'dataset': data,
    }

def empirical_auc(data, Y_hat):
    ''' Integrates true-positive rate over false-positive rate across integer cut-offs.

    data  = the dictionary returned from function clin_data(); only its
            'test_Y' entry (the observed values) is read
    Y_hat = the predicted values, one per observed value and in the same order

    Every integer cut-off i in range(min(observed) + 2, max(observed) - 2) is
    applied to both the observed and the predicted values (value >= i counts
    as positive), giving one (FPR, TPR) point per cut-off. The points are
    integrated with the trapezoidal rule. No (0, 0) or (1, 1) end points are
    added, and an empty cut-off range gives 0.

    Values are paired by position, so a Y_hat that carries its own index
    (for example a pandas Series) is handled like a plain list.

    Raises ValueError when the numbers of observed and predicted values
    differ. A ZeroDivisionError is possible when counting stops early (see
    the warning branch below) before any positive or any negative
    observation was counted.
    '''

    # numpy.trapz is deprecated in NumPy 2.0 in favour of numpy.trapezoid;
    # use whichever this installation provides.
    try:
        from numpy import trapezoid as integrate
    except ImportError:
        from numpy import trapz as integrate

    observed = list(data['test_Y'])
    predicted = list(Y_hat)
    if len(observed) != len(predicted):
        raise ValueError(
            "Observed and predicted values must have the same length "
            "({} != {}).".format(len(observed), len(predicted)))

    TPR = []
    FPR = []

    for cutoff in range(min(observed) + 2, max(observed) - 2, 1):
        TP = 0
        FP = 0
        TN = 0
        FN = 0
        for obs, pred in zip(observed, predicted):
            if obs >= cutoff and pred >= cutoff:
                TP = TP + 1
            elif obs < cutoff and pred < cutoff:
                TN = TN + 1
            elif obs >= cutoff and pred < cutoff:
                FN = FN + 1
            elif obs < cutoff and pred >= cutoff:
                FP = FP + 1
            else:
                # Only reachable when a value is neither >= nor < the cut-off
                # (e.g. NaN). Counting for this cut-off stops here.
                print("Something went wrong with calculating AUC scores.\n The calculated AUC might be inaccurate.")
                break

        TPR.append(TP / (TP + FN))
        FPR.append(FP / (TN + FP))

    # Cut-offs were visited in increasing order, so the lists are reversed
    # before integrating, as in the original implementation.
    return integrate(TPR[::-1], x=FPR[::-1])