-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathregression_rq2.py
54 lines (45 loc) · 2.14 KB
/
regression_rq2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import pandas as pd
import numpy as np
import argparse
from config import *
import statsmodels.formula.api as sm
# Seed NumPy's global RNG so that the np.random-based draws made further down
# in this script are repeatable from run to run.
np.random.seed(42)

# Command-line interface.
#   --dataset          sub-directory of data/ to read from and write to
#   --start_id/end_id  id range embedded in the input and output file names
#   --characteristics  column names used as regressors
#   --strategy         parsed but not read anywhere else in the visible script
#   --alpha            fraction of the edge-dropout rows to keep; the
#                      node-dropout rows are sampled with fraction 1 - alpha
parser = argparse.ArgumentParser(description="Run regression.")
parser.add_argument('--dataset', type=str, default='gowalla')
parser.add_argument('--start_id', type=int, default=0)
parser.add_argument('--end_id', type=int, default=600)
# ACCEPTED_CHARACTERISTICS is not defined in this file; presumably it is
# provided by the star import from config -- confirm.
parser.add_argument('--characteristics', type=str, nargs='+', default=ACCEPTED_CHARACTERISTICS)
parser.add_argument('--strategy', type=str, default='edge_dropout')
# alpha is a sampling fraction, so it has to be parsed as a float. It used to
# be declared type=int, which made argparse reject any fractional value such
# as "--alpha 0.5". Note that an integer-looking value ("--alpha 1") is now
# parsed as 1.0 and is rendered that way in the output file name.
parser.add_argument('--alpha', type=float, default=1.0)
args = parser.parse_args()
# Read the characteristics table of each dropout strategy and immediately
# subsample it: a fraction alpha of the edge-dropout rows and a fraction
# (1 - alpha) of the node-dropout rows. The edge table is sampled before the
# node table so the sequence of random draws stays fixed.
results_edge = pd.read_csv(
    f'data/{args.dataset}/characteristics_{args.start_id}_{args.end_id}_edge_dropout.tsv',
    sep='\t',
).sample(frac=args.alpha)
results_node = pd.read_csv(
    f'data/{args.dataset}/characteristics_{args.start_id}_{args.end_id}_node_dropout.tsv',
    sep='\t',
).sample(frac=(1-args.alpha))

# Stack both subsets, shuffle every row, then renumber the index from 0.
stacked = pd.concat([results_edge, results_node])
shuffled = stacked.sample(frac=1)
results = shuffled.reset_index(drop=True)

# Model / metric pairs to regress on; the target columns are named
# "<model>_<metric>" further down.
models = ['SVDGCN']
metrics = ['Recall']
characteristics = args.characteristics


def _center(column):
    """Return the column with its own mean subtracted."""
    return column - column.mean()


# Mean-centre each characteristic column (no rescaling is applied).
results[characteristics] = results[characteristics].apply(_center)

# Random split: each row lands in the training set when its uniform draw is
# below 0.9, otherwise in the test set.
train_mask = np.random.rand(len(results)) < 0.9
train = results[train_mask]
test = results[~train_mask]
# For every metric, fit one OLS regression per model of the model's score on
# the characteristics columns, then write the fit statistics of all models for
# that metric to a single TSV file.
for metric in metrics:
    models_results = []
    for model in models:
        # Name of the dependent-variable column, e.g. "SVDGCN_Recall".
        target = model + '_' + metric
        # Formula of the form "<target> ~ c1+c2+...".
        formula_str_ml = target + ' ~ ' + '+'.join(characteristics)
        model_ml = sm.ols(formula=formula_str_ml, data=train[characteristics + [target]])
        # Fit with the HC1 covariance estimator instead of the default one.
        fitted_ml = model_ml.fit(cov_type='HC1')
        # One output row per model: R^2, adjusted R^2, every fitted
        # coefficient under its own name, and every p-value under "p_<name>".
        models_results.append({
            'model': model,
            'score': fitted_ml.rsquared,
            'adjusted_score': fitted_ml.rsquared_adj,
            **fitted_ml.params.to_dict(),
            **fitted_ml.pvalues.rename(lambda x: 'p_'+x).to_dict()
        })
    # Written once per metric, after all models have been fitted, because the
    # file name embeds the metric and models_results is reset for each metric.
    df = pd.DataFrame(models_results)
    df.to_csv(f'data/{args.dataset}/regression_{args.alpha}_{metric.lower()}_{args.start_id}_{args.end_id}.tsv',
              sep='\t', index=False)