NTaylor committed on
Commit 4486c9b · 1 Parent(s): 124182f

Uploaded main app file and package requirements

Files changed (2)
  1. app.py +239 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,239 @@
"""
Demo is derived from https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-fa-model-selection-py
"""

import numpy as np
import matplotlib.pyplot as plt
import gradio as gr

from scipy import linalg

from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.covariance import ShrunkCovariance, LedoitWolf
from sklearn.model_selection import cross_val_score, GridSearchCV


def create_dataset(n_samples=500, n_features=25, rank=5, sigma=1.0, random_state=42, n_components=5):
    """
    Create a random low-rank dataset and corrupt it with homoscedastic noise
    and heteroscedastic noise.
    """
    rng = np.random.RandomState(random_state)
    U, _, _ = linalg.svd(rng.randn(n_features, n_features))
    # n_features must be >= rank because we take a dot product with U[:, :rank].T
    X = np.dot(rng.randn(n_samples, rank), U[:, :rank].T)

    # Add homoscedastic noise (same variance for every feature)
    X_homo = X + sigma * rng.randn(n_samples, n_features)

    # Add heteroscedastic noise (a different variance for each feature)
    sigmas = sigma * rng.rand(n_features) + sigma / 2.0
    X_hetero = X + rng.randn(n_samples, n_features) * sigmas

    n_components_range = np.arange(0, n_features, n_components)
    return X_homo, X_hetero, n_components_range, rank


def compute_scores(X, n_components_range):
    """
    Run PCA and FA with different numbers of components and cross-validate each fit.

    Returns the mean cross-validated scores (held-out log-likelihoods) for PCA and FA.
    """
    pca = PCA(svd_solver="full")
    fa = FactorAnalysis()

    pca_scores, fa_scores = [], []
    for n in n_components_range:
        pca.n_components = n
        fa.n_components = n
        pca_scores.append(np.mean(cross_val_score(pca, X)))
        fa_scores.append(np.mean(cross_val_score(fa, X)))

    return pca_scores, fa_scores


def shrunk_cov_score(X):
    # Cross-validated log-likelihood of a ShrunkCovariance estimator,
    # with the shrinkage chosen by grid search
    shrinkages = np.logspace(-2, 0, 30)
    cv = GridSearchCV(ShrunkCovariance(), {"shrinkage": shrinkages})
    return np.mean(cross_val_score(cv.fit(X).best_estimator_, X))


def lw_score(X):
    # Cross-validated log-likelihood of a Ledoit-Wolf covariance estimate
    return np.mean(cross_val_score(LedoitWolf(), X))


# TODO - allow selection of one or both methods
# def plot_pca_fa_analysis(n_features, n_components):
#
#     '''
#     Function to plot results of PCA and FA cross validation analysis
#     '''
#
#     X_homo, X_hetero, n_components_range, rank = create_dataset(n_features=n_features, n_components=n_components)
#
#     for X, title in [(X_homo, "Homoscedastic Noise"), (X_hetero, "Heteroscedastic Noise")]:
#
#         # compute the pca and fa scores
#         pca_scores, fa_scores = compute_scores(X, n_components_range=n_components_range)
#         n_components_pca = n_components_range[np.argmax(pca_scores)]
#         n_components_fa = n_components_range[np.argmax(fa_scores)]
#
#         pca = PCA(svd_solver="full", n_components="mle")
#         pca.fit(X)
#         n_components_pca_mle = pca.n_components_
#
#         print("best n_components by PCA CV = %d" % n_components_pca)
#         print("best n_components by FactorAnalysis CV = %d" % n_components_fa)
#         print("best n_components by PCA MLE = %d" % n_components_pca_mle)
#
#         fig = plt.figure()
#         fig, (ax1, ax2) = plt.subplots(1, 2)
#         plt.plot(n_components_range, pca_scores, "b", label="PCA scores")
#         plt.plot(n_components_range, fa_scores, "r", label="FA scores")
#         plt.axvline(rank, color="g", label="TRUTH: %d" % rank, linestyle="-")
#         plt.axvline(
#             n_components_pca,
#             color="b",
#             label="PCA CV: %d" % n_components_pca,
#             linestyle="--",
#         )
#         plt.axvline(
#             n_components_fa,
#             color="r",
#             label="FactorAnalysis CV: %d" % n_components_fa,
#             linestyle="--",
#         )
#         plt.axvline(
#             n_components_pca_mle,
#             color="k",
#             label="PCA MLE: %d" % n_components_pca_mle,
#             linestyle="--",
#         )
#
#         # compare with other covariance estimators
#         plt.axhline(
#             shrunk_cov_score(X),
#             color="violet",
#             label="Shrunk Covariance MLE",
#             linestyle="-.",
#         )
#         plt.axhline(
#             lw_score(X),
#             color="orange",
#             label="LedoitWolf MLE",
#             linestyle="-.",
#         )
#
#         plt.xlabel("nb of components")
#         plt.ylabel("CV scores")
#         plt.legend(loc="lower right")
#         plt.title(title)
#
#     return fig


def plot_pca_fa_analysis_side(n_samples, n_features, n_components):
    """
    Plot the PCA and FA cross-validation analysis for the homoscedastic and
    heteroscedastic datasets as two stacked panels.
    """
    X_homo, X_hetero, n_components_range, rank = create_dataset(
        n_samples=n_samples, n_features=n_features, n_components=n_components
    )

    # set up the figure - one panel per noise type, sharing the x axis
    fig, axes = plt.subplots(2, 1, sharey=False, sharex=True, figsize=(10, 8))

    for X, title, idx in [(X_homo, "Homoscedastic Noise", 0), (X_hetero, "Heteroscedastic Noise", 1)]:

        # compute the pca and fa scores
        pca_scores, fa_scores = compute_scores(X, n_components_range=n_components_range)
        n_components_pca = n_components_range[np.argmax(pca_scores)]
        n_components_fa = n_components_range[np.argmax(fa_scores)]

        pca = PCA(svd_solver="full", n_components="mle")
        pca.fit(X)
        n_components_pca_mle = pca.n_components_

        print("best n_components by PCA CV = %d" % n_components_pca)
        print("best n_components by FactorAnalysis CV = %d" % n_components_fa)
        print("best n_components by PCA MLE = %d" % n_components_pca_mle)

        axes[idx].plot(n_components_range, pca_scores, "b", label="PCA scores")
        axes[idx].plot(n_components_range, fa_scores, "r", label="FA scores")
        axes[idx].axvline(rank, color="g", label="TRUTH: %d" % rank, linestyle="-")
        axes[idx].axvline(
            n_components_pca,
            color="b",
            label="PCA CV: %d" % n_components_pca,
            linestyle="--",
        )
        axes[idx].axvline(
            n_components_fa,
            color="r",
            label="FactorAnalysis CV: %d" % n_components_fa,
            linestyle="--",
        )
        axes[idx].axvline(
            n_components_pca_mle,
            color="k",
            label="PCA MLE: %d" % n_components_pca_mle,
            linestyle="--",
        )

        # compare with other covariance estimators
        axes[idx].axhline(
            shrunk_cov_score(X),
            color="violet",
            label="Shrunk Covariance MLE",
            linestyle="-.",
        )
        axes[idx].axhline(
            lw_score(X),
            color="orange",
            label="LedoitWolf MLE",
            linestyle="-.",
        )

        axes[idx].set_xlabel("nb of components")
        axes[idx].set_ylabel("CV scores")
        axes[idx].legend(loc="lower right")
        axes[idx].set_title(title)

    return fig


title = "Illustration of Model Selection with Probabilistic PCA and Factor Analysis (FA)"
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(" This example shows how one can use Principal Component Analysis (PCA) and Factor Analysis (FA) for model selection, by observing the likelihood of a held-out dataset with added noise. <br>"
                " The number of samples (n_samples) determines the number of data points to produce. <br>"
                " The number of components (n_components) determines the number of components each method fits to, and affects the likelihood of the held-out set. <br>"
                " The number of features (n_features) determines the number of features the toy dataset X will have. <br>"
                " Play with the n_components parameter to see how the cross-validation curves change.<br>")

    gr.Markdown(" **[Demo is based on the sklearn docs](https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-fa-model-selection-py)** <br>")

    gr.Markdown(" **Dataset**: a toy dataset corrupted with homoscedastic noise (noise variance is the same for each feature) or heteroscedastic noise (noise variance is different for each feature). <br>")
    gr.Markdown(" Different numbers of features and components affect how well the low-rank space is recovered. <br>")

    with gr.Row():
        n_samples = gr.Slider(value=100, minimum=100, maximum=1000, step=100, label="n_samples")
        n_components = gr.Slider(value=2, minimum=1, maximum=20, step=1, label="n_components")
        n_features = gr.Slider(value=5, minimum=5, maximum=25, step=1, label="n_features")

    btn = gr.Button(value="Submit")
    btn.click(
        plot_pca_fa_analysis_side,
        inputs=[n_samples, n_features, n_components],
        outputs=gr.Plot(label="PCA vs FA model selection"),
    )


demo.launch()
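The selection rule the app visualizes can also be exercised outside Gradio. Below is a minimal, self-contained sketch (it uses its own small toy matrix rather than create_dataset, so names and shapes here are illustrative assumptions): with no explicit scorer, cross_val_score falls back to the estimator's score method, which for PCA and FactorAnalysis is the average log-likelihood of held-out samples, so the best n_components is simply the one with the highest cross-validated score.

# Minimal standalone sketch of the model-selection rule used above (toy data,
# not the app's dataset): pick the n_components with the highest
# cross-validated held-out log-likelihood.
import numpy as np
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.model_selection import cross_val_score

rng = np.random.RandomState(0)
# rank-5 signal embedded in 20 dimensions, plus isotropic noise
X = rng.randn(300, 5) @ rng.randn(5, 20) + 0.5 * rng.randn(300, 20)

candidates = [1, 3, 5, 7, 10]
pca_scores = [cross_val_score(PCA(n_components=n, svd_solver="full"), X).mean() for n in candidates]
fa_scores = [cross_val_score(FactorAnalysis(n_components=n), X).mean() for n in candidates]

print("best PCA n_components:", candidates[int(np.argmax(pca_scores))])
print("best FA  n_components:", candidates[int(np.argmax(fa_scores))])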
requirements.txt ADDED
@@ -0,0 +1,3 @@
scikit-learn==1.2.2
matplotlib==3.5.1
numpy==1.21.6
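With the packages pinned above (plus gradio itself, which is assumed here to be provided by the Spaces runtime and is therefore not pinned), the horizontal reference lines that shrunk_cov_score and lw_score draw in the app can be reproduced on their own. A minimal sketch on toy data, not the app's dataset:

# Standalone sketch of the covariance baselines shown as horizontal lines in the app:
# cross-validated log-likelihood of a tuned ShrunkCovariance and of Ledoit-Wolf.
import numpy as np
from sklearn.covariance import LedoitWolf, ShrunkCovariance
from sklearn.model_selection import GridSearchCV, cross_val_score

rng = np.random.RandomState(0)
X = rng.randn(300, 5) @ rng.randn(5, 20) + 0.5 * rng.randn(300, 20)

grid = GridSearchCV(ShrunkCovariance(), {"shrinkage": np.logspace(-2, 0, 30)}).fit(X)
print("Shrunk Covariance:", cross_val_score(grid.best_estimator_, X).mean())
print("Ledoit-Wolf      :", cross_val_score(LedoitWolf(), X).mean())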