Upload 2 files
Browse files- .gitattributes +1 -0
- IMDB.ipynb +658 -0
- imdb-002.keras +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
imdb-002.keras filter=lfs diff=lfs merge=lfs -text
|
IMDB.ipynb
ADDED
@@ -0,0 +1,658 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 3,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import pandas as pd\n",
|
10 |
+
"import numpy as np\n",
|
11 |
+
"import re\n",
|
12 |
+
"import nltk\n",
|
13 |
+
"from nltk.corpus import stopwords\n",
|
14 |
+
"from nltk.stem import WordNetLemmatizer\n",
|
15 |
+
"import tensorflow as tf\n",
|
16 |
+
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
|
17 |
+
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
|
18 |
+
"from tensorflow.keras.models import Sequential\n",
|
19 |
+
"from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout\n",
|
20 |
+
"from sklearn.model_selection import train_test_split\n",
|
21 |
+
"from sklearn.preprocessing import LabelEncoder"
|
22 |
+
]
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"cell_type": "code",
|
26 |
+
"execution_count": 4,
|
27 |
+
"metadata": {},
|
28 |
+
"outputs": [
|
29 |
+
{
|
30 |
+
"name": "stderr",
|
31 |
+
"output_type": "stream",
|
32 |
+
"text": [
|
33 |
+
"[nltk_data] Downloading package stopwords to\n",
|
34 |
+
"[nltk_data] C:\\Users\\gouth\\AppData\\Roaming\\nltk_data...\n",
|
35 |
+
"[nltk_data] Unzipping corpora\\stopwords.zip.\n",
|
36 |
+
"[nltk_data] Downloading package wordnet to\n",
|
37 |
+
"[nltk_data] C:\\Users\\gouth\\AppData\\Roaming\\nltk_data...\n",
|
38 |
+
"[nltk_data] Package wordnet is already up-to-date!\n",
|
39 |
+
"[nltk_data] Downloading package omw-1.4 to\n",
|
40 |
+
"[nltk_data] C:\\Users\\gouth\\AppData\\Roaming\\nltk_data...\n",
|
41 |
+
"[nltk_data] Package omw-1.4 is already up-to-date!\n",
|
42 |
+
"[nltk_data] Downloading package punkt to\n",
|
43 |
+
"[nltk_data] C:\\Users\\gouth\\AppData\\Roaming\\nltk_data...\n",
|
44 |
+
"[nltk_data] Package punkt is already up-to-date!\n"
|
45 |
+
]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"data": {
|
49 |
+
"text/plain": [
|
50 |
+
"True"
|
51 |
+
]
|
52 |
+
},
|
53 |
+
"execution_count": 4,
|
54 |
+
"metadata": {},
|
55 |
+
"output_type": "execute_result"
|
56 |
+
}
|
57 |
+
],
|
58 |
+
"source": [
|
59 |
+
"nltk.download('stopwords')\n",
|
60 |
+
"nltk.download('wordnet')\n",
|
61 |
+
"nltk.download('omw-1.4')\n",
|
62 |
+
"nltk.download('punkt')"
|
63 |
+
]
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"cell_type": "code",
|
67 |
+
"execution_count": 5,
|
68 |
+
"metadata": {},
|
69 |
+
"outputs": [
|
70 |
+
{
|
71 |
+
"data": {
|
72 |
+
"text/html": [
|
73 |
+
"<div>\n",
|
74 |
+
"<style scoped>\n",
|
75 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
76 |
+
" vertical-align: middle;\n",
|
77 |
+
" }\n",
|
78 |
+
"\n",
|
79 |
+
" .dataframe tbody tr th {\n",
|
80 |
+
" vertical-align: top;\n",
|
81 |
+
" }\n",
|
82 |
+
"\n",
|
83 |
+
" .dataframe thead th {\n",
|
84 |
+
" text-align: right;\n",
|
85 |
+
" }\n",
|
86 |
+
"</style>\n",
|
87 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
88 |
+
" <thead>\n",
|
89 |
+
" <tr style=\"text-align: right;\">\n",
|
90 |
+
" <th></th>\n",
|
91 |
+
" <th>review</th>\n",
|
92 |
+
" <th>sentiment</th>\n",
|
93 |
+
" </tr>\n",
|
94 |
+
" </thead>\n",
|
95 |
+
" <tbody>\n",
|
96 |
+
" <tr>\n",
|
97 |
+
" <th>0</th>\n",
|
98 |
+
" <td>One of the other reviewers has mentioned that ...</td>\n",
|
99 |
+
" <td>positive</td>\n",
|
100 |
+
" </tr>\n",
|
101 |
+
" <tr>\n",
|
102 |
+
" <th>1</th>\n",
|
103 |
+
" <td>A wonderful little production. <br /><br />The...</td>\n",
|
104 |
+
" <td>positive</td>\n",
|
105 |
+
" </tr>\n",
|
106 |
+
" <tr>\n",
|
107 |
+
" <th>2</th>\n",
|
108 |
+
" <td>I thought this was a wonderful way to spend ti...</td>\n",
|
109 |
+
" <td>positive</td>\n",
|
110 |
+
" </tr>\n",
|
111 |
+
" <tr>\n",
|
112 |
+
" <th>3</th>\n",
|
113 |
+
" <td>Basically there's a family where a little boy ...</td>\n",
|
114 |
+
" <td>negative</td>\n",
|
115 |
+
" </tr>\n",
|
116 |
+
" <tr>\n",
|
117 |
+
" <th>4</th>\n",
|
118 |
+
" <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
|
119 |
+
" <td>positive</td>\n",
|
120 |
+
" </tr>\n",
|
121 |
+
" <tr>\n",
|
122 |
+
" <th>...</th>\n",
|
123 |
+
" <td>...</td>\n",
|
124 |
+
" <td>...</td>\n",
|
125 |
+
" </tr>\n",
|
126 |
+
" <tr>\n",
|
127 |
+
" <th>49995</th>\n",
|
128 |
+
" <td>I thought this movie did a down right good job...</td>\n",
|
129 |
+
" <td>positive</td>\n",
|
130 |
+
" </tr>\n",
|
131 |
+
" <tr>\n",
|
132 |
+
" <th>49996</th>\n",
|
133 |
+
" <td>Bad plot, bad dialogue, bad acting, idiotic di...</td>\n",
|
134 |
+
" <td>negative</td>\n",
|
135 |
+
" </tr>\n",
|
136 |
+
" <tr>\n",
|
137 |
+
" <th>49997</th>\n",
|
138 |
+
" <td>I am a Catholic taught in parochial elementary...</td>\n",
|
139 |
+
" <td>negative</td>\n",
|
140 |
+
" </tr>\n",
|
141 |
+
" <tr>\n",
|
142 |
+
" <th>49998</th>\n",
|
143 |
+
" <td>I'm going to have to disagree with the previou...</td>\n",
|
144 |
+
" <td>negative</td>\n",
|
145 |
+
" </tr>\n",
|
146 |
+
" <tr>\n",
|
147 |
+
" <th>49999</th>\n",
|
148 |
+
" <td>No one expects the Star Trek movies to be high...</td>\n",
|
149 |
+
" <td>negative</td>\n",
|
150 |
+
" </tr>\n",
|
151 |
+
" </tbody>\n",
|
152 |
+
"</table>\n",
|
153 |
+
"<p>50000 rows × 2 columns</p>\n",
|
154 |
+
"</div>"
|
155 |
+
],
|
156 |
+
"text/plain": [
|
157 |
+
" review sentiment\n",
|
158 |
+
"0 One of the other reviewers has mentioned that ... positive\n",
|
159 |
+
"1 A wonderful little production. <br /><br />The... positive\n",
|
160 |
+
"2 I thought this was a wonderful way to spend ti... positive\n",
|
161 |
+
"3 Basically there's a family where a little boy ... negative\n",
|
162 |
+
"4 Petter Mattei's \"Love in the Time of Money\" is... positive\n",
|
163 |
+
"... ... ...\n",
|
164 |
+
"49995 I thought this movie did a down right good job... positive\n",
|
165 |
+
"49996 Bad plot, bad dialogue, bad acting, idiotic di... negative\n",
|
166 |
+
"49997 I am a Catholic taught in parochial elementary... negative\n",
|
167 |
+
"49998 I'm going to have to disagree with the previou... negative\n",
|
168 |
+
"49999 No one expects the Star Trek movies to be high... negative\n",
|
169 |
+
"\n",
|
170 |
+
"[50000 rows x 2 columns]"
|
171 |
+
]
|
172 |
+
},
|
173 |
+
"execution_count": 5,
|
174 |
+
"metadata": {},
|
175 |
+
"output_type": "execute_result"
|
176 |
+
}
|
177 |
+
],
|
178 |
+
"source": [
|
179 |
+
"df = pd.read_csv(\"D:/New download/archive (1)/IMDB Dataset.csv\")\n",
|
180 |
+
"df"
|
181 |
+
]
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"cell_type": "code",
|
185 |
+
"execution_count": 15,
|
186 |
+
"metadata": {},
|
187 |
+
"outputs": [],
|
188 |
+
"source": [
|
189 |
+
"import emoji\n",
|
190 |
+
"\n",
|
191 |
+
"def preprocess_text(text):\n",
|
192 |
+
" text = text.lower()\n",
|
193 |
+
" text = re.sub(r'<.*?>', '', text)\n",
|
194 |
+
" text = re.sub(r'http\\S+|www\\S+', '', text)\n",
|
195 |
+
" text = re.sub(r'[^a-zA-Z\\s]', '', text)\n",
|
196 |
+
" text = emoji.replace_emoji(text, replace=\"\")\n",
|
197 |
+
" return text"
|
198 |
+
]
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"cell_type": "code",
|
202 |
+
"execution_count": 16,
|
203 |
+
"metadata": {},
|
204 |
+
"outputs": [],
|
205 |
+
"source": [
|
206 |
+
"df[\"c_review\"] = df[\"review\"].apply(preprocess_text)"
|
207 |
+
]
|
208 |
+
},
|
209 |
+
{
|
210 |
+
"cell_type": "code",
|
211 |
+
"execution_count": 17,
|
212 |
+
"metadata": {},
|
213 |
+
"outputs": [
|
214 |
+
{
|
215 |
+
"data": {
|
216 |
+
"text/plain": [
|
217 |
+
"'a wonderful little production the filming technique is very unassuming very oldtimebbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece the actors are extremely well chosen michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great masters of comedy and his life the realism really comes home with the little things the fantasy of the guard which rather than use the traditional dream techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwells murals decorating every surface are terribly well done'"
|
218 |
+
]
|
219 |
+
},
|
220 |
+
"execution_count": 17,
|
221 |
+
"metadata": {},
|
222 |
+
"output_type": "execute_result"
|
223 |
+
}
|
224 |
+
],
|
225 |
+
"source": [
|
226 |
+
"df[\"c_review\"][1]"
|
227 |
+
]
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"cell_type": "code",
|
231 |
+
"execution_count": 19,
|
232 |
+
"metadata": {},
|
233 |
+
"outputs": [],
|
234 |
+
"source": [
|
235 |
+
"le = LabelEncoder()\n",
|
236 |
+
"df[\"n_sentiment\"] = le.fit_transform(df[\"sentiment\"])"
|
237 |
+
]
|
238 |
+
},
|
239 |
+
{
|
240 |
+
"cell_type": "code",
|
241 |
+
"execution_count": 20,
|
242 |
+
"metadata": {},
|
243 |
+
"outputs": [
|
244 |
+
{
|
245 |
+
"data": {
|
246 |
+
"text/html": [
|
247 |
+
"<div>\n",
|
248 |
+
"<style scoped>\n",
|
249 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
250 |
+
" vertical-align: middle;\n",
|
251 |
+
" }\n",
|
252 |
+
"\n",
|
253 |
+
" .dataframe tbody tr th {\n",
|
254 |
+
" vertical-align: top;\n",
|
255 |
+
" }\n",
|
256 |
+
"\n",
|
257 |
+
" .dataframe thead th {\n",
|
258 |
+
" text-align: right;\n",
|
259 |
+
" }\n",
|
260 |
+
"</style>\n",
|
261 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
262 |
+
" <thead>\n",
|
263 |
+
" <tr style=\"text-align: right;\">\n",
|
264 |
+
" <th></th>\n",
|
265 |
+
" <th>review</th>\n",
|
266 |
+
" <th>sentiment</th>\n",
|
267 |
+
" <th>c_review</th>\n",
|
268 |
+
" <th>n_sentiment</th>\n",
|
269 |
+
" </tr>\n",
|
270 |
+
" </thead>\n",
|
271 |
+
" <tbody>\n",
|
272 |
+
" <tr>\n",
|
273 |
+
" <th>0</th>\n",
|
274 |
+
" <td>One of the other reviewers has mentioned that ...</td>\n",
|
275 |
+
" <td>positive</td>\n",
|
276 |
+
" <td>one of the other reviewers has mentioned that ...</td>\n",
|
277 |
+
" <td>1</td>\n",
|
278 |
+
" </tr>\n",
|
279 |
+
" <tr>\n",
|
280 |
+
" <th>1</th>\n",
|
281 |
+
" <td>A wonderful little production. <br /><br />The...</td>\n",
|
282 |
+
" <td>positive</td>\n",
|
283 |
+
" <td>a wonderful little production the filming tech...</td>\n",
|
284 |
+
" <td>1</td>\n",
|
285 |
+
" </tr>\n",
|
286 |
+
" <tr>\n",
|
287 |
+
" <th>2</th>\n",
|
288 |
+
" <td>I thought this was a wonderful way to spend ti...</td>\n",
|
289 |
+
" <td>positive</td>\n",
|
290 |
+
" <td>i thought this was a wonderful way to spend ti...</td>\n",
|
291 |
+
" <td>1</td>\n",
|
292 |
+
" </tr>\n",
|
293 |
+
" <tr>\n",
|
294 |
+
" <th>3</th>\n",
|
295 |
+
" <td>Basically there's a family where a little boy ...</td>\n",
|
296 |
+
" <td>negative</td>\n",
|
297 |
+
" <td>basically theres a family where a little boy j...</td>\n",
|
298 |
+
" <td>0</td>\n",
|
299 |
+
" </tr>\n",
|
300 |
+
" <tr>\n",
|
301 |
+
" <th>4</th>\n",
|
302 |
+
" <td>Petter Mattei's \"Love in the Time of Money\" is...</td>\n",
|
303 |
+
" <td>positive</td>\n",
|
304 |
+
" <td>petter matteis love in the time of money is a ...</td>\n",
|
305 |
+
" <td>1</td>\n",
|
306 |
+
" </tr>\n",
|
307 |
+
" <tr>\n",
|
308 |
+
" <th>...</th>\n",
|
309 |
+
" <td>...</td>\n",
|
310 |
+
" <td>...</td>\n",
|
311 |
+
" <td>...</td>\n",
|
312 |
+
" <td>...</td>\n",
|
313 |
+
" </tr>\n",
|
314 |
+
" <tr>\n",
|
315 |
+
" <th>49995</th>\n",
|
316 |
+
" <td>I thought this movie did a down right good job...</td>\n",
|
317 |
+
" <td>positive</td>\n",
|
318 |
+
" <td>i thought this movie did a down right good job...</td>\n",
|
319 |
+
" <td>1</td>\n",
|
320 |
+
" </tr>\n",
|
321 |
+
" <tr>\n",
|
322 |
+
" <th>49996</th>\n",
|
323 |
+
" <td>Bad plot, bad dialogue, bad acting, idiotic di...</td>\n",
|
324 |
+
" <td>negative</td>\n",
|
325 |
+
" <td>bad plot bad dialogue bad acting idiotic direc...</td>\n",
|
326 |
+
" <td>0</td>\n",
|
327 |
+
" </tr>\n",
|
328 |
+
" <tr>\n",
|
329 |
+
" <th>49997</th>\n",
|
330 |
+
" <td>I am a Catholic taught in parochial elementary...</td>\n",
|
331 |
+
" <td>negative</td>\n",
|
332 |
+
" <td>i am a catholic taught in parochial elementary...</td>\n",
|
333 |
+
" <td>0</td>\n",
|
334 |
+
" </tr>\n",
|
335 |
+
" <tr>\n",
|
336 |
+
" <th>49998</th>\n",
|
337 |
+
" <td>I'm going to have to disagree with the previou...</td>\n",
|
338 |
+
" <td>negative</td>\n",
|
339 |
+
" <td>im going to have to disagree with the previous...</td>\n",
|
340 |
+
" <td>0</td>\n",
|
341 |
+
" </tr>\n",
|
342 |
+
" <tr>\n",
|
343 |
+
" <th>49999</th>\n",
|
344 |
+
" <td>No one expects the Star Trek movies to be high...</td>\n",
|
345 |
+
" <td>negative</td>\n",
|
346 |
+
" <td>no one expects the star trek movies to be high...</td>\n",
|
347 |
+
" <td>0</td>\n",
|
348 |
+
" </tr>\n",
|
349 |
+
" </tbody>\n",
|
350 |
+
"</table>\n",
|
351 |
+
"<p>50000 rows × 4 columns</p>\n",
|
352 |
+
"</div>"
|
353 |
+
],
|
354 |
+
"text/plain": [
|
355 |
+
" review sentiment \\\n",
|
356 |
+
"0 One of the other reviewers has mentioned that ... positive \n",
|
357 |
+
"1 A wonderful little production. <br /><br />The... positive \n",
|
358 |
+
"2 I thought this was a wonderful way to spend ti... positive \n",
|
359 |
+
"3 Basically there's a family where a little boy ... negative \n",
|
360 |
+
"4 Petter Mattei's \"Love in the Time of Money\" is... positive \n",
|
361 |
+
"... ... ... \n",
|
362 |
+
"49995 I thought this movie did a down right good job... positive \n",
|
363 |
+
"49996 Bad plot, bad dialogue, bad acting, idiotic di... negative \n",
|
364 |
+
"49997 I am a Catholic taught in parochial elementary... negative \n",
|
365 |
+
"49998 I'm going to have to disagree with the previou... negative \n",
|
366 |
+
"49999 No one expects the Star Trek movies to be high... negative \n",
|
367 |
+
"\n",
|
368 |
+
" c_review n_sentiment \n",
|
369 |
+
"0 one of the other reviewers has mentioned that ... 1 \n",
|
370 |
+
"1 a wonderful little production the filming tech... 1 \n",
|
371 |
+
"2 i thought this was a wonderful way to spend ti... 1 \n",
|
372 |
+
"3 basically theres a family where a little boy j... 0 \n",
|
373 |
+
"4 petter matteis love in the time of money is a ... 1 \n",
|
374 |
+
"... ... ... \n",
|
375 |
+
"49995 i thought this movie did a down right good job... 1 \n",
|
376 |
+
"49996 bad plot bad dialogue bad acting idiotic direc... 0 \n",
|
377 |
+
"49997 i am a catholic taught in parochial elementary... 0 \n",
|
378 |
+
"49998 im going to have to disagree with the previous... 0 \n",
|
379 |
+
"49999 no one expects the star trek movies to be high... 0 \n",
|
380 |
+
"\n",
|
381 |
+
"[50000 rows x 4 columns]"
|
382 |
+
]
|
383 |
+
},
|
384 |
+
"execution_count": 20,
|
385 |
+
"metadata": {},
|
386 |
+
"output_type": "execute_result"
|
387 |
+
}
|
388 |
+
],
|
389 |
+
"source": [
|
390 |
+
"df"
|
391 |
+
]
|
392 |
+
},
|
393 |
+
{
|
394 |
+
"cell_type": "code",
|
395 |
+
"execution_count": 44,
|
396 |
+
"metadata": {},
|
397 |
+
"outputs": [
|
398 |
+
{
|
399 |
+
"data": {
|
400 |
+
"text/plain": [
|
401 |
+
"'one of the other reviewers has mentioned that after watching just oz episode youll be hooked they are right as this is exactly what happened with methe first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the wordit is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayi would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget pretty pictures painted for mainstream audiences forget charm forget romanceoz doesnt mess around the first episode i ever saw struck me as so nasty it was surreal i couldnt say i was ready for it but as i watched more i developed a taste for oz and got accustomed to the high levels of graphic violence not just violence but injustice crooked guards wholl be sold out for a nickel inmates wholl kill on order and get away with it well mannered middle class inmates being turned into prison bitches due to their lack of street skills or prison experience watching oz you may become comfortable with what is uncomfortable viewingthats if you can get in touch with your darker side'"
|
402 |
+
]
|
403 |
+
},
|
404 |
+
"execution_count": 44,
|
405 |
+
"metadata": {},
|
406 |
+
"output_type": "execute_result"
|
407 |
+
}
|
408 |
+
],
|
409 |
+
"source": [
|
410 |
+
"df['c_review'][0]"
|
411 |
+
]
|
412 |
+
},
|
413 |
+
{
|
414 |
+
"cell_type": "code",
|
415 |
+
"execution_count": 21,
|
416 |
+
"metadata": {},
|
417 |
+
"outputs": [],
|
418 |
+
"source": [
|
419 |
+
"x_train, x_test, y_train, y_test = train_test_split(df[\"c_review\"],df[\"n_sentiment\"], test_size=0.2, random_state=42)"
|
420 |
+
]
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"cell_type": "code",
|
424 |
+
"execution_count": 22,
|
425 |
+
"metadata": {},
|
426 |
+
"outputs": [],
|
427 |
+
"source": [
|
428 |
+
"max_words = 10000\n",
|
429 |
+
"max_len = 20\n",
|
430 |
+
"\n",
|
431 |
+
"token = Tokenizer(num_words=max_words, oov_token= \"<oov>\")\n",
|
432 |
+
"token.fit_on_texts(x_train)\n",
|
433 |
+
"x_train_seq = token.texts_to_sequences(x_train)\n",
|
434 |
+
"x_test_seq = token.texts_to_sequences(x_test)"
|
435 |
+
]
|
436 |
+
},
|
437 |
+
{
|
438 |
+
"cell_type": "code",
|
439 |
+
"execution_count": 26,
|
440 |
+
"metadata": {},
|
441 |
+
"outputs": [],
|
442 |
+
"source": [
|
443 |
+
"x_train_pad = pad_sequences(x_train_seq, maxlen = max_len, padding = 'post', truncating = 'post')\n",
|
444 |
+
"x_test_pad = pad_sequences(x_test_seq, maxlen = max_len, padding = 'post', truncating = 'post')"
|
445 |
+
]
|
446 |
+
},
|
447 |
+
{
|
448 |
+
"cell_type": "code",
|
449 |
+
"execution_count": 35,
|
450 |
+
"metadata": {},
|
451 |
+
"outputs": [],
|
452 |
+
"source": [
|
453 |
+
"X_train_pad = np.array(x_train_pad, dtype=np.int32)\n",
|
454 |
+
"X_test_pad = np.array(x_test_pad, dtype=np.int32)\n",
|
455 |
+
"y_train = np.array(y_train, dtype=np.float32)\n",
|
456 |
+
"y_test = np.array(y_test, dtype=np.float32)"
|
457 |
+
]
|
458 |
+
},
|
459 |
+
{
|
460 |
+
"cell_type": "code",
|
461 |
+
"execution_count": null,
|
462 |
+
"metadata": {},
|
463 |
+
"outputs": [
|
464 |
+
{
|
465 |
+
"name": "stdout",
|
466 |
+
"output_type": "stream",
|
467 |
+
"text": [
|
468 |
+
"X_train_pad dtype: int32, shape: (40000, 20)\n",
|
469 |
+
"X_test_pad dtype: int32, shape: (10000, 20)\n",
|
470 |
+
"y_train dtype: float32, shape: (40000,)\n",
|
471 |
+
"y_test dtype: float32, shape: (10000,)\n"
|
472 |
+
]
|
473 |
+
}
|
474 |
+
],
|
475 |
+
"source": [
|
476 |
+
"import numpy as np\n",
|
477 |
+
"\n",
|
478 |
+
"X_train_pad = np.array(X_train_pad, dtype=np.int32)\n",
|
479 |
+
"X_test_pad = np.array(X_test_pad, dtype=np.int32)\n",
|
480 |
+
"y_train = np.array(y_train, dtype=np.float32)\n",
|
481 |
+
"y_test = np.array(y_test, dtype=np.float32)\n",
|
482 |
+
"\n",
|
483 |
+
"print(f\"X_train_pad dtype: {X_train_pad.dtype}, shape: {X_train_pad.shape}\")\n",
|
484 |
+
"print(f\"X_test_pad dtype: {X_test_pad.dtype}, shape: {X_test_pad.shape}\")\n",
|
485 |
+
"print(f\"y_train dtype: {y_train.dtype}, shape: {y_train.shape}\")\n",
|
486 |
+
"print(f\"y_test dtype: {y_test.dtype}, shape: {y_test.shape}\")\n"
|
487 |
+
]
|
488 |
+
},
|
489 |
+
{
|
490 |
+
"cell_type": "code",
|
491 |
+
"execution_count": 39,
|
492 |
+
"metadata": {},
|
493 |
+
"outputs": [
|
494 |
+
{
|
495 |
+
"name": "stderr",
|
496 |
+
"output_type": "stream",
|
497 |
+
"text": [
|
498 |
+
"c:\\Users\\gouth\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\keras\\src\\layers\\core\\embedding.py:90: UserWarning: Argument `input_length` is deprecated. Just remove it.\n",
|
499 |
+
" warnings.warn(\n"
|
500 |
+
]
|
501 |
+
}
|
502 |
+
],
|
503 |
+
"source": [
|
504 |
+
"model = Sequential([\n",
|
505 |
+
" Embedding(input_dim=max_words, output_dim=128, input_length= max_len), \n",
|
506 |
+
" LSTM(128, return_sequences=True),\n",
|
507 |
+
" Dropout(0.3),\n",
|
508 |
+
" LSTM(64),\n",
|
509 |
+
" Dropout(0.3),\n",
|
510 |
+
" Dense(64, activation='relu'),\n",
|
511 |
+
" Dense(1, activation='sigmoid')])\n",
|
512 |
+
"\n",
|
513 |
+
"model.compile(loss= 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])"
|
514 |
+
]
|
515 |
+
},
|
516 |
+
{
|
517 |
+
"cell_type": "code",
|
518 |
+
"execution_count": 40,
|
519 |
+
"metadata": {},
|
520 |
+
"outputs": [
|
521 |
+
{
|
522 |
+
"name": "stdout",
|
523 |
+
"output_type": "stream",
|
524 |
+
"text": [
|
525 |
+
"Epoch 1/5\n",
|
526 |
+
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 65ms/step - accuracy: 0.6499 - loss: 0.6137\n",
|
527 |
+
"Epoch 1: val_accuracy improved from -inf to 0.72350, saving model to imdb-001.keras\n",
|
528 |
+
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m50s\u001b[0m 71ms/step - accuracy: 0.6500 - loss: 0.6136 - val_accuracy: 0.7235 - val_loss: 0.5364\n",
|
529 |
+
"Epoch 2/5\n",
|
530 |
+
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 67ms/step - accuracy: 0.7795 - loss: 0.4639\n",
|
531 |
+
"Epoch 2: val_accuracy improved from 0.72350 to 0.72690, saving model to imdb-002.keras\n",
|
532 |
+
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m45s\u001b[0m 71ms/step - accuracy: 0.7795 - loss: 0.4639 - val_accuracy: 0.7269 - val_loss: 0.5347\n",
|
533 |
+
"Epoch 3/5\n",
|
534 |
+
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 68ms/step - accuracy: 0.8218 - loss: 0.3898\n",
|
535 |
+
"Epoch 3: val_accuracy did not improve from 0.72690\n",
|
536 |
+
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m45s\u001b[0m 72ms/step - accuracy: 0.8218 - loss: 0.3898 - val_accuracy: 0.7197 - val_loss: 0.5766\n",
|
537 |
+
"Epoch 4/5\n",
|
538 |
+
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 69ms/step - accuracy: 0.8574 - loss: 0.3142\n",
|
539 |
+
"Epoch 4: val_accuracy did not improve from 0.72690\n",
|
540 |
+
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m46s\u001b[0m 73ms/step - accuracy: 0.8573 - loss: 0.3143 - val_accuracy: 0.7087 - val_loss: 0.6811\n",
|
541 |
+
"Epoch 5/5\n",
|
542 |
+
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 71ms/step - accuracy: 0.8940 - loss: 0.2464\n",
|
543 |
+
"Epoch 5: val_accuracy did not improve from 0.72690\n",
|
544 |
+
"\u001b[1m625/625\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m83s\u001b[0m 75ms/step - accuracy: 0.8940 - loss: 0.2464 - val_accuracy: 0.7032 - val_loss: 0.7815\n"
|
545 |
+
]
|
546 |
+
},
|
547 |
+
{
|
548 |
+
"data": {
|
549 |
+
"text/plain": [
|
550 |
+
"<keras.src.callbacks.history.History at 0x21dac69a560>"
|
551 |
+
]
|
552 |
+
},
|
553 |
+
"execution_count": 40,
|
554 |
+
"metadata": {},
|
555 |
+
"output_type": "execute_result"
|
556 |
+
}
|
557 |
+
],
|
558 |
+
"source": [
|
559 |
+
"from tensorflow.keras.callbacks import ModelCheckpoint\n",
|
560 |
+
"\n",
|
561 |
+
"checkpoint = ModelCheckpoint(\"imdb-{epoch:03d}.keras\", monitor = 'val_accuracy', save_best_only = True, mode = 'max', verbose = 1)\n",
|
562 |
+
"model.fit(x_train_pad,y_train, epochs=5, batch_size=64,validation_data=(x_test_pad,y_test), callbacks=[checkpoint])"
|
563 |
+
]
|
564 |
+
},
|
565 |
+
{
|
566 |
+
"cell_type": "code",
|
567 |
+
"execution_count": 41,
|
568 |
+
"metadata": {},
|
569 |
+
"outputs": [
|
570 |
+
{
|
571 |
+
"name": "stdout",
|
572 |
+
"output_type": "stream",
|
573 |
+
"text": [
|
574 |
+
"\u001b[1m313/313\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m4s\u001b[0m 14ms/step - accuracy: 0.7220 - loss: 0.5362\n",
|
575 |
+
"Accuracy: 0.7269\n"
|
576 |
+
]
|
577 |
+
}
|
578 |
+
],
|
579 |
+
"source": [
|
580 |
+
"model.load_weights(\"imdb-002.keras\")\n",
|
581 |
+
"\n",
|
582 |
+
"loss, accuracy = model.evaluate(X_test_pad, y_test)\n",
|
583 |
+
"print(f'Accuracy: {accuracy:.4f}')"
|
584 |
+
]
|
585 |
+
},
|
586 |
+
{
|
587 |
+
"cell_type": "code",
|
588 |
+
"execution_count": 46,
|
589 |
+
"metadata": {},
|
590 |
+
"outputs": [
|
591 |
+
{
|
592 |
+
"name": "stdout",
|
593 |
+
"output_type": "stream",
|
594 |
+
"text": [
|
595 |
+
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 99ms/step\n",
|
596 |
+
"Predicted Sentiment: Positive (Confidence: 0.9227)\n",
|
597 |
+
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 86ms/step\n",
|
598 |
+
"Predicted Sentiment: Positive (Confidence: 0.9227)\n",
|
599 |
+
"\u001b[1m1/1\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 86ms/step\n",
|
600 |
+
"Predicted Sentiment: Negative (Confidence: 0.0638)\n"
|
601 |
+
]
|
602 |
+
}
|
603 |
+
],
|
604 |
+
"source": [
|
605 |
+
"def predict_sentiment(review):\n",
|
606 |
+
" processed_review = preprocess_text(review)\n",
|
607 |
+
" sequence = token.texts_to_sequences([processed_review])\n",
|
608 |
+
" padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')\n",
|
609 |
+
" prediction = model.predict(padded_sequence)[0][0]\n",
|
610 |
+
" sentiment = 'Positive' if prediction > 0.5 else 'Negative'\n",
|
611 |
+
" return sentiment, prediction\n",
|
612 |
+
"\n",
|
613 |
+
"while True:\n",
|
614 |
+
" user_input = input(\"Enter a review (or type 'exit' to quit): \")\n",
|
615 |
+
" if user_input.lower() == 'exit':\n",
|
616 |
+
" break\n",
|
617 |
+
" sentiment, confidence = predict_sentiment(user_input)\n",
|
618 |
+
" print(f\"Predicted Sentiment: {sentiment} (Confidence: {confidence:.4f})\")"
|
619 |
+
]
|
620 |
+
},
|
621 |
+
{
|
622 |
+
"cell_type": "code",
|
623 |
+
"execution_count": 47,
|
624 |
+
"metadata": {},
|
625 |
+
"outputs": [],
|
626 |
+
"source": [
|
627 |
+
"import json\n",
|
628 |
+
"\n",
|
629 |
+
"tokenizer_json = token.to_json()\n",
|
630 |
+
"with open(\"imdb.json\", \"w\") as file:\n",
|
631 |
+
" file.write(tokenizer_json)\n",
|
632 |
+
"\n",
|
633 |
+
"model.save(\"imdb-002.keras\")\n"
|
634 |
+
]
|
635 |
+
}
|
636 |
+
],
|
637 |
+
"metadata": {
|
638 |
+
"kernelspec": {
|
639 |
+
"display_name": "Python 3",
|
640 |
+
"language": "python",
|
641 |
+
"name": "python3"
|
642 |
+
},
|
643 |
+
"language_info": {
|
644 |
+
"codemirror_mode": {
|
645 |
+
"name": "ipython",
|
646 |
+
"version": 3
|
647 |
+
},
|
648 |
+
"file_extension": ".py",
|
649 |
+
"mimetype": "text/x-python",
|
650 |
+
"name": "python",
|
651 |
+
"nbconvert_exporter": "python",
|
652 |
+
"pygments_lexer": "ipython3",
|
653 |
+
"version": "3.10.0"
|
654 |
+
}
|
655 |
+
},
|
656 |
+
"nbformat": 4,
|
657 |
+
"nbformat_minor": 2
|
658 |
+
}
|
imdb-002.keras
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f34bc06baf4f17c57597d9acde8ad223d2922e783cd170a3b1c1176a7bee4557
|
3 |
+
size 17628414
|