1+ {
2+ "nbformat" : 4 ,
3+ "nbformat_minor" : 0 ,
4+ "metadata" : {
5+ "colab" : {
6+ "private_outputs" : true ,
7+ "provenance" : [],
8+ "collapsed_sections" : []
9+ },
10+ "kernelspec" : {
11+ "name" : " python3" ,
12+ "display_name" : " Python 3"
13+ },
14+ "language_info" : {
15+ "name" : " python"
16+ }
17+ },
18+ "cells" : [
19+ {
20+ "cell_type" : " markdown" ,
21+ "source" : [
22+ " install selenium\n " ,
23+ " and\n " ,
24+ " chrome driver (also define path)"
25+ ],
26+ "metadata" : {
27+ "id" : " -lYAwjJwe4Oa"
28+ }
29+ },
30+ {
31+ "cell_type" : " code" ,
32+ "source" : [
"# Install Selenium with %pip so it lands in the environment of the running kernel.\n",
"%pip install -q selenium\n",
"# Refresh the package index first so the chromedriver install resolves correctly.\n",
"!apt-get update\n",
"!apt-get install -y chromium-chromedriver\n",
"# Copy the driver binary into /usr/bin so that `chromedriver` is found on the executable PATH.\n",
"# (sys.path is deliberately not modified: it only controls Python module lookup, not\n",
"# executable lookup, and any change to it would not survive a runtime restart.)\n",
"!cp /usr/lib/chromium-browser/chromedriver /usr/bin"
39+ ],
40+ "metadata" : {
41+ "id" : " rjyqtRrYewxu"
42+ },
43+ "execution_count" : null ,
44+ "outputs" : []
45+ },
46+ {
47+ "cell_type" : " markdown" ,
48+ "source" : [
49+ " install tesseract"
50+ ],
51+ "metadata" : {
52+ "id" : " FFYAh381exh4"
53+ }
54+ },
55+ {
56+ "cell_type" : " code" ,
57+ "source" : [
"# Install the Tesseract OCR engine (system package, non-interactive) ...\n",
"!sudo apt-get install -y tesseract-ocr\n",
"# ... and its Python wrapper, using %pip so it targets the running kernel's environment.\n",
"%pip install -q pytesseract"
60+ ],
61+ "metadata" : {
62+ "id" : " 2_AwFU5SQ5lT"
63+ },
64+ "execution_count" : null ,
65+ "outputs" : []
66+ },
67+ {
68+ "cell_type" : " markdown" ,
69+ "source" : [
70+ " restart runtime"
71+ ],
72+ "metadata" : {
73+ "id" : " MBT77b1SBIWq"
74+ }
75+ },
76+ {
77+ "cell_type" : " code" ,
78+ "source" : [
"import os\n",
"import signal\n",
"\n",
"# Kill the kernel process so the runtime restarts and picks up the packages installed above.\n",
"# (Calling quit() or exit() is an alternative way to end the session.)\n",
"os.kill(os.getpid(), signal.SIGKILL)"
85+ ],
86+ "metadata" : {
87+ "id" : " LQu1QserAnKc"
88+ },
89+ "execution_count" : null ,
90+ "outputs" : []
91+ },
92+ {
93+ "cell_type" : " markdown" ,
94+ "source" : [
"import dependencies"
96+ ],
97+ "metadata" : {
98+ "id" : " QQovnVXAfNbx"
99+ }
100+ },
101+ {
102+ "cell_type" : " code" ,
103+ "source" : [
104+ " from selenium import webdriver\n " ,
105+ " from selenium.webdriver.common.by import By\n " ,
106+ " from selenium.webdriver.common.keys import Keys\n " ,
107+ " from selenium.webdriver.support.select import Select\n " ,
108+ " from selenium.webdriver.chrome.service import Service\n " ,
109+ " \n " ,
110+ " import cv2\n " ,
111+ " from PIL import Image, ImageCms, ImageFilter\n " ,
112+ " import pytesseract\n " ,
113+ " \n " ,
114+ " import pandas as pd\n " ,
115+ " import warnings\n " ,
116+ " warnings.filterwarnings('ignore')"
117+ ],
118+ "metadata" : {
119+ "id" : " RhvflwoJfMZZ"
120+ },
121+ "execution_count" : null ,
122+ "outputs" : []
123+ },
124+ {
125+ "cell_type" : " markdown" ,
126+ "source" : [
127+ " helper functions"
128+ ],
129+ "metadata" : {
130+ "id" : " T2jcfxOqfdvk"
131+ }
132+ },
133+ {
134+ "cell_type" : " code" ,
135+ "source" : [
"def step1():\n",
"    \"\"\"Open the results page and save a screenshot of its captcha image.\n",
"\n",
"    Uses the module-level `driver`, `URL` and `path`: the page at `URL` is\n",
"    loaded, the element with id \"imgCaptcha\" is located, and its PNG\n",
"    screenshot bytes are written to the file named by `path`.\n",
"    \"\"\"\n",
"    # load the page; each load is what produces a new captcha to work on\n",
"    driver.get(URL)\n",
"\n",
"    # dump the captcha element's screenshot to disk for the image-processing steps\n",
"    captcha_element = driver.find_element(By.ID, \"imgCaptcha\")\n",
"    with open(path, \"wb\") as captcha_file:\n",
"        captcha_file.write(captcha_element.screenshot_as_png)\n",
144+ " \n " ,
"def step2():\n",
"    \"\"\"Build a mask from the saved captcha and paste it back over the image.\n",
"\n",
"    Reads the captcha stored at the module-level `path`, writes the\n",
"    intermediate mask to \"old.png\" and the final composited image to \"new.png\".\n",
"    \"\"\"\n",
"    # inverted Otsu threshold followed by a morphological opening with a 25x1 kernel\n",
"    captcha_bgr = cv2.imread(path)\n",
"    captcha_gray = cv2.cvtColor(captcha_bgr, cv2.COLOR_BGR2GRAY)\n",
"    _, binary_inv = cv2.threshold(captcha_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)\n",
"    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))\n",
"    line_mask = cv2.morphologyEx(binary_inv, cv2.MORPH_OPEN, line_kernel, iterations=2)\n",
"    cv2.imwrite(\"old.png\", line_mask)\n",
"\n",
"    # reload the mask as RGBA; pure-black pixels become fully transparent white,\n",
"    # every other pixel is kept exactly as it is\n",
"    overlay = Image.open(\"old.png\").convert(\"RGBA\")\n",
"    transparent_white = (255, 255, 255, 0)\n",
"    overlay.putdata([\n",
"        transparent_white if pixel[:3] == (0, 0, 0) else pixel\n",
"        for pixel in overlay.getdata()\n",
"    ])\n",
"\n",
"    # paste the overlay onto the original captcha, using the overlay itself as the paste mask\n",
"    cleaned = Image.open(path).convert(\"RGBA\")\n",
"    cleaned.paste(overlay, mask=overlay)\n",
"    cleaned.save(\"new.png\", \"PNG\")\n",
172+ " \n " ,
"def _ocr_captcha(image_file):\n",
"    \"\"\"Crop one captcha image, take the first channel of its LAB conversion and OCR it.\n",
"\n",
"    Returns the recognised text with surrounding whitespace stripped (may be \"\").\n",
"    \"\"\"\n",
"    captcha = Image.open(image_file).crop((5, 5, 115, 35))\n",
"    # convert sRGB -> LAB so the captcha can be read from a single channel\n",
"    srgb_profile = ImageCms.createProfile(colorSpace='sRGB')\n",
"    lab_profile = ImageCms.createProfile(colorSpace='LAB')\n",
"    to_lab = ImageCms.buildTransform(inputProfile=srgb_profile, outputProfile=lab_profile, inMode='RGB', outMode='LAB')\n",
"    first_channel, _a_channel, _b_channel = ImageCms.applyTransform(im=captcha, transform=to_lab).split()\n",
"    # 3x3 minimum filter before handing the channel to Tesseract\n",
"    first_channel = first_channel.filter(ImageFilter.MinFilter(3))\n",
"    return pytesseract.image_to_string(first_channel).strip()\n",
"\n",
"def step3(im, max_attempts=25): # solve captcha\n",
"    \"\"\"Return the OCR text of the captcha image `im`, fetching new captchas until one is readable.\n",
"\n",
"    Args:\n",
"        im: path of the prepared captcha image to read first.\n",
"        max_attempts: upper bound on OCR attempts (first image included).\n",
"\n",
"    Returns:\n",
"        The non-empty recognised text. It may still be a wrong reading.\n",
"\n",
"    Raises:\n",
"        RuntimeError: if every attempt produced empty text.\n",
"\n",
"    Retries run in a bounded loop rather than by recursion, so a run of\n",
"    unreadable captchas cannot end in a RecursionError or reload the page\n",
"    without limit.\n",
"    \"\"\"\n",
"    image_file = im\n",
"    for attempt in range(max_attempts):\n",
"        if attempt:\n",
"            # previous reading was empty: load a new captcha and clean it again\n",
"            step1()\n",
"            step2()\n",
"            image_file = \"new.png\"\n",
"        text = _ocr_captcha(image_file)\n",
"        if text:\n",
"            return text\n",
"    raise RuntimeError(f\"captcha OCR returned no text after {max_attempts} attempts\")\n",
192+ " \n " ,
"def step4(enroll, ans): # return data\n",
"    \"\"\"Submit one enrollment number with the captcha answer and read back the result.\n",
"\n",
"    Uses the module-level `driver` and `exam`.\n",
"\n",
"    Returns:\n",
"        \"err\", \"reqover\" or \"nodata\" when the page message matches the\n",
"        corresponding known failure text; otherwise a 14-item list\n",
"        [enroll, name, session, exam, declared on, branch, current backlog,\n",
"        total backlog, SPI, CPI, CGPA, current points, cumulative points, message].\n",
"    \"\"\"\n",
"    # fill in the form: exam selection, enrollment number, captcha text, then ENTER\n",
"    Select(driver.find_element(By.ID, \"ddlbatch\")).select_by_value(exam)\n",
"    enroll_box = driver.find_element(By.ID, \"txtenroll\")\n",
"    captcha_box = driver.find_element(By.ID, \"CodeNumberTextBox\")\n",
"    enroll_box.send_keys(enroll)\n",
"    captcha_box.send_keys(ans)\n",
"    captcha_box.send_keys(Keys.RETURN)\n",
"\n",
"    # map the known failure messages to the short codes the caller checks for\n",
"    message = driver.find_element(By.ID, \"lblmsg\").text\n",
"    failure_codes = {\n",
"        \"ERROR: Incorrect captcha code, try again.\": \"err\",\n",
"        \"Your request count is reached to maximum limit, Please try again later.\": \"reqover\",\n",
"        \"Oppssss! Data not available.\": \"nodata\",\n",
"    }\n",
"    if message in failure_codes:\n",
"        return failure_codes[message]\n",
"\n",
"    # read every result label by its element id\n",
"    label_ids = (\n",
"        \"lblName\", \"lblSession\", \"lblDeclaredOn\", \"lblBranchName\", \"lblExamName\",\n",
"        \"lblCUPBack\", \"lblTotalBack\", \"lblSPI\", \"lblCPI\", \"lblCGPA\",\n",
"        \"pt100Curr\", \"pt100Cuml\",\n",
"    )\n",
"    text = {label_id: driver.find_element(By.ID, label_id).text for label_id in label_ids}\n",
"    return [\n",
"        enroll,\n",
"        text[\"lblName\"],\n",
"        text[\"lblSession\"],\n",
"        text[\"lblExamName\"],\n",
"        text[\"lblDeclaredOn\"],\n",
"        text[\"lblBranchName\"],\n",
"        int(text[\"lblCUPBack\"]),\n",
"        int(text[\"lblTotalBack\"]),\n",
"        float(text[\"lblSPI\"]),\n",
"        float(text[\"lblCPI\"]),\n",
"        float(text[\"lblCGPA\"]),\n",
"        int(text[\"pt100Curr\"]),\n",
"        int(text[\"pt100Cuml\"]),\n",
"        message,\n",
"    ]\n",
224+ " \n " ,
"def loop():\n",
"    \"\"\"Scrape every enrollment number in the module-level `mylist` once.\n",
"\n",
"    Rows are appended to the module-level `df`, and the module-level `counter`\n",
"    is advanced (progress is printed against `tc`) for each stored row.\n",
"\n",
"    Returns:\n",
"        The enrollment numbers whose captcha answer was rejected, so the\n",
"        caller can run them again. Scraping stops early when the site\n",
"        reports that the request limit was reached.\n",
"    \"\"\"\n",
"    global counter\n",
"    retry_queue = []\n",
"    for number in mylist:\n",
"        enroll = f\"{number}\"\n",
"        step1()\n",
"        step2()\n",
"        outcome = step4(enroll, step3(\"new.png\"))\n",
"\n",
"        if outcome == \"err\":\n",
"            # wrong captcha answer: keep the number for another pass\n",
"            retry_queue.append(enroll)\n",
"            continue\n",
"        if outcome == \"reqover\":\n",
"            print(\"Change the SERVER!\")\n",
"            break\n",
"\n",
"        # \"nodata\" is stored as a placeholder row, anything else is a full result row\n",
"        row = [enroll, \"nodata\"] + [\"-\"] * 12 if outcome == \"nodata\" else outcome\n",
"        df.loc[len(df)] = row\n",
"        counter += 1\n",
"        print(f\"{counter}/{tc} {int(counter*100/tc)}%\")\n",
"    return retry_queue"
249+ ],
250+ "metadata" : {
251+ "id" : " t9H8OdFrRLBY"
252+ },
253+ "execution_count" : null ,
254+ "outputs" : []
255+ },
256+ {
257+ "cell_type" : " markdown" ,
258+ "source" : [
259+ " main function"
260+ ],
261+ "metadata" : {
262+ "id" : " 5QIu2udZgKKq"
263+ }
264+ },
265+ {
266+ "cell_type" : " code" ,
267+ "execution_count" : null ,
268+ "metadata" : {
269+ "id" : " E_BfboOxLMJd"
270+ },
271+ "outputs" : [],
272+ "source" : [
"# `driver` is bound before the try block so the cleanup below can tell whether\n",
"# Chrome ever started, instead of failing with a NameError that hides the real error.\n",
"driver = None\n",
"try:\n",
"    # initiate webdriver and configure options\n",
"    chrome_options = webdriver.ChromeOptions()\n",
"    chrome_options.add_argument('--headless')\n",
"    chrome_options.add_argument('--no-sandbox')\n",
"    chrome_options.add_argument('--disable-dev-shm-usage')\n",
"    chrome_options.add_argument(\"--incognito\")\n",
"\n",
"    ser = Service(\"chromedriver\")\n",
"    driver = webdriver.Chrome(service=ser, options=chrome_options)\n",
"\n",
"    # define url, exam id and the file name used for the downloaded captcha\n",
"    URL = \"https://www.gturesults.in/\"\n",
"    exam = \"3361$S2022$2022-08-25$current$0\"\n",
"    path = \"cap.jpg\"\n",
"\n",
"    # empty dataframe whose columns match, in order, the rows produced by step4()/loop()\n",
"    # (built directly: passing a literal JSON string to pd.read_json is deprecated)\n",
"    df = pd.DataFrame(columns=[\n",
"        \"Enrollment No.\", \"Name\", \"Session\", \"Exam\", \"Declared On\", \"Branch\",\n",
"        \"Current Sem. Backlog\", \"Total Backlog\", \"SPI\", \"CPI\", \"CGPA\",\n",
"        \"Current Points\", \"Cumulative points\", \"Message\",\n",
"    ])\n",
"\n",
"    # range of enrollment numbers to scrape (end value is inclusive)\n",
"    mylist = range(190280111001, 190280111010 + 1)\n",
"\n",
"    counter = 0\n",
"    tc = len(mylist)\n",
"\n",
"    # main driver program:\n",
"    # repeat until no enrollment number is left waiting for a captcha retry\n",
"    while True:\n",
"        mynewlist = loop()\n",
"        if not mynewlist:\n",
"            break\n",
"        mylist = mynewlist\n",
"\n",
"    # save dataframe to excel file\n",
"    df.to_excel(\"out.xlsx\")\n",
"\n",
"finally:\n",
"    if driver is not None:\n",
"        driver.quit()  # closes every window and stops the driver process\n",
"    # remove temporary images; a file may be missing if the run failed early\n",
"    import os\n",
"    for leftover in (\"cap.jpg\", \"old.png\", \"new.png\"):\n",
"        try:\n",
"            os.remove(leftover)\n",
"        except FileNotFoundError:\n",
"            pass"
317+ ]
318+ },
319+ {
320+ "cell_type" : " markdown" ,
321+ "source" : [
322+ " 👈 download <font color='yellow'>out.xlsx</font> from left side bar by double clicking it"
323+ ],
324+ "metadata" : {
325+ "id" : " qgkULANw3ZQ2"
326+ }
327+ }
328+ ]
329+ }
0 commit comments