#!/usr/bin/env python # coding: utf-8 # # ---------------------------------------------- # # Warrants Scraper # # ---------------------------------------------- print("Opening warrant scraper program: Importing libraries needed.") # Import the crawlers #import sys #sys.path.insert(0, './source_code') print("regular imports") # regular imports import numpy as np import os import mysql.connector import time import datetime import pandas as pd import csv from tkinter import Tk, messagebox, Entry, Button, Label print("selenium libs") # libs #import numpy as np #------------------------- from selenium import webdriver from selenium.webdriver.chrome.options import Options options = Options() options.add_argument("start-maximized") driver = webdriver.Chrome(options=options) driver.get("https://www.google.com/") #-------------------------- from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium_stealth import stealth import json5 print("scrapy libs") #import os #import csv #import pandas as pd #from datetime import datetime import time from scrapy.selector import Selector from string import ascii_lowercase from string import ascii_uppercase from selenium.webdriver.support.select import Select from itertools import combinations_with_replacement from PIL import Image import scrapy from scrapy_selenium import SeleniumRequest from scrapy.http import request # special imports #import wyomingmn #import weldgov #import warrantSearch #import sedgwickcounty #import pennco #import ndcourts #import jocosheriff #import harriscountyso #import fdle_state #import dallascounty # imports for process spiders #from scrapy.utils.project import get_project_settings print("scrapy crawler process import.") from scrapy.crawler import CrawlerProcess import pytesseract from PIL import Image from 
from twocaptcha import TwoCaptcha

######## CHANGE HERE ##########
#pytesseract.pytesseract.tesseract_cmd = r'C:\Users\tichmangono\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
import os

# NOTE(review): a 2Captcha key is committed in source. The environment
# variable TWOCAPTCHA_API_KEY now takes precedence; the literal is kept only as
# a backward-compatible fallback and should be rotated/removed.
API_KEY = os.environ.get('TWOCAPTCHA_API_KEY', '326845c4b299db5dd6c3cf7a1c4bbda5')
######## CHANGE HERE ##########
import cv2
from time import sleep
import io

# Module-level GUI window (created at import time, as in the original script).
window = Tk()
window.geometry('400x600')
window.title("Warrants Scraping Progam")
window.configure(background='light gray')

#######################################################################################################
# SPIDER CRAWLERS
#######################################################################################################
print("scrapy spiders setup.")


class BrownsoSpider(scrapy.Spider):
    """Scrape the paginated warrant list at brownso.org into a CSV feed.

    Each ``<h3>`` inside a ``view`` div is treated as one warrant; its data
    lives in the text nodes of the first following sibling ``<div>``.
    """

    name = 'brownso'
    cntr = 1  # current page number; advanced once per parsed page
    DOWNLOAD_FOLDER = "./downloads/"
    OUTPUT_FILE = DOWNLOAD_FOLDER + '1_bronso.csv'
    custom_settings = {
        # 'FEED_EXPORT_FIELDS': ["Warrant_Last_Name", "Warrant_First_Name", 'Warrant_Number'],
        'FEED_FORMAT': 'csv',
        'FEED_URI': OUTPUT_FILE
    }

    def start_requests(self):
        """Request the first page of the warrant listing."""
        yield scrapy.Request(
            url="https://www.brownso.org/warrants",
            callback=self.parse
        )

    @staticmethod
    def _sibling_text(warrant, position):
        """Return the normalized ``position``-th text node of the detail div."""
        return warrant.xpath(
            f"normalize-space(.//following-sibling::div[1]/text()[{position}])").get()

    def parse(self, response):
        """Yield one item per warrant on the page, then request the next page.

        Pages are requested up to page 100 regardless of whether the current
        page contained warrants (unchanged from the original behaviour).
        """
        # (//div[contains(@id, 'view')]/h3//following-sibling::div[1])[1]/text()[2]
        for warrant in response.xpath("//div[contains(@id, 'view')]/h3"):
            name = warrant.xpath("normalize-space(.//text())").get()
            name_parts = name.split(",")

            dt = self._sibling_text(warrant, 8)
            dt = f"{dt[:4]}/{dt[4:6]}/{dt[6:8]}" if dt else None

            # The 9th text node looks like "<label>: age, <label>: sex, <label>: race".
            # Pad to three entries so a short/empty node no longer raises IndexError.
            details = self._sibling_text(warrant, 9)
            detail_values = [part.split(":")[-1].strip()
                             for part in details.split(",")]
            detail_values += [""] * (3 - len(detail_values))

            yield {
                'Warrant_Last_Name': name_parts[0].strip(),
                'Warrant_First_Name': name_parts[-1].strip(),
                'Warrant_Number': self._sibling_text(warrant, 2),
                'Charges': self._sibling_text(warrant, 4),
                'Bond': self._sibling_text(warrant, 6),
                'Date': dt,
                'Age': detail_values[0],
                'Sex': detail_values[1],
                'Race': detail_values[2],
                "source_file": "brownso"
            }

        self.cntr += 1
        if self.cntr <= 100:
            yield scrapy.Request(
                url=f"https://www.brownso.org/warrants/?wpv_view_count=3071&wpv_post_search=&wpv_paged={self.cntr}",
                callback=self.parse
            )
class ClackamasSpider(scrapy.Spider):
    """Scrape the Clackamas warrant roster (a JSON endpoint) into a CSV feed."""

    name = 'clackamas'
    DOWNLOAD_FOLDER = "./downloads/"
    OUTPUT_FILE = DOWNLOAD_FOLDER + '2_clackamas.csv'
    custom_settings = {'FEED_FORMAT': 'csv',
                       'FEED_URI': OUTPUT_FILE
                       }

    def start_requests(self):
        """Request the roster extract."""
        yield scrapy.Request(
            url="https://web3.clackamas.us/roster/extract/warrants",
            callback=self.parse
        )

    def parse(self, response):
        """Yield one item per (person, charge) pair found under ``results``."""
        # Decode via response.text so json5 always receives str, not bytes.
        payload = json5.loads(response.text)
        # A missing/null 'results' or 'charges' is treated as empty instead of
        # raising TypeError when iterated.
        for result in payload.get('results') or []:
            name_parts = (result.get('name') or "").split(",")
            last_name = name_parts[0].strip()
            first_name = name_parts[-1].strip()
            for charge in result.get('charges') or []:
                yield {
                    'Warrant_Last_Name': last_name,
                    'Warrant_First_Name': first_name,
                    'Warrant_Birthdate': result.get('dob'),
                    'Warrant_Gender': result.get('sex'),
                    'Warrant_Image': result.get('image'),
                    'Warrant_Crime': charge.get('charge'),
                    'Warrant_Bail_Amount': charge.get('bail'),
                    'Warrant_Charge_Type': charge.get('type'),
                    'Warrant_Date': charge.get('issued'),
                    "source_file": "clackamas"
                }


class FlatheadSpider(scrapy.Spider):
    """Scrape Flathead County warrants: letter index -> person page -> warrants."""

    name = 'flathead'
    DOWNLOAD_FOLDER = "./downloads/"
    OUTPUT_FILE = DOWNLOAD_FOLDER + "3_flathead.csv"
    custom_settings = {'FEED_FORMAT': 'csv',
                       'FEED_URI': OUTPUT_FILE
                       }
    upper_chars = [i for i in ascii_uppercase]
    # URLs already present in a local 'flathead.csv' (if any) are skipped.
    df = pd.read_csv('flathead.csv')['source_url'].tolist(
    ) if os.path.isfile('flathead.csv') else []
    _known_urls = frozenset(df)  # O(1) membership for the skip check

    def start_requests(self):
        """Request the listing page for each letter in ``upper_chars[20:]``."""
        # NOTE(review): the [20:] slice restricts the crawl to letters U-Z.
        # This looks like a leftover "resume" setting - confirm before widening.
        for upper_char in self.upper_chars[20:]:
            yield scrapy.Request(
                url=f"https://apps.flathead.mt.gov/warrants/warrants_list.php?letter={upper_char}",
                callback=self.get_warrant_listings
            )

    def get_warrant_listings(self, response):
        """Follow every person link on a letter page that was not seen before."""
        warrant_urls = response.xpath(
            '//div[@class="group"]/a[@class="a_details"]/@href').getall()
        for warrant_url in warrant_urls:
            url = f"https://apps.flathead.mt.gov/warrants/{warrant_url}"
            if url not in self._known_urls:
                yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one item per warrant listed on a person's detail page.

        The original computed an ``alaises`` value that was never emitted (and
        was reset to None whenever the first lookup succeeded); that dead code
        has been removed. Output columns are unchanged.
        """
        name_parts = response.xpath(
            "normalize-space((//h3)[1]/text())").get().split(",")
        # Page-level values are identical for every warrant; compute them once.
        age = response.xpath(
            "normalize-space(//td[contains(text(), 'Age')]/following-sibling::td/text())").get()
        address = response.xpath(
            "normalize-space(//td[contains(text(), 'Known Location')]/following-sibling::td/text())").get()
        img_src = response.xpath("//img[@style]/@src").get()
        # Avoid emitting a bogus ".../None" URL when the page has no image.
        image_url = f'''https://apps.flathead.mt.gov/warrants/{img_src}''' if img_src else None

        warrants = response.xpath(
            '//div[@id="warrant_list"]/div[@class="group detail"]')
        for warrant in warrants:
            yield {
                'Warrant_Last_Name': name_parts[0].strip(),
                'Warrant_First_Name': name_parts[-1].strip(),
                'Warrant_Age': age,
                'Warrant_Address': address,
                'Warrant_Number': warrant.xpath("normalize-space(.//span[contains(text(), 'Number')]/parent::div/text())").get(),
                'Warrant_Crime': warrant.xpath("normalize-space(.//span[contains(text(), 'Description')]/parent::div/text()[2])").get(),
                'Warrant_Date': warrant.xpath("normalize-space(.//span[contains(text(), 'Date')]/parent::div/text())").get(),
                'Warrant_Bond_Amount': warrant.xpath("normalize-space(.//span[contains(text(), 'Bond')]/parent::div/text())").get(),
                'Warrant_Image_Url': image_url,
                'source_url': response.url,
                "source_file": "flathead"
            }
class HallcountyneSpider(scrapy.Spider):
    """Scrape Hall County (NE) wanted tables into a CSV feed."""

    name = 'hallcountyne'
    DOWNLOAD_FOLDER = "./downloads/"
    OUTPUT_FILE = DOWNLOAD_FOLDER + '4_hallcountyne.csv'
    custom_settings = {'FEED_FORMAT': 'csv',
                       'FEED_URI': OUTPUT_FILE
                       }

    # (output column, table row, table cell) for every value read positionally
    # from a table[@id="wanted"]; order defines the output column order.
    _CELLS = (
        ('Warrant_Birthdate', 2, 2),
        ('Name_Number', 4, 1),
        ('Weight', 4, 2),
        ('Height', 4, 3),
        ('Court_Number', 6, 1),
        ('Hair', 6, 2),
        ('Eyes', 6, 3),
        ('Sex', 8, 1),
        ('Race', 8, 2),
        ('Process_Type', 8, 3),
        ('Expires', 10, 1),
        ('Crime_Class', 10, 2),
        ('Issues', 10, 3),
        ('Wanted_For', 12, 1),
    )

    def start_requests(self):
        """Request the index page that links to the individual listing pages."""
        yield scrapy.Request(
            url="https://www.hallcountyne.gov/content.lasso?page=7249",
            callback=self.get_urls
        )

    def get_urls(self, response):
        """Follow each link found in a ``size14bdblue`` span."""
        for link in response.xpath('//span[@class="size14bdblue"]/a'):
            href = link.xpath(".//@href").get()
            if not href:
                # Previously a missing href produced a request for ".../None".
                continue
            yield scrapy.Request(
                url=f'https://www.hallcountyne.gov/{href}',
                callback=self.parse
            )

    def parse(self, response):
        """Yield one item per ``table[@id="wanted"]`` on the page."""
        warrant_li = response.xpath('//center/table[@id="wanted"]')
        print(warrant_li)
        for warrant in warrant_li:
            name_parts = warrant.xpath(
                'normalize-space(.//tr[2]/td[1]/text())').get().split(",")
            item = {
                'Warrant_Last_Name': name_parts[0].strip(),
                # [1] raised IndexError for names without a comma; emit "" instead.
                'Warrant_First_Name': name_parts[1].strip() if len(name_parts) > 1 else "",
            }
            for column, row, cell in self._CELLS:
                item[column] = warrant.xpath(
                    f'normalize-space(.//tr[{row}]/td[{cell}]/text())').get()
            item["source_file"] = "hallcountyne"
            yield item
class LincolnSpider(scrapy.Spider):
    """Scrape the Lincoln (NE) police warrant tables into a CSV feed."""

    name = 'lincoln'
    DOWNLOAD_FOLDER = "./downloads/"
    OUTPUT_FILE = DOWNLOAD_FOLDER + "5_lincoln.csv"
    custom_settings = {'FEED_FORMAT': 'csv',
                       'FEED_URI': OUTPUT_FILE
                       }
    urls = [
        # 'https://app.lincoln.ne.gov/city/police/stats/warrant1.htm',
        # 'https://app.lincoln.ne.gov/city/police/stats/warrant2.htm#FLOC',
        # 'https://app.lincoln.ne.gov/city/police/stats/warrant3.htm#LLOC',
        'https://app.lincoln.ne.gov/city/police/stats/warrant4.htm#SLOC'
    ]

    def start_requests(self):
        """Request every URL listed in ``urls``."""
        for url in self.urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        """Dump the raw page to ``amazon.html`` and yield one item per named row."""
        # print(response.text)
        #-- Writing data to a text file --#
        # NOTE(review): this looks like a debugging dump (it is overwritten on
        # every response); kept for compatibility, but now closed reliably.
        with open("amazon.html", "w", encoding="utf-8") as dump:
            dump.write(response.text)

        warrant_li = response.xpath(
            "//th[text()='Name']/parent::tr/following-sibling::tr")
        print(warrant_li)
        for warrant in warrant_li:
            name = warrant.xpath("normalize-space(.//td[1]/text())").get()
            if not name:
                # Rows without a name cell are skipped.
                continue

            img = warrant.xpath(".//td[7]/img/@src").get()
            img = img.replace("\r", "").strip() if img else None

            # normalize-space already trims; only map an empty result to None.
            charge = warrant.xpath("normalize-space(.//td[6]/text())").get()
            charge = charge.strip() if charge else None

            name_parts = name.split(",")
            yield {
                'Warrant_Last_Name': name_parts[0].strip(),
                'Warrant_First_Name': name_parts[-1].strip(),
                'Warrant_Race': warrant.xpath("normalize-space(.//td[2]/text())").get(),
                'Sex': warrant.xpath("normalize-space(.//td[3]/text())").get(),
                'Warrant_Birthdate': warrant.xpath("normalize-space(.//td[4]/text())").get(),
                'Agency': warrant.xpath("normalize-space(.//td[5]/text())").get(),
                'Charge': charge,
                'Warrant_Image': img,
                "source_file": "lincoln"
            }
class TomgreencountysheriffSpider(scrapy.Spider):
    """Scrape the paginated Tom Green County warrant list into a CSV feed."""

    name = 'tomgreencountysheriff'
    DOWNLOAD_FOLDER = "./downloads/"
    OUTPUT_FILE = DOWNLOAD_FOLDER + '6_tomgreencountysheriff.csv'
    custom_settings = {'FEED_FORMAT': 'csv',
                       'FEED_URI': OUTPUT_FILE
                       }

    def start_requests(self):
        """Request the first listing page."""
        yield scrapy.Request(
            url="https://www.tomgreencountysheriff.org/warrants.php?grp=1",
            callback=self.parse
        )

    def parse(self, response):
        """Yield one item per warrant card, then follow the ">" pager link."""
        warrant_listings = response.xpath(
            '//div[@id="warrantsDiv"]/div//div[@class="charge_desc"]/parent::div')
        for warrant in warrant_listings:
            name_parts = warrant.xpath(
                "normalize-space(.//div[1]/span/text())").get().split(",")
            img_src = warrant.xpath(
                './/div[1]/img[contains(@id, "warrant-image")]/@src').get()
            yield {
                # Stripped for consistency with the other spiders in this file
                # (the first name previously kept its leading space).
                'Warrant_Last_Name': name_parts[0].strip(),
                'Warrant_First_Name': name_parts[-1].strip(),
                'Age': warrant.xpath('normalize-space(.//span[contains(text(), "Age")]/parent::div/text())').get(),
                'Warrant_Date': warrant.xpath('normalize-space(.//span[contains(text(), "Date")]/parent::div/text())').get(),
                'Bond_Amount': warrant.xpath('normalize-space(.//span[contains(text(), "Bond")]/parent::div/text())').get(),
                'Charge_Description': warrant.xpath('normalize-space(.//div[@class="charge_desc"]/text())').get(),
                # Avoid emitting a bogus ".../None" URL when no image is present.
                'Warrant_Image': f'''https://www.tomgreencountysheriff.org/{img_src}''' if img_src else None,
                "source_file": "tomgreencountysheriff"
            }

        next_page = response.xpath('//a[text()=">"]/@href').get()
        if next_page:
            yield scrapy.Request(
                url=f'''https://www.tomgreencountysheriff.org/{next_page}''',
                callback=self.parse
            )
#######################################################################################################
#---
#######################################################################################################
# NON-SPIDER CRAWLERS
#######################################################################################################
print("NON-scrapy spiders setup.")


def _ensure_parent_dir(path):
    """Create the directory that will contain ``path`` if it is missing."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def write_to_csv(arr, op_file_name):
    """Append ``arr`` as a single row to the CSV file ``op_file_name``.

    The file (and, new in this version, its parent directory) is created when
    absent, so a missing ``downloads/`` folder no longer aborts a crawl.
    """
    _ensure_parent_dir(op_file_name)
    with open(op_file_name, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(arr)


def writeCSV(data, fieldName, file_name):
    """Append the dict ``data`` to ``file_name`` using ``fieldName`` as columns.

    The header row is written when the file does not exist yet or exists but
    is empty (the latter case previously produced a header-less file).
    """
    _ensure_parent_dir(file_name)
    needs_header = (not os.path.isfile(file_name)
                    or os.path.getsize(file_name) == 0)
    # newline='' is what the csv module requires so embedded line breaks in
    # fields are not translated by the text layer.
    with open(file_name, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile, fieldnames=fieldName, lineterminator='\n')
        if needs_header:
            writer.writeheader()
        writer.writerow(data)


#-- Initializing the chrome driver --#
def initChromeDriver(CHROME_LOC):
    """Start a Chrome WebDriver with the script's options and stealth patches.

    ``CHROME_LOC`` is accepted for call compatibility but is not used: the
    driver binary is obtained through ``ChromeDriverManager().install()``.
    Returns the configured driver.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--log-level=3")
    # options.add_argument('--headless')
    options.add_argument('--start-maximized')
    #-- Uncomment the below two lines while running on MAC/LINUX --#
    # options.add_argument('--no-sandbox')
    # options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--incognito')
    options.add_argument('ignore-certificate-errors')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    # prefs = {
    #     "profile.managed_default_content_settings.images": 2,
    #     "download.default_directory" : f"{os.getcwd()}\\downloads\\"
    # }
    # options.add_experimental_option("prefs", prefs)
    # driver = webdriver.Chrome(os.environ.get('chromedriver'), options=options)
    #driver = webdriver.Chrome(
    #    CHROME_LOC+'chromedriver_win32/chromedriver', options=options)
    # NOTE(review): ChromeDriverManager is not imported in the part of the file
    # reviewed here - confirm the import exists, otherwise this raises NameError.
    driver = webdriver.Chrome(
        executable_path=ChromeDriverManager().install(), options=options)
    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )
    return driver
def capture_screenshot(driver, warrant_no, img_dir="./jocosheriff/img"):
    """Save a full-page screenshot named ``<warrant_no>.png`` into ``img_dir``.

    The page must contain ``//img[@id="Image1"]``; the lookup is kept as an
    existence check and raises (as before) when the image is absent.

    ``img_dir`` replaces the former absolute, machine-specific ``D:/...``
    path; the directory is created when missing. The default mirrors the
    relative path used in the original's disabled cropping code.
    """
    # Existence check only: the element's location/size were never used.
    driver.find_element(By.XPATH, '//img[@id="Image1"]')
    os.makedirs(img_dir, exist_ok=True)
    driver.save_screenshot(os.path.join(img_dir, f"{warrant_no}.png"))


def dallascounty_run(CHROME_LOC, DOWNLOAD_LOC, script, num):
    """Crawl the Dallas County wanted list and write one CSV row per case.

    The output file ``<DOWNLOAD_LOC>downloads/<num>_<script>.csv`` is always
    recreated from scratch. For every offender row the detail page is opened,
    each case row except the last is written together with the offender's
    columns, and the browser navigates back to the list.
    """
    OUTPUT_FILE = DOWNLOAD_LOC + "downloads/"+str(num)+"_" + script + ".csv"

    #-- Always start from a fresh file containing only the header --#
    if os.path.isfile(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)
    write_to_csv(['Warrant_Last_Name', 'Warrant_First_Name', 'Warrant_Address_1', 'Warrant_Birthdate', 'Race', 'Sex', 'Outstanding Balance', 'Warrant_Address_2', 'Case_Number',
                  'Citation_Number', 'Charge_Description', 'Offense_Date', 'Offense_Location', 'Balance', 'Recent_Payment', 'Warrant_Date', "source_file"], OUTPUT_FILE)

    #-- Initializing the chromedriver --#
    driver = initChromeDriver(CHROME_LOC)
    try:
        driver.set_page_load_timeout(100)
        driver.get("https://www.dallascounty.org/dcwantedsearch/offendersList.jsp")
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, "//tbody/tr")))

        listing = Selector(text=driver.page_source)
        for indx, warrant in enumerate(listing.xpath('//tbody/tr')):
            name_parts = warrant.xpath(
                "normalize-space(.//td[1]/a/text())").get().split(",")
            # Columns shared by every case of this offender.
            offender = [name_parts[0].strip(), name_parts[-1].strip()]
            offender.extend(
                warrant.xpath(f"normalize-space(.//td[{col}]/text())").get()
                for col in range(2, 7))

            warrant_btn = driver.find_element(
                By.XPATH, f"//tbody/tr[{indx+1}]//td[1]/a")
            driver.execute_script("arguments[0].click()", warrant_btn)
            time.sleep(2.5)

            detail = Selector(text=driver.page_source)
            balance = detail.xpath(
                "(//div[@class='row'])[4]/div/b/text()").get()
            # The last table row is skipped, as in the original.
            for case in detail.xpath('//table/tbody/tr')[:-1]:
                write_to_csv(offender + [
                    balance,
                    case.xpath("normalize-space(.//td[1]/a/text())").get(),
                    case.xpath("normalize-space(.//td[2]/text())").get(),
                    case.xpath("normalize-space(.//td[3]/text())").get(),
                    case.xpath("normalize-space(.//td[4]/text())").get(),
                    case.xpath("normalize-space(.//td[5]/a/text())").get(),
                    case.xpath("normalize-space(.//td[6]/text())").get(),
                    case.xpath("normalize-space(.//td[7]/text())").get(),
                    case.xpath("normalize-space(.//td[9]/text())").get(),
                    "dallascounty"
                ], OUTPUT_FILE)

            back_btn = driver.find_element(By.XPATH, "//input[@value='Back']")
            driver.execute_script("arguments[0].click()", back_btn)
            time.sleep(2)
    finally:
        #-- Shutting down the chromedriver, also when the crawl fails --#
        driver.quit()
    return
def flde_run(CHROME_LOC, DOWNLOAD_LOC, script, num):
    """Crawl the FDLE wanted-persons search, one last-name letter at a time.

    Rows are appended to ``<DOWNLOAD_LOC>downloads/<num>_<script>.csv``. If
    that file already exists, people already recorded in it (keyed by last
    name, first name and birthdate) are skipped, so an interrupted run can be
    resumed. Captchas are solved by screenshotting the page, cropping a fixed
    region and sending it to 2Captcha (up to five attempts per captcha).
    """
    OUTPUT_FILE = DOWNLOAD_LOC + "downloads/"+str(num)+"_" + script + ".csv"
    # search_query_li = ["".join(i) for i in list(combinations_with_replacement([i for i in ascii_lowercase],2))]
    search_query_li = [i for i in ascii_lowercase]

    #-- Load already-scraped people, or create the file with its header --#
    seen = set()
    if os.path.isfile(OUTPUT_FILE):
        for _, val in pd.read_csv(OUTPUT_FILE).iterrows():
            seen.add(
                f"{val['Warrant_Last_Name']}_{val['Warrant_First_Name']}_{val['Warrant_Birthdate']}")
    else:
        write_to_csv(['Warrant_Last_Name', 'Warrant_First_Name', 'Warrant_Birthdate', 'Warrant_Race', 'Warrant_Sex', 'Reporting_Agency', 'Warrant_Nicknames', 'Warrant_Aliases', 'Warrant_Offense', 'Agency_Case',
                      'Warrant_Date', 'Warrant_Number', 'Warrant_Height', 'Warrant_Weight', 'Warrant_Hair', 'Warrant_Eye', 'Warrant_Scras_Marks_Tattoos', 'Warrant_Occupation', 'Warrant_Address', 'Warrant_Image', "source_file"], OUTPUT_FILE)

    #### 2Captcha Approach ##########
    def crop_sceen_shot(img, box, fn):
        """Crop ``img`` to ``box``, save it as PNG under ``fn``, return the crop."""
        region = img.crop(box)
        region.save(fn, 'PNG', optimize=True, quality=95)
        return region

    def get_2captcha(API_KEY, filename):
        """Send the image file to 2Captcha and return the solved text."""
        solver = TwoCaptcha(API_KEY)
        result = solver.normal(filename)
        return result['code']

    fn_original = "screenshot_flde_run.JPG"
    fn_cropped = "screenshot_cropped.png"
    # Pixel regions of the captcha image in the screenshot.
    # NOTE(review): these depend on window size/layout - confirm on the target machine.
    search_box = (520, 400, 1000, 600)   # captcha on the search form
    popup_box = (620, 400, 1100, 600)    # captcha in the detail popup

    def solve_captcha(box, attempts=5):
        """Screenshot, crop and solve; retry on failure, re-raise on the last try.

        Replaces five nested copies of the same try/except block.
        """
        for attempt in range(1, attempts + 1):
            try:
                sleep(1)
                driver.save_screenshot(fn_original)
                img = Image.open(fn_original)
                crop_sceen_shot(img=img, box=box, fn=fn_cropped)
                captcha_txt = get_2captcha(API_KEY=API_KEY, filename=fn_cropped)
                print("Found captcha >>", captcha_txt)
                return captcha_txt
            except Exception:
                if attempt == attempts:
                    raise
    #### 2Captcha Approach ##########

    # Labels of the detail-page cells, in output column order.
    detail_labels = ['Nicknames:', 'Aliases:', 'Offense:', 'Agency Case',
                     'Date of Warrant', 'Warrant #', 'Height', 'Weight',
                     'Hair', 'Eye', 'Scars', 'Occupation', 'Last Known']

    #-- Initializing the chromedriver --#
    driver = initChromeDriver(CHROME_LOC)
    try:
        driver.set_page_load_timeout(100)
        driver.get(
            "http://pas.fdle.state.fl.us/pas/restricted/PAS/person/WantedPersons.jsf")

        for indx, query in enumerate(search_query_li):
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
                (By.XPATH, '//table[contains(@id, "searchForm")]')))

            # Only the first search is captcha-protected.
            if indx == 0:
                captcha_txt = solve_captcha(search_box)
                sleep(1)
                captcha_box = driver.find_element(
                    By.XPATH, "//input[contains(@id, 'captchaInput')]")
                captcha_box.send_keys(captcha_txt)

            lname_box = driver.find_element(
                By.XPATH, "//input[contains(@id, 'lastNameInput')]")
            lname_box.send_keys(query)
            search_btn = driver.find_element(
                By.XPATH, '//span[text()="Submit Search"]')
            driver.execute_script("arguments[0].click()", search_btn)
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
                (By.XPATH, "//tbody[contains(@id, 'searchWantedPersonsResultTable_data')]")))

            if indx == 0:
                # Pick the 6th entry of the last <select> on the page.
                # NOTE(review): presumably the rows-per-page selector - confirm.
                sel = Select(driver.find_element(
                    By.XPATH, "(//select[contains(@id, '')])[last()]"))
                sel.select_by_index(5)
                time.sleep(3)

            while True:
                print(f"\n\n {indx} => {query} \n\n")
                resp = Selector(text=driver.page_source)
                warrant_li = resp.xpath(
                    "//tbody[contains(@id, 'searchWantedPersonsResultTable_data')]/tr")
                for idx, warrant in enumerate(warrant_li):
                    name = warrant.xpath(
                        "normalize-space(.//span[contains(@id, 'nameTable')]/text())").get()
                    lname = name.split(",")[0].strip()
                    fname = name.split(",")[-1].strip()
                    dob = warrant.xpath(
                        "normalize-space(.//span[contains(@id, 'dobTable')]/text())").get()
                    person_key = f"{lname}_{fname}_{dob}"
                    if person_key in seen:
                        continue
                    seen.add(person_key)

                    temp = [
                        lname,
                        fname,
                        dob,
                        warrant.xpath(
                            "normalize-space(.//span[contains(@id, 'raceTable')]/text())").get(),
                        warrant.xpath(
                            "normalize-space(.//span[contains(@id, 'sexTable')]/text())").get(),
                        warrant.xpath(
                            "normalize-space(.//span[contains(@id, 'repAgencyTable')]/text())").get(),
                    ]
                    warrant_btn = driver.find_element(
                        By.XPATH,
                        f'''(//tbody[contains(@id, 'searchWantedPersonsResultTable_data')]/tr//span[contains(@id, 'nameTable')])[{idx+1}]''')
                    driver.execute_script("arguments[0].click()", warrant_btn)
                    try:
                        WebDriverWait(driver, 6).until(EC.visibility_of_element_located(
                            (By.XPATH, "//table[contains(@id, 'personDetailsGrid')]")))
                    except Exception:
                        # The detail grid did not appear: assume a captcha popup
                        # is blocking it and solve that one.
                        resp_temp = Selector(text=driver.page_source)
                        print(
                            f'''Captcha Img URK: http://pas.fdle.state.fl.us{resp_temp.xpath("//img[contains(@id, 'captchaPopup')]/@src").get()}''')
                        captcha_txt = solve_captcha(popup_box)
                        sleep(1)
                        captcha_box = driver.find_element(
                            By.XPATH, "//input[contains(@id, 'captchaInput')]")
                        captcha_box.send_keys(captcha_txt)
                        submit_btn = driver.find_element(
                            By.XPATH, '//span[text()="Submit"]')
                        driver.execute_script(
                            "arguments[0].click()", submit_btn)
                        time.sleep(2)

                    resp_inr = Selector(text=driver.page_source)
                    # Each cell reads "<label>: <value>"; keep the text after the last colon.
                    temp.extend(
                        resp_inr.xpath(
                            f"normalize-space(//td[contains(text(), '{label}')]/text())"
                        ).get().split(":")[-1].strip()
                        for label in detail_labels)
                    temp.extend([
                        resp_inr.xpath(
                            "//img[contains(@id, 'subjectDetails')]/@src").get(),
                        "fldestate"
                    ])
                    write_to_csv(temp, OUTPUT_FILE)

                    back_btn = driver.find_element(
                        By.XPATH, "//span[contains(text(), 'Back To')]")
                    driver.execute_script("arguments[0].click()", back_btn)
                    WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
                        (By.XPATH, "//tbody[contains(@id, 'searchWantedPersonsResultTable_data')]/tr//span[contains(@id, 'nameTable')]")))

                # Stop paging once the "N" (next) link is marked disabled.
                if resp.xpath(
                        "//span[text()='N']/parent::a[contains(@class, 'disabled')]"):
                    break
                time.sleep(1)
                next_btn = driver.find_element(By.XPATH, "//span[text()='N']")
                driver.execute_script("arguments[0].click()", next_btn)
                time.sleep(1)

            new_search_btn = driver.find_element(
                By.XPATH, "//span[text()='New Search']")
            driver.execute_script("arguments[0].click()", new_search_btn)
            time.sleep(2)
    finally:
        #-- Shutting down chromedriver, also when the crawl fails --#
        driver.quit()
    return
def harriscounty_run(CHROME_LOC, DOWNLOAD_LOC, script, num):
    """Crawl the Harris County warrant search results page by page.

    Rows are appended to ``<DOWNLOAD_LOC>downloads/<num>_<script>.csv``; SPNs
    already present in that file are skipped. For every new SPN the detail
    view is opened and one row per case is written. Paging stops at the first
    page link that cannot be found or clicked.
    """
    OUTPUT_FILE = DOWNLOAD_LOC + "downloads/"+str(num)+"_" + script + ".csv"

    #-- Create the file with its header when it does not exist yet --#
    if os.path.isfile(OUTPUT_FILE):
        print("Found existing OUTPUT FILE.")
    else:
        write_to_csv(['Warrant_Last_Name', 'Warrant_First_Name', 'SPN', 'Warrant_Race', 'Warrant_Sex',
                      'Warrant_Birthdate', 'Offense Description', 'Case Number', 'Bond', 'Warrant_Image', "source_file"], OUTPUT_FILE)
        print("Made new file OUTPUT FILE.")
    seen_spns = set(pd.read_csv(OUTPUT_FILE)['SPN'].tolist())

    person_xpath = ("normalize-space(//table[@id='ContentPlaceHolder1_dv_searchresults']"
                    "//td[text()='{label}']/following-sibling::td)")

    #-- Initializing the chromedriver --#
    driver = initChromeDriver(CHROME_LOC)
    try:
        driver.set_page_load_timeout(100)
        driver.get(
            "https://apps.harriscountyso.org/JailInfo/warrants_search_results.aspx")
        page_cntr = 1
        while True:
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
                (By.XPATH, "//table[contains(@id, 'warrantsearchresults')]")))
            listing = Selector(text=driver.page_source)
            warrant_li = listing.xpath(
                "//table[contains(@id, 'warrantsearchresults')]/tbody/tr")
            # First and last rows are skipped, as in the original.
            for indx, warrant in enumerate(warrant_li[1:-1]):
                spn = warrant.xpath("normalize-space(.//td[2]/text())").get()
                # SPNs read back from the CSV are compared as ints when numeric;
                # a blank/non-numeric cell no longer crashes int().
                spn_key = int(spn) if spn.isdigit() else spn
                if spn_key in seen_spns:
                    continue

                warrant_btn = driver.find_element(
                    By.XPATH,
                    f"(//table[contains(@id, 'warrantsearchresults')]/tbody/tr/td[1]/a)[{indx+1}]")
                driver.execute_script("arguments[0].click()", warrant_btn)
                WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
                    (By.XPATH, f"//table[@id='ContentPlaceHolder1_dv_searchresults']//td[text()='{spn}']")))
                time.sleep(1.5)

                detail = Selector(text=driver.page_source)
                # Person-level values are the same for every case row.
                img_src = detail.xpath(
                    "//img[contains(@id, 'Image1')]/@src").get()
                # A missing image previously raised AttributeError on None.
                image_url = (f'''https://apps.harriscountyso.org{img_src.replace("..","")}'''
                             if img_src else None)
                person = [
                    detail.xpath(person_xpath.format(label='Last Name')).get(),
                    detail.xpath(person_xpath.format(label='First Name')).get(),
                    spn,
                    detail.xpath(person_xpath.format(label='Race')).get(),
                    detail.xpath(person_xpath.format(label='Sex')).get(),
                    detail.xpath(
                        "normalize-space(//span[contains(@id, 'dob')]/text())").get(),
                ]
                cases = detail.xpath(
                    "//table[contains(@id, 'GridView1')]/tbody/tr")
                for case in cases[1:]:
                    write_to_csv(person + [
                        case.xpath("normalize-space(.//td[1]/text())").get(),
                        case.xpath("normalize-space(.//td[2]/text())").get(),
                        case.xpath("normalize-space(.//td[3]/text())").get(),
                        image_url,
                        "harriscounty"
                    ], OUTPUT_FILE)
                seen_spns.add(spn_key)

            try:
                page_cntr += 1
                if page_cntr % 10 == 1:
                    # Pages 11, 21, ... are reached through the pager link whose
                    # href contains "Page$<n>".
                    next_btn_10step = driver.find_element(
                        By.XPATH, f"//td/a[contains(@href, 'Page${page_cntr}')]")
                    next_btn_10step.click()
                    time.sleep(2.5)
                else:
                    next_btn_1step = driver.find_element(
                        By.XPATH, f"//td/a[text()={page_cntr}]")
                    next_btn_1step.click()
                    time.sleep(2.5)
                print(f"\n------------\nGOING ON TO NEXT PAGE => {page_cntr} \n------------\n")
            except Exception as e:
                print("Failed to go to the next page. Exit Loop >>", e)
                break
    finally:
        # Single, guaranteed shutdown. The original quit twice and its handler
        # printed an unbound name `e`.
        try:
            driver.quit()
        except Exception as quit_error:
            print("No driver to quit >>", quit_error)
    return
def jocosheriff_run(CHROME_LOC, DOWNLOAD_LOC, script, num):
    """Crawl the Johnson County sheriff warrant search, city by city.

    For city dropdown indexes 1..20 the search is submitted, every result row
    is opened, and the combined list/detail fields are appended to
    ``<DOWNLOAD_LOC>downloads/<num>_<script>.csv`` via ``writeCSV``. A
    screenshot of each detail page is stored through ``capture_screenshot``.
    """
    OUTPUT_FILE = DOWNLOAD_LOC + "downloads/"+str(num)+"_" + script + ".csv"
    FIELD_NAMES = [
        'Warrant_Last_Name',
        'Warrant_First_Name',
        'Warrant_Middle_Initial',
        'Warrant_Name_Suffix',
        'Race',
        'Sex',
        'Age',
        'Warrant_Type',
        'Warrant_Issue_Date',
        'Height',
        'Weight',
        'Hair',
        'Eye',
        'Last_Known_City_Residence',
        'Warrant_No',
        'Case_No',
        'Charge',
        'Bail_Amount',
        'Bail_Type',
        "source_file"
    ]
    # XPath of the first nine columns in a result row, in FIELD_NAMES order.
    list_cell_xpaths = ["normalize-space(.//td[1]/a/text())"] + [
        f"normalize-space(.//td[{col}]/text())" for col in range(2, 10)]
    # FIELD_NAMES index -> XPath on the detail page.
    detail_xpaths = {
        9: "normalize-space(//b[contains(text(), 'HEIGHT')]/font/text())",
        10: "normalize-space(//b[contains(text(), 'WEIGHT')]/font/text())",
        11: "normalize-space(//b[contains(text(), 'HAIR')]/font/text())",
        12: "normalize-space(//b[contains(text(), 'EYES')]/font/text())",
        13: "normalize-space(//b[contains(text(), 'RESIDENCE')]/font/text())",
        15: "normalize-space(//b[contains(text(), 'CASE NUM')]/font/text())",
        16: "normalize-space(//b[text()='TYPE: ']/font/text())",
        17: "normalize-space(//b[contains(text(), 'BAIL AMOUNT')]/font/text())",
        18: "normalize-space(//b[contains(text(), 'BAIL TYPE')]/font/text())",
    }

    # df = pd.read_excel('zip_code_li.xlsx', sheet_name='Sheet1')
    driver = initChromeDriver(CHROME_LOC)
    try:
        driver.maximize_window()
        driver.get(
            "https://jocosheriff.org/operations-bureau/warrant-unit/warrant-search")
        WebDriverWait(driver, 15).until(
            EC.visibility_of_element_located((By.XPATH, "//iframe")))

        # The search form lives inside the first iframe.
        driver.switch_to.frame(0)
        city_rad_btn = driver.find_element(
            By.XPATH, "//input[@id='rbSearchCity']")
        driver.execute_script("arguments[0].click()", city_rad_btn)
        time.sleep(3)
        driver.switch_to.default_content()
        driver.switch_to.frame(0)
        # WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "//select[@id='ddCity']")))

        # NOTE(review): the 1..20 range is hard-coded; select_by_index raises
        # if the dropdown has fewer options - confirm the expected city count.
        for i in range(1, 21):
            print("Processing >", i)
            # identify dropdown with Select class
            dropdown_cell = driver.find_element(
                By.XPATH, "//select[@id='ddCity']/parent::td")
            driver.execute_script("arguments[0].click()", dropdown_cell)
            sel = Select(driver.find_element(
                By.XPATH, "//select[@id='ddCity']"))
            sel.select_by_index(i)
            search_btn = driver.find_element(
                By.XPATH, '//input[@id="btnSearch"]')
            driver.execute_script("arguments[0].click()", search_btn)
            print("sleep - 6.")
            time.sleep(6)

            listing = Selector(text=driver.page_source)
            warrant_li = listing.xpath('(//tbody)[last()]/tr')
            print("Found warrant_li >", warrant_li)
            for indx, warrant in enumerate(warrant_li):
                dataDict = {
                    FIELD_NAMES[pos]: warrant.xpath(cell_xpath).get()
                    for pos, cell_xpath in enumerate(list_cell_xpaths)
                }

                driver.switch_to.default_content()
                driver.switch_to.frame(0)
                print("Looking for button")
                warrnt_dtl_btn = driver.find_element(
                    By.XPATH, f'(//tbody)[last()]/tr[{indx+1}]//td/a')
                driver.execute_script(
                    "arguments[0].click()", warrnt_dtl_btn)
                WebDriverWait(driver, 10).until(EC.visibility_of_element_located(
                    (By.XPATH, "//b[contains(text(), 'WARRANT NUMBER')]")))
                print("Now getting more fields")

                detail = Selector(text=driver.page_source)
                warrant_no = detail.xpath(
                    "normalize-space(//b[contains(text(), 'WARRANT NUM')]/font/text())").get()
                for pos, field_xpath in detail_xpaths.items():
                    dataDict[FIELD_NAMES[pos]] = detail.xpath(field_xpath).get()
                dataDict[FIELD_NAMES[14]] = warrant_no
                dataDict[FIELD_NAMES[19]] = "jocosheriff"

                writeCSV(dataDict, FIELD_NAMES, OUTPUT_FILE)
                print("Wrote to CSV")
                capture_screenshot(driver, warrant_no)

                back_btn = driver.find_element(
                    By.XPATH, '//input[@id="Button1"]')
                driver.execute_script("arguments[0].click()", back_btn)
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located((By.XPATH, "//select[@id='ddCity']")))
                time.sleep(2)
    finally:
        #-- Shutting down the chrome driver, also when the crawl fails --#
        driver.quit()
    return
"//b[contains(text(), 'WARRANT NUMBER')]"))) print("Now getting more fields") html = driver.page_source respObj = Selector(text=html) warrant_no = respObj.xpath( "normalize-space(//b[contains(text(), 'WARRANT NUM')]/font/text())").get() dataDict[FIELD_NAMES[9]] = respObj.xpath( "normalize-space(//b[contains(text(), 'HEIGHT')]/font/text())").get() dataDict[FIELD_NAMES[10]] = respObj.xpath( "normalize-space(//b[contains(text(), 'WEIGHT')]/font/text())").get() dataDict[FIELD_NAMES[11]] = respObj.xpath( "normalize-space(//b[contains(text(), 'HAIR')]/font/text())").get() dataDict[FIELD_NAMES[12]] = respObj.xpath( "normalize-space(//b[contains(text(), 'EYES')]/font/text())").get() dataDict[FIELD_NAMES[13]] = respObj.xpath( "normalize-space(//b[contains(text(), 'RESIDENCE')]/font/text())").get() dataDict[FIELD_NAMES[14]] = warrant_no dataDict[FIELD_NAMES[15]] = respObj.xpath( "normalize-space(//b[contains(text(), 'CASE NUM')]/font/text())").get() dataDict[FIELD_NAMES[16]] = respObj.xpath( "normalize-space(//b[text()='TYPE: ']/font/text())").get() dataDict[FIELD_NAMES[17]] = respObj.xpath( "normalize-space(//b[contains(text(), 'BAIL AMOUNT')]/font/text())").get() dataDict[FIELD_NAMES[18]] = respObj.xpath( "normalize-space(//b[contains(text(), 'BAIL TYPE')]/font/text())").get() dataDict[FIELD_NAMES[19]] = "jocosheriff" #print(dataDict, end="\n\n") # writeCSV(dataDict, FIELD_NAMES, '/content/drive/MyDrive/fiverrWorkspace/gMapsData.csv') writeCSV(dataDict, FIELD_NAMES, OUTPUT_FILE) print("Wrote to CSV") capture_screenshot(driver, warrant_no) back_btn = driver.find_element_by_xpath( '//input[@id="Button1"]') driver.execute_script("arguments[0].click()", back_btn) WebDriverWait(driver, 10).until( EC.visibility_of_element_located((By.XPATH, "//select[@id='ddCity']"))) time.sleep(2) #-- Shutting down the chrome driver --# driver.quit() return def ndcourts_run(CHROME_LOC, DOWNLOAD_LOC, script, num): # OUTPUT_FILE = f'''./downloads/data_{datetime.now().strftime("%d_%b_%Y")}.csv''' 
def ndcourts_run(CHROME_LOC, DOWNLOAD_LOC, script, num):
    """Scrape open felony cases from the North Dakota courts public search.

    For each hard-coded location index the criminal search form is
    submitted once per filing-date window (month-end to month-end from
    January 1979, plus a final window ending today). One CSV row is written
    per charge of every returned case.

    Args:
        CHROME_LOC: Location handed to ``initChromeDriver``.
        DOWNLOAD_LOC: Base directory; output is written to
            ``<DOWNLOAD_LOC>downloads/<num>_<script>.csv`` (recreated on
            every run).
        script: Scraper name used in the output file name.
        num: Numeric prefix of the output file name.

    Returns:
        None.
    """
    OUTPUT_FILE = DOWNLOAD_LOC + "downloads/" + str(num) + "_" + script + ".csv"
    header = ['Warrant_Last_Name', 'Warrant_First_Name', 'Warrant_Initial_Name',
              'Warrant_Case_Number', 'Citation_Number', 'Warrant_Birthdate',
              'Warrant_Date', 'Warrant_Location', 'Judicial_Officer',
              'Warrant_Type', 'Warrant_Status', 'Warrant_Description',
              'Source_Url', "source_file"]

    today = datetime.datetime.now().strftime("%m/%d/%Y")
    # An explicit MonthEnd offset avoids the deprecated '1m' frequency alias.
    dates = [s.strftime("%m/%d/%Y") for s in pd.date_range(
        start='01/01/1979', end=today, freq=pd.offsets.MonthEnd())]
    # The month-end series stops at the previous month end; add today so
    # the most recent filings are searched as well.
    if dates and dates[-1] != today:
        dates.append(today)
    start_date_li = dates[:-1]
    end_date_li = dates[1:]
    print("start_date_li =>\n", start_date_li)
    print("end_date_li =>\n", end_date_li)

    #-- Always start from a fresh file containing only the header --#
    if os.path.isfile(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)
    write_to_csv(header, OUTPUT_FILE)

    def _click(xpath):
        # JavaScript click, as used throughout this scraper.
        element = driver.find_element(By.XPATH, xpath)
        driver.execute_script("arguments[0].click()", element)

    def _cell(record, path):
        return record.xpath(f"normalize-space({path})").get()

    #-- Initializing the chromedriver --#
    driver = initChromeDriver(CHROME_LOC)
    # try/finally so Chrome is closed even when a lookup or wait fails.
    try:
        driver.set_page_load_timeout(100)
        driver.get("https://publicsearch.ndcourts.gov/default.aspx")

        indx_li = [1, 2, 5, 17, 21, 36, 46, 55, 59, 63]
        for indx in indx_li:
            print("processing index: ", indx)
            #-- Select the location --#
            sel = Select(driver.find_element(
                By.XPATH, "//select[@id='sbxControlID2']"))
            sel.select_by_index(indx)
            #-- Click on Criminal / Traffic --#
            _click("//a[contains(text(), 'Criminal')]")

            for sDate, eDate in zip(start_date_li, end_date_li):
                WebDriverWait(driver, 60).until(EC.visibility_of_element_located(
                    (By.XPATH, '//input[@id="DateFiled"]')))
                #-- Search by: Date Filed --#
                _click('//input[@id="DateFiled"]')
                time.sleep(1)
                #-- Case status: open --#
                _click('//input[@id="OpenOption"]')
                try:
                    #-- Case type: Felony --#
                    sel = Select(driver.find_element(
                        By.XPATH, '//select[@id="selCaseTypeGroups"]'))
                    sel.select_by_index(5)
                except Exception as e:
                    print("Could not find felony option, ", e)

                #-- Enter the filed date - start & end --#
                driver.find_element(
                    By.XPATH, "//input[@id='DateFiledOnAfter']").send_keys(sDate)
                driver.find_element(
                    By.XPATH, "//input[@id='DateFiledOnBefore']").send_keys(eDate)
                _click("//input[@id='SearchSubmit']")

                #-- CRIMINAL CASE RECORDS PAGE --#
                WebDriverWait(driver, 60).until(EC.visibility_of_element_located(
                    (By.XPATH, "//a[contains(text(), 'New Criminal')]")))
                time.sleep(2)
                resp = Selector(text=driver.page_source)

                if not resp.xpath("//b[contains(text(), 'No cases matched')]"):
                    records = resp.xpath("//colgroup/following-sibling::tbody/tr")
                    for record in records:
                        name = _cell(record, ".//td[3]/div[1]/text()")
                        if not name:
                            # Without a name the row has no defendant; the
                            # old code reused the previous row's name here.
                            continue
                        parts = name.split(",")
                        lname = parts[0].strip()
                        if len(parts) == 3:
                            fname = parts[1].strip()
                            initial = parts[2].strip()
                        else:
                            fname = parts[-1].strip()
                            initial = None

                        href = record.xpath(".//td[1]/a/@href").get()
                        charges = record.xpath(".//td[6]//tr/td")
                        for idx, charge in enumerate(charges):
                            temp = [
                                lname,
                                fname,
                                initial,
                                _cell(record, ".//td[1]/a/text()"),
                                _cell(record, f".//td[2]/div[{idx+1}]/text()"),
                                _cell(record, ".//td[3]/div[2]/text()"),
                                _cell(record, ".//td[4]/div[1]/text()"),
                                _cell(record, ".//td[4]/div[2]/text()"),
                                _cell(record, ".//td[4]/div[3]/text()"),
                                _cell(record, ".//td[5]/div[1]/text()"),
                                _cell(record, ".//td[5]/div[2]/text()"),
                                charge.xpath("normalize-space(.//text())").get(),
                                f"https://publicsearch.ndcourts.gov/{href}",
                                "ndcourts"
                            ]
                            write_to_csv(temp, OUTPUT_FILE)

                # Back to an empty criminal search form for the next window.
                _click("//a[contains(text(), 'New Criminal')]")

            #-- Back to the location search menu --#
            _click("//a[text()='Search Menu']")
    finally:
        #-- Shutting down chromedriver --#
        driver.quit()
    return
def pennco_run(CHROME_LOC, DOWNLOAD_LOC, script, num):
    """Scrape the Pennington County warrant list into a CSV file.

    Walks the paged grid inside the page's first iframe and writes one row
    per charge of every warrant row.

    Args:
        CHROME_LOC: Location handed to ``initChromeDriver``.
        DOWNLOAD_LOC: Base directory; output is written to
            ``<DOWNLOAD_LOC>downloads/<num>_<script>.csv`` (recreated on
            every run).
        script: Scraper name used in the output file name.
        num: Numeric prefix of the output file name.

    Returns:
        None.
    """
    OUTPUT_FILE = DOWNLOAD_LOC + "downloads/" + str(num) + "_" + script + ".csv"
    header = ['Warrant_Last_Name', 'Warrant_First_Name', 'Address_1',
              'Address_2', 'Warrant_Birthdate', 'Warrant_Crime', "source_file"]

    #-- Always start from a fresh file containing only the header --#
    if os.path.isfile(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)
    write_to_csv(header, OUTPUT_FILE)

    def get_cur_page(driver):
        # The current page label is rendered in brackets, e.g. "[3]".
        cur_icon = driver.find_element(By.CLASS_NAME, 'dxp-current')
        return cur_icon.text.strip('[').strip(']')

    #-- Initializing the chromedriver --#
    driver = initChromeDriver(CHROME_LOC)
    # try/finally so Chrome is closed even when a lookup or wait fails.
    try:
        driver.set_page_load_timeout(10)
        driver.get("https://www.pennco.org/?SEC=23C884F7-BC3F-4332-86BF-D5375C62DF3B")
        WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, "//iframe")))
        driver.switch_to.frame(0)

        page_icons = driver.find_elements(By.CLASS_NAME, "dxp-num")
        # No pager at all means a single page of results.
        last_page = (page_icons[-1].text.strip('[').strip(']')
                     if page_icons else None)

        # Scrape first, then decide whether to advance, so a one-page
        # result set is not skipped.
        while True:
            resp = Selector(text=driver.page_source)
            for warrant in resp.xpath("//tr[contains(@id, 'DataRow')]"):
                name = warrant.xpath(
                    "normalize-space(.//td[1]/span[1]/text())").get()
                person = [
                    name.split(",")[0].strip(),
                    name.split(",")[-1].strip(),
                    warrant.xpath("normalize-space(.//td[1]/span[2]/text())").get(),
                    warrant.xpath("normalize-space(.//td[1]/span[3]/text())").get(),
                    warrant.xpath("normalize-space(.//td[1]/span[4]/text())").get(),
                ]
                charges = [c.strip() for c in warrant.xpath(
                    ".//td[2]/span/text()").getall() if c.strip()]
                for charge in charges:
                    write_to_csv(person + [charge, "pennco"], OUTPUT_FILE)

            if last_page is None:
                break
            current_page = get_cur_page(driver)
            print(f"\n----------------------------\nCurrent Page is: {current_page}\n----------------------------\n")
            if current_page == last_page:
                break
            try:
                next_btn = driver.find_element(By.CLASS_NAME, 'dxWeb_pNext')
                next_btn.click()
                print("On to next page")
                time.sleep(3)
            except Exception as e:
                print("No next click -> ", e)
                break
    finally:
        #-- Shutting down the chromedriver --#
        driver.quit()
    return
def sedgwickcounty_run(CHROME_LOC, DOWNLOAD_LOC, script, num):
    """Scrape Sedgwick County sheriff warrants into a CSV file.

    Searches the last-name box once per letter a-z, opens the detail page
    of every result row and writes one CSV row per warrant.

    Args:
        CHROME_LOC: Location handed to ``initChromeDriver``.
        DOWNLOAD_LOC: Base directory; output is written to
            ``<DOWNLOAD_LOC>downloads/<num>_<script>.csv`` (recreated on
            every run).
        script: Scraper name used in the output file name.
        num: Numeric prefix of the output file name.

    Returns:
        None.
    """
    OUTPUT_FILE = DOWNLOAD_LOC + "downloads/" + str(num) + "_" + script + ".csv"
    header = ['Warrant_Last_Name', 'Warrant_First_Name', 'Warrant_Middle_Initial',
              'Warrant_Name_Suffix', 'Race', 'Sex', 'Age', 'Height', 'Weight',
              'Hair', 'Eyes', 'WarrantId', 'CourtCaseNo', 'WarrantType', 'Charge',
              'FelonyMisdemeanor', 'Bond', 'Source Url', "source_file"]
    search_url = "https://ssc.sedgwickcounty.org/SheriffWarrants/WarrantNameSearchForm.aspx"
    # Ids of the detail-page spans, in CSV column order (Age .. Bond).
    detail_ids = ['lblAge', 'lblHeight', 'lblWeight', 'lblHair', 'lblEyes',
                  'lblWarrantId', 'lblCourtCaseNo', 'lblWarrantType',
                  'lblCharge', 'lblFelonyMisdemeanor', 'lblBond']

    if os.path.isfile(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)
    write_to_csv(header, OUTPUT_FILE)

    #-- Initializing the chromedriver --#
    driver = initChromeDriver(CHROME_LOC)
    # try/finally so Chrome is closed even when a lookup or page load fails.
    try:
        driver.set_page_load_timeout(100)
        driver.get(search_url)

        for query in ascii_lowercase:
            print("\n#########\nProcessing QUERY =>", query, "\n#########\n")
            name_box = driver.find_element(By.XPATH, "//*[@id='tbLastName']")
            name_box.clear()
            name_box.send_keys(query)
            name_box.send_keys(Keys.ENTER)
            print("Found elements.")
            time.sleep(4)
            resp_obj = Selector(text=driver.page_source)
            print("Found html and resp_obj.")

            for row in resp_obj.xpath("//tbody/tr[@class='GridItem']"):
                print("\nProcess with ->", row)
                href = row.xpath(".//td[1]/a/@href").get()
                if not href:
                    # Nothing to open for this row.
                    continue
                # Plain strings: the stray trailing commas of the earlier
                # version made these values 1-tuples in the CSV.
                sub_url = f"https://ssc.sedgwickcounty.org/SheriffWarrants/{href}"
                print("\tSub URL Actual- ", sub_url)
                list_fields = [row.xpath("normalize-space(.//td[1]/a/text())").get()]
                list_fields += [
                    row.xpath(f"normalize-space(.//td[{col}]/text())").get()
                    for col in range(2, 7)
                ]

                driver.get(sub_url)
                response = Selector(text=driver.page_source)
                detail_fields = [
                    response.xpath(
                        f"normalize-space(//span[@id='{span_id}']/text())").get()
                    for span_id in detail_ids
                ]

                write_to_csv(list_fields + detail_fields + [sub_url, "sedgwickcounty"],
                             OUTPUT_FILE)
                print("wrote to file.")

            # Back to the search form for the next letter.
            driver.get(search_url)
    finally:
        driver.quit()
    return


def warrantsearch_run(CHROME_LOC, DOWNLOAD_LOC, script, num):
    """Scrape the Washington DOC warrant list into a CSV file.

    The output file is kept between runs: warrants whose DOC number is
    already present in it are not written again.

    Args:
        CHROME_LOC: Location handed to ``initChromeDriver``.
        DOWNLOAD_LOC: Base directory; output is appended to
            ``<DOWNLOAD_LOC>downloads/<num>_<script>.csv``.
        script: Scraper name used in the output file name.
        num: Numeric prefix of the output file name.

    Returns:
        None.
    """
    OUTPUT_FILE = DOWNLOAD_LOC + "downloads/" + str(num) + "_" + script + ".csv"

    #-- Load already-saved DOC numbers, or create the file with its header --#
    seen_doc_numbers = set()
    if os.path.isfile(OUTPUT_FILE):
        # Compare as text: int() on a blank or non-numeric DOC number
        # used to abort the whole run.
        saved = pd.read_csv(OUTPUT_FILE, dtype=str)['Warrant_Doc_Number']
        seen_doc_numbers = set(saved.dropna().str.strip())
        print("Please wait while navigating through the pages...")
    else:
        write_to_csv(['Warrant_Date', 'Warrant_Last_Name', 'Warrant_First_Name',
                      'Warrant_Crime', 'Warrant_County', 'Warrant_Doc_Number',
                      'Warrant_Birthdate', 'Warrant_Height', 'Warrant_Weight',
                      'Warrant_Age', 'Warrant_Eye_Color', 'Warrant_Hair_Color',
                      'Warrant_Race', 'Warrant_Image', "source_file"], OUTPUT_FILE)

    def _field(warrant, label):
        # Text of the <li> whose <strong> label contains ``label``.
        return warrant.xpath(
            f"normalize-space(.//strong[contains(text(), '{label}')]/parent::li/text())"
        ).get()

    #-- Initializing the chromedriver --#
    driver = initChromeDriver(CHROME_LOC)
    # try/finally so Chrome is closed even when a wait times out.
    try:
        driver.get("https://www.doc.wa.gov/information/warrants/default.aspx")
        WebDriverWait(driver, 3).until(EC.visibility_of_element_located(
            (By.XPATH, "//table[@id='WarrantsTable']")))

        cntr = 0
        while True:
            cntr += 1
            print(f"\n\nPAGE - {cntr}")
            resp = Selector(text=driver.page_source)
            warrants_li = resp.xpath(
                "//table[@id='WarrantsTable']/tbody/tr[@class='collapse']")

            for warrant in warrants_li:
                warrant_Doc_Number = _field(warrant, 'DOC Number')
                if warrant_Doc_Number in seen_doc_numbers:
                    continue
                name_parts = (_field(warrant, 'Name') or "").split(",")
                img_src = warrant.xpath(".//img/@src").get()
                temp = [
                    _field(warrant, 'Date'),
                    name_parts[0].strip(),
                    # A name without a comma has no first-name part.
                    name_parts[1].strip() if len(name_parts) > 1 else "",
                    _field(warrant, 'Crime'),
                    _field(warrant, 'County'),
                    warrant_Doc_Number,
                    _field(warrant, 'Birthdate'),
                    _field(warrant, 'Height'),
                    _field(warrant, 'Weight'),
                    _field(warrant, 'Age'),
                    _field(warrant, 'Eye Color'),
                    _field(warrant, 'Hair Color'),
                    _field(warrant, 'Race'),
                    f"https://www.doc.wa.gov{img_src}",
                    "warrant_usa"
                ]
                write_to_csv(temp, OUTPUT_FILE)

            next_page = resp.xpath("//a[text()='Next']/@href").get()
            if not next_page:
                break
            driver.get(f"https://www.doc.wa.gov{next_page}")
            WebDriverWait(driver, 3).until(EC.visibility_of_element_located(
                (By.XPATH, "//table[@id='WarrantsTable']")))
            time.sleep(1.5)
    finally:
        #-- Shutting down the chromedriver --#
        driver.quit()
    return
def weldgov_run(CHROME_LOC, DOWNLOAD_LOC, script, num):
    """Scrape Weld County sheriff warrants into a CSV file.

    Submits the search form once per last-name initial a-z (severity
    option index 2) and writes one CSV row per result row.

    Args:
        CHROME_LOC: Location handed to ``initChromeDriver``.
        DOWNLOAD_LOC: Base directory; output is written to
            ``<DOWNLOAD_LOC>downloads/<num>_<script>.csv`` (recreated on
            every run).
        script: Scraper name used in the output file name.
        num: Numeric prefix of the output file name.

    Returns:
        None.
    """
    OUTPUT_FILE = DOWNLOAD_LOC + "downloads/" + str(num) + "_" + script + ".csv"
    header = ['Warrant_Last_Name', 'Warrant_First_Name', 'Warrant_Address',
              'Warrant_Charge', 'Original_Charge', 'Bail_Amount',
              'Original_Warrant_Date', 'Date_Warrant_Entered', 'ORI',
              'Warrant_Number', 'Warrant_Birthdate', 'Gender', 'Height', 'Weight',
              'Hair', 'Eyes', 'Race', 'Crime_Type', "source_file"]

    #-- Always start from a fresh file containing only the header --#
    if os.path.isfile(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)
    write_to_csv(header, OUTPUT_FILE)

    def _text(row, path):
        return row.xpath(f"normalize-space({path})").get()

    def _after_colon(value):
        # "Label: value" -> "value"; empty cells become None.
        return value.split(":")[-1].strip() if value else None

    def _second_and_last(value):
        # Cells hold two labelled values; the original reads tokens [1]
        # and [-1]. Blank or short cells yield (None, None) instead of an
        # IndexError.
        tokens = value.strip().split(" ") if value else []
        if len(tokens) < 2:
            return None, None
        return tokens[1].strip(), tokens[-1].strip()

    #-- Initializing the chromedriver --#
    driver = initChromeDriver(CHROME_LOC)
    # try/finally so Chrome is closed even when a lookup or wait fails.
    try:
        driver.set_page_load_timeout(100)
        driver.get("https://apps1.weldgov.com/sheriff/warrants/index.cfm")

        for letter in ascii_lowercase:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.XPATH, "//form")))
            time.sleep(2)
            lname_box = driver.find_element(By.XPATH, '//input[@id="lname"]')
            lname_box.clear()
            lname_box.send_keys(letter)
            sel = Select(driver.find_element(By.XPATH, "//select[@id='severity']"))
            sel.select_by_index(2)
            search_btn = driver.find_element(
                By.XPATH, "//input[contains(@value, 'Search')]")
            driver.execute_script("arguments[0].click()", search_btn)

            WebDriverWait(driver, 15).until(EC.visibility_of_element_located(
                (By.XPATH, "(//table/tbody)[last()]/tr")))
            time.sleep(2)
            resp = Selector(text=driver.page_source)

            for warrant in resp.xpath("(//table/tbody)[last()]/tr"):
                # Read the name relative to this row; the page-level query
                # returned the first row's name for every warrant.
                name = warrant.xpath(".//td[1]/div/span/text()").get() or ""
                dob, gender = _second_and_last(_text(warrant, ".//td[2]/text()[2]"))
                height, weight = _second_and_last(_text(warrant, ".//td[2]/text()[3]"))
                hair, eyes = _second_and_last(_text(warrant, ".//td[2]/text()[4]"))

                temp = [
                    name.split(",")[0].strip(),
                    name.split(",")[-1].strip(),
                    _text(warrant, ".//td[1]/div/text()[3]"),
                    _after_colon(_text(warrant, ".//td[1]/div/text()[4]")),
                    _after_colon(_text(warrant, ".//td[1]/div/text()[5]")),
                    _after_colon(_text(warrant, ".//td[1]/div/text()[6]")),
                    _after_colon(_text(warrant, ".//td[1]/div/text()[7]")),
                    _after_colon(_text(warrant, ".//td[1]/div/text()[8]")),
                    _after_colon(_text(warrant, ".//td[1]/div/text()[9]")),
                    _after_colon(_text(warrant, ".//td[2]/text()[1]")),
                    dob,
                    gender,
                    height,
                    weight,
                    hair,
                    eyes,
                    _text(warrant, ".//td[2]/text()[5]"),
                    _text(warrant, ".//td[2]/text()[6]"),
                    "weldgov"
                ]
                write_to_csv(temp, OUTPUT_FILE)

            # Return to the search form for the next letter.
            back_btn = driver.find_element(
                By.XPATH, "//a[contains(text(),'Do another')]")
            driver.execute_script("arguments[0].click()", back_btn)
    finally:
        #-- Shutting down the chromedriver --#
        driver.quit()
    return
def wyomingmn_run(CHROME_LOC, DOWNLOAD_LOC, script, num):
    """Scrape the Minnesota DOC fugitive list into a CSV file.

    Reads every tile on the fugitive overview page, opens the linked
    offender page and writes one CSV row per offender.

    Args:
        CHROME_LOC: Location handed to ``initChromeDriver``.
        DOWNLOAD_LOC: Base directory; output is written to
            ``<DOWNLOAD_LOC>downloads/<num>_<script>.csv`` (recreated on
            every run).
        script: Scraper name used in the output file name.
        num: Numeric prefix of the output file name.

    Returns:
        None.
    """
    OUTPUT_FILE = DOWNLOAD_LOC + "downloads/" + str(num) + "_" + script + ".csv"
    header = ['Warrant_Last_Name', 'Warrant_First_Name', 'MNDOC Offender ID',
              'Warrant_Birthdate', 'Current_Status', 'Sentence_Date',
              'Release_Date', 'Agent_Name', 'Highest_Ranked_Offense',
              'Court_File_Number', 'Warrant_Image', "source_file"]
    base_url = "https://coms.doc.state.mn.us"
    tile_xpath = ('//div[@class="partialContents"]'
                  '/div[contains(@class, "fugitiveImageWrapper")]')
    # Offender-page expressions, in CSV column order (Offender ID ..
    # Court_File_Number).
    detail_xpaths = [
        'normalize-space(//span[@id="OID"]/text())',
        'normalize-space(//span[@id="DOB"]/text())',
        'normalize-space(//span[@id="OffenderStatus"]/text())',
        'normalize-space(//span[@id="SentenceDate"]/text())',
        'normalize-space(//span[@id="ReleaseDate"]/text())',
        'normalize-space(//span[@id="CaseWorker"]/a/text())',
        'normalize-space(//span[contains(text(), "Highest Ranked Offense")]/parent::div/following-sibling::div/span/text())',
        'normalize-space(//span[contains(text(), "Court File Number")]/parent::div/following-sibling::div/span/text())',
    ]

    if os.path.isfile(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)
    write_to_csv(header, OUTPUT_FILE)

    #-- Initializing the chromedriver --#
    driver = initChromeDriver(CHROME_LOC)
    # try/finally so Chrome is closed even when a wait or page load fails.
    try:
        driver.set_page_load_timeout(100)
        driver.get("https://coms.doc.state.mn.us/PublicViewer/Fugitive")
        WebDriverWait(driver, 30).until(EC.visibility_of_element_located(
            (By.XPATH, "//div[@class='partialContents']/div[contains(@class, 'fugitiveImageWrapper')]")))
        time.sleep(5)
        resp = Selector(text=driver.page_source)

        for warrant in resp.xpath(tile_xpath):
            href = warrant.xpath(
                ".//div[contains(@class, 'ImageText')]/a/@href").get()
            if not href:
                # No offender page to open; avoid requesting ".../None".
                print("no detail link found, skipping tile")
                continue
            url = f"{base_url}{href}"
            print("NOW PROCESSING: ", url)
            img_src = warrant.xpath(
                './/div[contains(@class, "ImageOuter")]/img/@src').get()
            Warrant_Image = f"{base_url}{img_src}" if img_src else None
            print("found image path->", Warrant_Image)

            print("Iterate...")
            time.sleep(2)
            driver.get(url)
            response = Selector(text=driver.page_source)
            # Check the offender page just loaded, not the overview page.
            if response:
                print("got page html and resp")
            else:
                print("no page html and resp")

            name = response.xpath(
                'normalize-space(//span[@id="LastName"]/text())').get()
            name_tokens = name.split(" ")
            item_list = [name_tokens[-1], " ".join(name_tokens[:-1])]
            item_list += [response.xpath(expr).get() for expr in detail_xpaths]
            item_list += [Warrant_Image, "wyomingmn"]
            write_to_csv(item_list, OUTPUT_FILE)
    finally:
        #-- Shutting down the chromedriver --#
        driver.quit()
    return
def record_error(script, e, path="scraping_errors.csv"):
    """Append one error record to a CSV log.

    Each row holds the script name, the current local timestamp and the
    error value.

    Args:
        script: Name of the scraper that failed.
        e: The exception (or message) to log; written via ``str()`` by the
            csv writer.
        path: Log file to append to. Defaults to ``scraping_errors.csv`` in
            the current working directory.

    Returns:
        None.
    """
    # str(datetime) has the same text form the former pandas Timestamp
    # produced, so the log format is unchanged.
    cur_datetime = datetime.datetime.now()
    with open(path, mode="a", newline="", encoding="utf-8") as err_file:
        csv_writer = csv.writer(err_file, delimiter=",")
        csv_writer.writerow([script, cur_datetime, e])
    return


def run_spiders():
    """Run the six Scrapy spiders in one ``CrawlerProcess`` and report.

    Any exception is caught and reported rather than raised. The outcome is
    printed and shown in a Tk message box.

    Returns:
        ``"SUCCESS [Spiders]!"`` when the crawl finished, ``"ERROR!"``
        otherwise.
    """
    try:
        print("\n##################\nRunning 6 Spiders.\n##################\n")
        ###########################################
        # SETTINGS
        ###########################################
        setting = {
            'BOT_NAME': 'Googlebot',
            'SPIDER_MODULES': ['warrants_usa.spiders'],
            'NEWSPIDER_MODULE': 'warrants_usa.spiders',
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
            # robots.txt is deliberately not obeyed.
            'ROBOTSTXT_OBEY': False,
            'AUTOTHROTTLE_ENABLED': True,
            'FEED_EXPORT_ENCODING': 'utf-8',
            'SELENIUM_DRIVER_NAME': 'chrome',
            # --- SETTINGS FOR WINDOWS ---
            'SELENIUM_DRIVER_EXECUTABLE_PATH': os.environ.get('chromedriver'),
            'SELENIUM_DRIVER_ARGUMENTS': ['--log-level=3', '--start-maximized', '--incognito']
        }
        ###########################################
        print("init process.")
        process = CrawlerProcess(setting)
        print("init spider classes.")
        # crawl() takes the spider classes themselves; the instances the
        # earlier version built here were never used.
        spider_classes = [
            BrownsoSpider,
            ClackamasSpider,
            FlatheadSpider,
            HallcountyneSpider,
            LincolnSpider,
            TomgreencountysheriffSpider,
        ]
        print("iterate spiders.")
        for spider_name in spider_classes:
            print("Running spider %s" % (spider_name))
            # "query" is a custom argument forwarded to the spider.
            process.crawl(spider_name, query="dvh")
        print("process start.")
        process.start()
        report = "SUCCESS [Spiders]!"
        msg = "."
    except Exception as e:
        report = "ERROR!"
        msg = str(e)
    print(report, ": ", msg)
    messagebox.showinfo(title=report, message=msg)
    return report
def run_dallas():
    """Run the Dallas County scraper and report the outcome.

    Failures are caught, printed and shown in a Tk message box rather than
    raised.

    Returns:
        ``"SUCCESS [Dallas]!"`` on success, ``"ERROR!"`` otherwise.
    """
    try:
        print("\n##################\nRunning Dallas.\n##################\n")
        script = "dallascounty"
        num = 7
        print("\n##################\n NOW SCRAPING WITH SCRIPT: ", script, "\n##########################")
        dallascounty_run(CHROME_LOC="./", DOWNLOAD_LOC="./", script=script, num=num)
        print(f"""SUCCESS {script}""")
    except Exception as e:
        report, msg = "ERROR!", str(e)
    else:
        report, msg = "SUCCESS [Dallas]!", "."

    print(report, ": ", msg)
    messagebox.showinfo(title=report, message=msg)
    return report
msg = str(e) print(report,": ", msg) messagebox.showinfo(title=report, message=msg) # time.sleep(2) return report def run_flde(): script = "fdle_state" num = 16 try: print("\n##################\n NOW SCRAPING WITH SCRIPT: ", script, "\n##########################") flde_run(CHROME_LOC = "./", DOWNLOAD_LOC= "./", script=script, num=num) print(f"""SUCCESS {script}""") report = f"""SUCCESS {script}""" msg ="." except Exception as e: report = "ERROR!" msg = str(e) print(report,": ", msg) messagebox.showinfo(title=report, message=msg) # time.sleep(2) return report def run_harriscounty(): script = "harriscountyso" num = 8 try: print("\n##################\n NOW SCRAPING WITH SCRIPT: ", script, "\n##########################") harriscounty_run(CHROME_LOC = "./", DOWNLOAD_LOC= "./", script=script, num=num) print(f"""SUCCESS {script}""") report = f"""SUCCESS {script}""" msg ="." except Exception as e: report = "ERROR!" msg = str(e) print(report,": ", msg) messagebox.showinfo(title=report, message=msg) # time.sleep(2) return report def run_jocosheriff(): script = "jocosheriff" num = 9 try: print("\n##################\n NOW SCRAPING WITH SCRIPT: ", script, "\n##########################") jocosheriff_run(CHROME_LOC = "./", DOWNLOAD_LOC= "./", script=script, num=num) print(f"""SUCCESS {script}""") report = f"""SUCCESS {script}""" msg ="." except Exception as e: report = "ERROR!" msg = str(e) print(report,": ", msg) messagebox.showinfo(title=report, message=msg) # time.sleep(2) return report def run_ndcourts(): script = "ndcourts" num = 10 try: print("\n##################\n NOW SCRAPING WITH SCRIPT: ", script, "\n##########################") ndcourts_run(CHROME_LOC = "./", DOWNLOAD_LOC= "./", script=script, num=num) print(f"""SUCCESS {script}""") report = f"""SUCCESS {script}""" msg ="." except Exception as e: report = "ERROR!" 
msg = str(e) print(report,": ", msg) messagebox.showinfo(title=report, message=msg) # time.sleep(2) return report def run_pennco(): script = "pennco" num = 15 try: print("\n##################\n NOW SCRAPING WITH SCRIPT: ", script, "\n##########################") pennco_run(CHROME_LOC = "./", DOWNLOAD_LOC= "./", script=script, num=num) print(f"""SUCCESS {script}""") report = f"""SUCCESS {script}""" msg ="." except Exception as e: report = "ERROR!" msg = str(e) print(report,": ", msg) messagebox.showinfo(title=report, message=msg) # time.sleep(2) return report def run_sedgwickcounty(): script = "sedgwickcounty" num = 11 try: print("\n##################\n NOW SCRAPING WITH SCRIPT: ", script, "\n##########################") sedgwickcounty_run(CHROME_LOC = "./", DOWNLOAD_LOC= "./", script=script, num=num) print(f"""SUCCESS {script}""") report = f"""SUCCESS {script}""" msg ="." except Exception as e: report = "ERROR!" msg = str(e) print(report,": ", msg) messagebox.showinfo(title=report, message=msg) # time.sleep(2) return report def run_warrantsearch(): script = "warrantSearch" num = 13 try: print("\n##################\n NOW SCRAPING WITH SCRIPT: ", script, "\n##########################") warrantsearch_run(CHROME_LOC = "./", DOWNLOAD_LOC= "./", script=script, num=num) print(f"""SUCCESS {script}""") report = f"""SUCCESS {script}""" msg ="." except Exception as e: report = "ERROR!" msg = str(e) print(report,": ", msg) messagebox.showinfo(title=report, message=msg) # time.sleep(2) return report def run_weldgov(): script = "weldgov" num = 12 try: print("\n##################\n NOW SCRAPING WITH SCRIPT: ", script, "\n##########################") weldgov_run(CHROME_LOC = "./", DOWNLOAD_LOC= "./", script=script, num=num) print(f"""SUCCESS {script}""") report = f"""SUCCESS {script}""" msg ="." except Exception as e: report = "ERROR!" 
msg = str(e) print(report,": ", msg) messagebox.showinfo(title=report, message=msg) # time.sleep(2) return report def run_wyomingmn(): script = "wyomingmn" num = 14 try: print("\n##################\n NOW SCRAPING WITH SCRIPT: ", script, "\n##########################") wyomingmn_run(CHROME_LOC = "./", DOWNLOAD_LOC= "./", script=script, num=num) print(f"""SUCCESS {script}""") report = f"""SUCCESS {script}""" msg ="." except Exception as e: report = "ERROR!" msg = str(e) print(report,": ", msg) messagebox.showinfo(title=report, message=msg) # time.sleep(2) return report # ----------------- # TKINTer WIDGETS # ------------------ label = Label(window, width=30, text="CLICK BUTTONS TO RUN SCRAPERS:", bg="yellow") label.pack(pady=10) btn_spiders = Button(window, text="Scrape 6 Spiders", width=30, height=1, bg="light blue", command=run_spiders) btn_spiders.pack(pady=10) #(bronso, clackamas, flathead, hallcounty, lincoln, tomgreencountysheriff) btn_spiders = Button(window, text="Scrape: Dallas", width=30, height=1, bg="light blue", command=run_dallas) btn_spiders.pack(pady=10) btn_spiders = Button(window, text="Scrape: Jocosheriff", width=30, height=1, bg="light blue", command=run_jocosheriff) btn_spiders.pack(pady=10) btn_spiders = Button(window, text="Scrape: NDCourts", width=30, height=1, bg="light blue", command=run_ndcourts) btn_spiders.pack(pady=10) btn_spiders = Button(window, text="Scrape: Sedgwickcounty", width=30, height=1, bg="light blue", command=run_sedgwickcounty) btn_spiders.pack(pady=10) btn_spiders = Button(window, text="Scrape: Warrantsearch", width=30, height=1, bg="light blue", command=run_warrantsearch) btn_spiders.pack(pady=10) btn_spiders = Button(window, text="Scrape: Weldgov", width=30, height=1, bg="light blue", command=run_weldgov) btn_spiders.pack(pady=10) btn_spiders = Button(window, text="Scrape: WyomingMN", width=30, height=1, bg="light blue", command=run_wyomingmn) btn_spiders.pack(pady=10) btn_spiders = Button(window, text="Scrape: Pennco", 
width=30, height=1, bg="light blue", command=run_pennco) btn_spiders.pack(pady=10) btn_spiders = Button(window, text="Scrape: FLDE", width=30, height=1, bg="light blue", command=run_flde) btn_spiders.pack(pady=10) btn_spiders = Button(window, text="Scrape: Harriscounty", width=30, height=1, bg="light blue", command=run_harriscounty) btn_spiders.pack(pady=10) #btn_spiders = Button(window, text="Combine & Save to DB.", width=30, # height=1, bg="light blue", command=combine_files_and_save_to_db) #btn_spiders.pack(pady=10) window.mainloop()