I’ve had the pleasure of working on Python RPA scripts, focusing on automation and data extraction from both modern and legacy web applications.
Two key takeaways were that automating and extracting data from modern web applications is relatively straightforward, but working with older, more complex applications presented some unique challenges.
- driver.get_cookies(): this function retrieves the current tab's cookies, and we can reuse those same cookies to hit the API endpoint directly (the one that contains the data we need). This shortcuts the UI automation and lets us scrape the data faster and more effectively.
def get_hash_value(customer_id, document_id):
    """Fetch the hashed value for a customer document via the site's API.

    Reuses the authenticated session cookies from the live Selenium
    ``driver`` so the server accepts the request as if it came from the
    browser, avoiding slow UI navigation.

    Args:
        customer_id: Customer identifier, sent as ``CustId``.
        document_id: Document identifier, sent as ``DocumentId``.

    Returns:
        The decoded JSON response body on HTTP 200, otherwise ``None``
        (a diagnostic message is printed on failure).
    """
    # Serialize the browser's cookies into a single Cookie header value.
    # join() avoids the quadratic cost of repeated string concatenation.
    cookies = driver.get_cookies()
    cookie_string = "".join(
        f"{cookie['name']}={cookie['value']}; " for cookie in cookies
    )
    headers = {
        'authority': 'www.example.com',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'cookie': cookie_string,
        # NOTE(review): origin host differs from the request/referer host
        # below -- confirm this is intentional.
        'origin': 'https://qa.devfovea.com',
        'referer': 'https://www.example.com/secret',
        'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
    }
    # The endpoint expects the real query packed inside a form field.
    data = {'querystring': f'CustId={customer_id}&DocumentId={document_id}'}
    url = 'https://www.example.com/GetHashedValue'
    response = requests.post(url, headers=headers, data=data)
    if response.status_code == 200:
        return response.json()
    print(f"Request failed with status code {response.status_code}")
    return None  # explicit: callers get None on any non-200 response
Enter fullscreen mode Exit fullscreen mode
2. To extract specific data from a table, you can locate the table element itself, then iterate through its rows and cell values.
"""Demo: extract all cell data from an HTML table with Selenium."""
from selenium import webdriver
from selenium.webdriver.common.by import By

# Initialize the WebDriver (ensure chromedriver is on your PATH).
driver = webdriver.Chrome()
try:
    # Navigate to the webpage.
    url = 'https://example.com/page-with-table'  # Replace with the target URL
    driver.get(url)

    # Locate the table by its ID (replace with the actual ID of the table).
    table = driver.find_element(By.ID, 'example-table')

    # Extract every row of the table.
    rows = table.find_elements(By.TAG_NAME, 'tr')

    # Collect the text of each data cell, row by row.
    table_data = []
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, 'td')
        cell_data = [cell.text for cell in cells]
        # Header rows contain only <th> elements, so `cells` is empty
        # for them -- skip to avoid collecting empty lists.
        if cell_data:
            table_data.append(cell_data)

    # Print the extracted data.
    for row in table_data:
        print(row)
finally:
    # Always release the browser, even if navigation or lookup raises.
    driver.quit()
Enter fullscreen mode Exit fullscreen mode
3. The last one, but a useful one: whenever I get stuck writing scripts for an old and shabby website, I use the Selenium IDE browser extension to record my actions and clicks and export them as a Python script. You can find the Selenium IDE extension in the Chrome Web Store.
No comments yet.