Skip to main content

Web Scraping in Python

 The following code shows how to do web scraping in Python:


import csv
from datetime import datetime
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.worksheet.table import Table, TableStyleInfo

# Scrape the first page of the VesselFinder vessel listing, copy the second
# HTML table of every linked vessel page into a timestamped Excel workbook,
# then follow the "next page" links of the listing until there are none left.
page_number = 1
Baseurl = 'https://www.vesselfinder.com'
urls = 'https://www.vesselfinder.com/vessels'
# Browser-like User-Agent sent with every request made by this script.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/39.0.2171.95 Safari/537.36',
}


def _next_page_url(page_soup):
    """Return the absolute URL of the page's 'pagination-next' link.

    Returns None when the page has no such link, which is how the end of
    the listing is detected.
    """
    anchor = page_soup.find('a', href=True, class_='pagination-next')
    if anchor is None:
        return None
    return Baseurl + anchor['href']


# Get HTML Content
r = requests.get(urls, headers=header)
soup = BeautifulSoup(r.content, 'html.parser')

# Collect the relative link found in the first cell of every results row.
page_links = []
results_table = soup.find('table', class_='results')
for row in results_table.tbody.find_all('tr'):
    first_cell = row.find('td')
    page_links.append(first_cell.a['href'])

my_heads = ['Key', 'Value']
wb = Workbook()
ws = wb.active

for link in page_links:
    print(Baseurl + link)
    r_inner = requests.get(Baseurl + link, headers=header)
    # read_html parses every <table> on the page into a list of DataFrames.
    # The markup is wrapped in StringIO because passing a literal HTML
    # string is deprecated in recent pandas releases.
    df_list = pd.read_html(StringIO(r_inner.text))
    df = df_list[1]  # the second table on the vessel page
    print(df)
    # A distinct loop variable keeps the response object `r` intact.
    for sheet_row in dataframe_to_rows(df, header=True, index=False):
        ws.append(sheet_row)

style = TableStyleInfo(name="TableStyleMedium9", showRowStripes=True)

# Format everything written to the sheet as a single Excel table.
table = Table(displayName="Vessel_Info",
              ref="A1:" + get_column_letter(ws.max_column) + str(ws.max_row))
table.tableStyleInfo = style
ws.add_table(table)
wb.save('Vessel_Info_' + datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p") + '.xlsx')

pagination = _next_page_url(soup)
if pagination is not None:
    print('Next Page link :', pagination)

# Follow the "next" links until a page no longer offers one. Looking the
# link up once per page and testing it for None lets the loop end cleanly
# instead of failing when the last page has no 'pagination-next' anchor.
# NOTE(review): pages after the first are fetched and parsed here, but their
# rows are not added to the workbook - confirm whether that is intended.
while pagination is not None:
    page_number += 1
    print('Next Page link :', pagination)
    # Get HTML Content
    r = requests.get(pagination, headers=header)
    soup = BeautifulSoup(r.content, 'html.parser')
    pagination = _next_page_url(soup)


# Disabled scratch code kept as a bare string literal; it is never executed.
"""
with pd.ExcelWriter('Vessel_Info.xlsx', mode='a', engine='openpyxl') as writer:
df.to_excel(writer, sheet_name='Sheet_name_1', header=my_heads)



items = soup.findAll("a", {"class": "pagination-next"})
"""

Comments

Popular posts from this blog

Add, remove, search an item in listview in C#

Below is the C# code which will help you to add, remove and search operations on listview control in C#. Below is the design view of the project: Below is the source code of the project: using System; using System.Collections.Generic; using System.ComponentModel; using System.Data; using System.Drawing; using System.Linq; using System.Text; using System.Threading.Tasks; using System.Windows.Forms; namespace Treeview_control_demo {     public partial class Form2 : Form     {         public Form2()         {             InitializeComponent();             listView1.View = View.Details;                   }         private void button1_Click(object sender, EventArgs e)         {             if (textBox1.Text.Trim().Length == 0)...

Add worklog in Jira using Python

 Below is Python code to add a worklog in Jira. You need to install the requests library for this. Here is the code: import requests from requests.auth import HTTPBasicAuth import json url = "https://your jira address here/rest/api/2/issue/ticket_number/worklog" auth = HTTPBasicAuth("username", "jira access token") headers = {     "Accept": "application/json",     "Content-Type": "application/json" } payload = json.dumps({     "comment": {         "content": [             {                 "content": [                     {                         "text": "This is for QA Testing",                         "type": "text"                     } ...

Some GUI examples in Python using customtkinter

 Some GUI examples in Python using customtkinter import customtkinter import os from PIL import Image class ScrollableCheckBoxFrame(customtkinter.CTkScrollableFrame):     def __init__(self, master, item_list, command=None, **kwargs):         super().__init__(master, **kwargs)         self.command = command         self.checkbox_list = []         for i, item in enumerate(item_list):             self.add_item(item)     def add_item(self, item):         checkbox = customtkinter.CTkCheckBox(self, text=item)         if self.command is not None:             checkbox.configure(command=self.command)         checkbox.grid(row=len(self.checkbox_list), column=0, pady=(0, 10))         self.checkbox_list.append(checkbox)     def remove_item(self, it...