Here is example code showing how to do web scraping in Python:
import requests
from lxml import html
from bs4 import BeautifulSoup
import csv
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.worksheet.table import Table, TableStyleInfo
from datetime import datetime
# Scrape the first results page and collect the per-vessel detail-page links.
page_number = 1
Baseurl = 'https://www.vesselfinder.com'
urls = 'https://www.vesselfinder.com/vessels'
# Browser-like User-Agent: the site rejects the default requests UA.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/39.0.2171.95 Safari/537.36',
}
# Fetch and parse the first listing page.
r = requests.get(urls, headers=header)
soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find('table', class_='results')
# The first <td> of every result row carries the anchor to the vessel's
# detail page; collect each href.
page_links = [row.find('td').a['href'] for row in table.tbody.find_all('tr')]
# Visit each vessel detail page and append its key/value info table to a
# single worksheet, then save the workbook with a timestamped name.
wb = Workbook()
ws = wb.active
for link in page_links:
    detail_url = Baseurl + link
    print(detail_url)
    r_inner = requests.get(detail_url, headers=header)
    # read_html parses every <table> in the page into a list of DataFrames;
    # index 1 is the vessel-info table on this site — TODO confirm if the
    # page layout changes.
    df_list = pd.read_html(r_inner.text)
    df = df_list[1]
    print(df)
    # Append the DataFrame rows (including its header row, no index column).
    # Loop variable renamed from `r` so it no longer shadows the HTTP
    # response object used elsewhere in the script.
    for excel_row in dataframe_to_rows(df, header=True, index=False):
        ws.append(excel_row)
# Style everything written so far as one named Excel table spanning the
# used range.
style = TableStyleInfo(name="TableStyleMedium9", showRowStripes=True)
table = Table(displayName="Vessel_Info",
              ref="A1:" + get_column_letter(ws.max_column) + str(ws.max_row))
table.tableStyleInfo = style
ws.add_table(table)
# Timestamped filename so repeated runs do not overwrite earlier exports.
wb.save('Vessel_Info_' + datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p") + '.xlsx')
# Guard: on the last results page find() returns None, and subscripting it
# with ['href'] would raise TypeError.
next_anchor = soup.find('a', href=True, class_='pagination-next')
if next_anchor is not None:
    print('Next Page link :', Baseurl + next_anchor['href'])
# Walk the remaining result pages by following each page's "next" link.
# NOTE(review): this loop only fetches and re-parses successive pages; it
# does not re-run the link/detail scraping for them — confirm whether that
# is intended.
while True:
    next_anchor = soup.find('a', href=True, class_='pagination-next')
    if next_anchor is None:
        # Last page reached: no pagination link. Stop cleanly instead of
        # crashing on a None subscript (the original `if pagination:` test
        # was always truthy, so its break branch was unreachable).
        break
    pagination = Baseurl + next_anchor['href']
    page_number += 1
    print('Next Page link :', pagination)
    # Reuse the same `header` dict defined above (the original redefined an
    # identical copy here each iteration).
    r = requests.get(pagination, headers=header)
    # Re-parse so the next iteration sees this page's pagination link.
    soup = BeautifulSoup(r.content, 'html.parser')
Comments
Post a Comment