Web Scraping/Crawling in python using BeautifulSoup

Below is Python code that performs web scraping/crawling using the BeautifulSoup and requests libraries.

First, install the required libraries using the following commands:

1. pip install beautifulsoup4

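2. pip install requests

3. pip install pandas openpyxl lxml (the script also imports pandas and lxml; pandas needs openpyxl to read .xlsx files)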

Here is the code:

import csv

from datetime import datetime

import random

import time

import requests

from lxml import html

from bs4 import BeautifulSoup

import pandas as pd

import numpy as np

# Scrape the "tparams" details table for every vessel identifier listed in the
# first column of an Excel sheet, and append one CSV row per vessel to a
# date-stamped output file.

counter = 0         # vessels written so far; 0 means the CSV header is still pending
d = {}              # label -> value mapping for the vessel currently being processed
Vessel_record = []  # not used by this script; kept so the module-level names are unchanged
Vessel_info = []    # [label, value] pairs parsed from the current vessel's table
urls = []           # one details-page URL per identifier read from the spreadsheet
csv_columns = []    # header labels, fixed by the first vessel that gets written

url = 'https://www.vesselfinder.com/vessels/details/'
file_Imo = "C:\\Users\\vmalge\\PycharmProjects\\vesselInfoAutomation\\InputTestData\\IMO1.xlsx"

# Only column A is read; cells containing the text 'NA' are parsed as missing.
df = pd.read_excel(file_Imo, index_col=None, na_values=['NA'], usecols="A")

for _, row in df.iterrows():
    for _, column in row.items():
        # A missing cell would otherwise become the URL suffix "nan".
        if pd.isna(column):
            continue
        # A numeric column that contains missing cells is read as float, which
        # would render an identifier as "1234567.0"; restore the integer form.
        if isinstance(column, float) and column.is_integer():
            column = int(column)
        urls.append(url + str(column))

user_agent_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 '
    'Mobile/15E148 Safari/604.1',
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 '
    'Safari/537.36 Edg/87.0.664.75',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 '
    'Safari/537.36 Edge/18.18363',
]

# One user agent is picked at random and reused for every request in this run.
header = {"User-Agent": random.choice(user_agent_list)}

proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}

# Resolve the output file name once so a run that crosses midnight does not
# split its rows across two files (the header is only written to the first).
output_path = 'vessel_book_' + datetime.now().strftime("%Y_%m_%d") + '.csv'

for link in urls:
    print(link)

    table = None
    try:
        # `proxies` must be passed by keyword: the second positional parameter
        # of requests.get is `params`, which would put the proxy map into the
        # query string instead of routing the request through the proxy.
        # The timeout keeps one unresponsive host from hanging the whole run.
        r_inner = requests.get(link, proxies=proxies, headers=header, timeout=30)
        r_inner.raise_for_status()
    except requests.RequestException as exc:
        # One failing URL should not abort the remaining ones.
        print('Request failed for {}: {}'.format(link, exc))
    else:
        soup = BeautifulSoup(r_inner.content, 'html.parser')
        table = soup.find("table", {"class": "tparams"})

    if table:
        # html.parser does not add a <tbody> that is absent from the markup,
        # so fall back to searching the table itself for rows.
        table_body = table.find('tbody')
        if table_body is None:
            table_body = table

        # Start from an empty list for each vessel so that fields from earlier
        # pages cannot leak into this vessel's record.
        Vessel_info = []
        for row in table_body.find_all('tr'):
            cols = [ele.text.strip() for ele in row.find_all('td')]
            # dict() needs exactly (label, value) pairs; skip any other row shape.
            if len(cols) == 2:
                Vessel_info.append(cols)

        d = dict(Vessel_info)
        print(d)

        with open(output_path, 'a', newline='') as f:
            w = csv.writer(f)
            if counter == 0:
                csv_columns = list(d)
                w.writerow(csv_columns)
            # Emit values in header order so every row lines up with the header
            # even when a page omits or reorders fields. Labels that were not
            # present on the first vessel are not written.
            w.writerow([d.get(name, '') for name in csv_columns])
            counter = counter + 1

    # Pause between requests to avoid hammering the site.
    time.sleep(2)

Selenium, Java, testNG, Apache POI, Maven, Log4j, Extent report, C#, python and Asp.net blog

Search This Blog

Web Scraping/Crawling in python using BeautifulSoup

Labels

Comments

Post a Comment

Popular posts from this blog

Add, remove, search an item in listview in C#

Add worklog in Jira using Python

Some GUI examples in Python using customtkinter