Below is Python code to do web scraping/crawling using the BeautifulSoup and requests libraries.
First, you need to install the required libraries using the following commands:
1. pip install beautifulsoup4
2. pip install requests
3. pip install pandas openpyxl (pandas reads the Excel file of IMO numbers; openpyxl is the .xlsx engine it uses)
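All four can also be installed in one command:
pip install beautifulsoup4 requests pandas openpyxl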
Here is the code:
import csv
import random
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
import pandas as pd
counter = 0
urls = []

url = 'https://www.vesselfinder.com/vessels/details/'
file_Imo = "C:\\Users\\vmalge\\PycharmProjects\\vesselInfoAutomation\\InputTestData\\IMO1.xlsx"

# Read the IMO numbers from column A of the Excel sheet and build one
# vessel-details URL per IMO number.
df = pd.read_excel(file_Imo, index_col=None, na_values=['NA'], usecols="A")
for i, row in df.iterrows():
    for j, column in row.items():
        urls.append(url + str(column))
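# A small pool of desktop and mobile user agents; one is picked at random
# below so the script does not always send the same browser fingerprint.
# (Moving the random pick inside the request loop would rotate it per request.)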
user_agent_list = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
'Mozilla/5.0 (iPhone; CPU iPhone OS 14_4_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 '
'Mobile/15E148 Safari/604.1',
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 '
'Safari/537.36 Edg/87.0.664.75',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 '
'Safari/537.36 Edge/18.18363',
]
header = {"User-Agent": user_agent_list[random.randint(0, len(user_agent_list) - 1)]}
proxies = {
'http': 'http://10.10.1.10:3128',
'https': 'http://10.10.1.10:1080',
}
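# For each vessel page: fetch it, parse the "tparams" details table, and
# append the parameter/value pairs as one row in a dated CSV file.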
for link in urls:
    print(link)
    Vessel_info = []  # reset per vessel so rows from the previous page do not leak in
    r_inner = requests.get(link, proxies=proxies, headers=header)  # proxies must be passed by keyword
    soup = BeautifulSoup(r_inner.content, 'html.parser')
    table = soup.find("table", {"class": "tparams"})
    if table:
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]  # strip surrounding whitespace
            Vessel_info.append(cols)
        d = dict(Vessel_info)  # each table row is a [parameter, value] pair
        print(d)
        with open('vessel_book_' + datetime.now().strftime("%Y_%m_%d") + '.csv', 'a', newline='') as f:
            w = csv.writer(f)
            if counter == 0:
                w.writerow(d.keys())  # write the header row only once
            w.writerow(d.values())
            counter = counter + 1
    time.sleep(2)  # pause between requests to avoid hammering the site
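VesselFinder can rate-limit or time out on occasion, so you may want something more defensive than a bare requests.get(). Here is a minimal sketch of a retry wrapper; the fetch_page name, retry count, timeout, and back-off values are my own illustrative choices, not part of the original script:

def fetch_page(link, retries=3):
    """Fetch a URL, retrying on network errors or bad HTTP status codes."""
    for attempt in range(retries):
        try:
            r = requests.get(link, proxies=proxies, headers=header, timeout=10)
            r.raise_for_status()  # raise on 4xx/5xx responses
            return r
        except requests.RequestException as e:
            print('Attempt', attempt + 1, 'failed for', link, ':', e)
            time.sleep(2 * (attempt + 1))  # simple linear back-off
    return None

Inside the main loop you would then call r_inner = fetch_page(link) and skip the link (continue) when it returns None.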