Web Scraping using Jsoup, Selenium and Java
In this article we will see how to do web scraping with Jsoup and Selenium in Java.
You can download the Jsoup JAR from the link below:
http://www.java2s.com/Code/Jar/j/Downloadjsoup160jar.htm
Below is the full code to do the web scraping and write the data to a text file.
In the following example we navigate to a web page that has a drop-down with 12 values. For each value, we select it and click a search button, which navigates to a page whose data we have to scrape; that data is spread over multiple pages, so we scrape all of the data present on those pages. We then navigate back to the home page (the page on which we selected the drop-down value and clicked the search button), select the next drop-down value, and repeat the same procedure. This repeats until we reach the end of the drop-down values, i.e. for all 12 drop-down values. Please leave a comment if this works for you.
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.support.CacheLookup;
import org.openqa.selenium.support.FindBy;
import org.openqa.selenium.support.How;
import org.openqa.selenium.support.ui.Select;
import org.openqa.selenium.support.ui.WebDriverWait;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
/**
 * TestNG test class that drives Chrome through Selenium, steps through the options of a
 * drop-down, fetches each resulting page with Jsoup and appends the text of one selected
 * element to a text file.
 *
 * <p>All locators, the start URL, the chromedriver location and the output path are
 * placeholders that must be replaced before the class can run.
 *
 * <p>NOTE(review): the driver created in {@link #beforeTest()} is never quit in this class,
 * so the browser stays open after the run — confirm whether teardown happens elsewhere.
 */
public class JAVA_WEB_SCRAPPING_DEM0 {
    /** Placeholder path of the text file that receives the scraped text. */
    private static final String OUTPUT_FILE_PATH = "path to a file with txt file name";

    /** URL of the page reached after the most recent search click. */
    String currentUrl = null;

    /** Browser session created in {@link #beforeTest()}. */
    public WebDriver driver;

    /** Placeholder for the start page that contains the drop-down. */
    String url = "url of a webpage/website";

    // NOTE(review): the two @FindBy fields below are not referenced by any method in this
    // class, and no PageFactory initialisation is visible here — presumably unused; confirm.
    @FindBy(how = How.XPATH, using = "xpath")
    @CacheLookup
    WebElement ele_dropdown;

    @FindBy(how = How.XPATH, using = "xpath")
    @CacheLookup
    WebElement search_btn;

    /**
     * Starts Chrome with pop-up blocking disabled, opens {@link #url}, maximizes the window
     * and sets a 10 second implicit wait for element lookups.
     *
     * @throws IOException declared by the original signature; kept for compatibility
     */
    @BeforeTest(alwaysRun = true)
    public void beforeTest() throws IOException {
        System.setProperty("webdriver.chrome.driver", "location of a chromedriver");

        HashMap<String, Object> chromePrefs = new HashMap<>();
        chromePrefs.put("profile.default_content_settings.popups", 0);

        ChromeOptions options = new ChromeOptions();
        options.setExperimentalOption("prefs", chromePrefs);
        options.addArguments("disable-popup-blocking");

        driver = new ChromeDriver(options);
        driver.get(url);
        driver.manage().window().maximize();
        // The implicit wait is a session-wide setting, so it is configured once here.
        driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
    }

    /**
     * For drop-down indices 1 to 11: selects the option, clicks the search element, downloads
     * the resulting URL with Jsoup, writes the text of the first matching element followed by
     * a separator to the output file, and navigates back.
     *
     * <p>NOTE(review): the loop starts at index 1 and stops before 12, so option 0 is never
     * selected — presumably a placeholder entry; confirm against the real drop-down.
     *
     * @throws InterruptedException if the thread is interrupted during the fixed pause
     * @throws IOException if the output file cannot be written or the page cannot be fetched
     */
    @Test(priority = 0, alwaysRun = true)
    public void selecteleDropdown() throws InterruptedException, IOException {
        try {
            File outputFile = new File(OUTPUT_FILE_PATH);
            if (outputFile.createNewFile()) {
                System.out.println("File created: " + outputFile.getName());
            } else {
                System.out.println("File already exists.");
            }
        } catch (IOException e) {
            System.out.println("An error occurred.");
            e.printStackTrace();
        }

        // try-with-resources closes (and flushes) the writer even when a Selenium, Jsoup or
        // I/O failure aborts the loop part-way through.
        try (FileWriter myWriter = new FileWriter(OUTPUT_FILE_PATH)) {
            for (int i = 1; i < 12; i++) {
                Thread.sleep(3000);

                // Look the drop-down up again on every pass: the page is left and re-entered
                // by the back navigation at the end of the previous pass.
                Select dropdown = new Select(driver.findElement(By.name("webelement")));
                dropdown.selectByIndex(i);
                System.out.println(dropdown.getOptions().get(i).getText());

                driver.findElement(By.xpath("webelement")).click();
                currentUrl = driver.getCurrentUrl();
                System.out.println(currentUrl);

                // Jsoup issues its own HTTP GET for the URL; it does not read the browser's DOM.
                Document doc = Jsoup.connect(currentUrl).get();
                Element parentElement =
                        doc.select("web element tag name for which you have to get all data").first();

                // first() yields null when the selector matches nothing; skip the text instead
                // of failing with a NullPointerException.
                if (parentElement != null) {
                    myWriter.write(parentElement.text());
                } else {
                    System.out.println("No matching element found at: " + currentUrl);
                }
                myWriter.write("==========================================");

                driver.navigate().back();
            }
        }
    }
}
Comments
Post a Comment