mtg-decks-downloader

Tool to download Magic: The Gathering decklists from the Internet
git clone https://kevincorvisier.fr/git/mtg-decks-downloader.git
Log | Files | Refs | README

Crawler.java (1932B)


      1 package fr.kevincorvisier.mtg.dd;
      2 
      3 import java.io.ByteArrayInputStream;
      4 import java.io.IOException;
      5 import java.io.InputStream;
      6 import java.net.URL;
      7 import java.util.List;
      8 
      9 import org.openqa.selenium.By;
     10 import org.openqa.selenium.WebDriver;
     11 import org.openqa.selenium.WebElement;
     12 import org.openqa.selenium.WindowType;
     13 import org.openqa.selenium.htmlunit.HtmlUnitDriver;
     14 import org.springframework.stereotype.Service;
     15 
     16 import com.google.common.util.concurrent.RateLimiter;
     17 
     18 import lombok.extern.slf4j.Slf4j;
     19 
     20 @Slf4j
     21 @Service
     22 public class Crawler
     23 {
     24 	private static final double CRAWL_DELAY_SECONDS = 30d;
     25 
     26 	private final WebDriver driver = new HtmlUnitDriver();
     27 	private final RateLimiter rateLimiter;
     28 
     29 	public Crawler()
     30 	{
     31 		rateLimiter = RateLimiter.create(1d / CRAWL_DELAY_SECONDS);
     32 	}
     33 
     34 	/*
     35 	 * WebDriver operations with rate limitation
     36 	 */
     37 
     38 	public void navigateTo(final URL url)
     39 	{
     40 		try
     41 		{
     42 			rateLimiter.acquire();
     43 			driver.navigate().to(url);
     44 
     45 			log.info("navigateTo: url={}", url);
     46 		}
     47 		catch (final Exception e)
     48 		{
     49 			log.warn("navigateTo: url={}", url, e);
     50 			throw e;
     51 		}
     52 	}
     53 
     54 	/*
     55 	 * Other operations with rate limitation
     56 	 */
     57 
     58 	public InputStream openStream(final URL url) throws IOException
     59 	{
     60 		try
     61 		{
     62 			rateLimiter.acquire();
     63 			final String currHandle = driver.getWindowHandle();
     64 
     65 			driver.switchTo().newWindow(WindowType.TAB);
     66 			driver.navigate().to(url);
     67 			final String pageSource = driver.getPageSource();
     68 			driver.close();
     69 			driver.switchTo().window(currHandle);
     70 
     71 			log.info("openStream: url={}", url);
     72 			return new ByteArrayInputStream(pageSource.getBytes());
     73 		}
     74 		catch (final Exception e)
     75 		{
     76 			log.warn("openStream: url={}", url, e);
     77 			throw e;
     78 		}
     79 	}
     80 
     81 	/*
     82 	 * WebDriver operations without rate limitation
     83 	 */
     84 
     85 	public List<WebElement> findElements(final By by)
     86 	{
     87 		return driver.findElements(by);
     88 	}
     89 
     90 	public WebElement findElement(final By by)
     91 	{
     92 		return driver.findElement(by);
     93 	}
     94 }