Crawler.java (1932B)
package fr.kevincorvisier.mtg.dd;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.WindowType;
import org.openqa.selenium.htmlunit.HtmlUnitDriver;
import org.springframework.stereotype.Service;

import com.google.common.util.concurrent.RateLimiter;

import lombok.extern.slf4j.Slf4j;

/**
 * Facade over a single {@link HtmlUnitDriver} whose page loads are throttled by a shared
 * {@link RateLimiter}.
 * <p>
 * {@link #navigateTo(URL)} and {@link #openStream(URL)} both acquire a permit from the same limiter
 * before loading a page; {@link #findElements(By)} and {@link #findElement(By)} query the driver
 * directly, without acquiring a permit.
 */
@Slf4j
@Service
public class Crawler
{
	/** Delay, in seconds, between two permits handed out by the rate limiter. */
	private static final double CRAWL_DELAY_SECONDS = 30d;

	private final WebDriver driver = new HtmlUnitDriver();
	private final RateLimiter rateLimiter;

	/**
	 * Creates a crawler issuing one permit every {@link #CRAWL_DELAY_SECONDS} seconds.
	 */
	public Crawler()
	{
		// RateLimiter.create takes a rate in permits per second, hence the inverse of the delay.
		rateLimiter = RateLimiter.create(1d / CRAWL_DELAY_SECONDS);
	}

	/*
	 * WebDriver operations with rate limitation
	 */

	/**
	 * Waits for a rate-limiter permit, then loads {@code url} in the driver's current window.
	 * <p>
	 * Any exception raised while doing so is logged at WARN level and rethrown unchanged.
	 *
	 * @param url the page to load
	 */
	public void navigateTo(final URL url)
	{
		try
		{
			rateLimiter.acquire();
			driver.navigate().to(url);

			log.info("navigateTo: url={}", url);
		}
		catch (final Exception e)
		{
			log.warn("navigateTo: url={}", url, e);
			throw e;
		}
	}

	/*
	 * Other operations with rate limitation
	 */

	/**
	 * Waits for a rate-limiter permit, loads {@code url} in a temporary tab and returns the page
	 * source reported by the driver as an in-memory stream.
	 * <p>
	 * The temporary tab is closed and the driver is switched back to the window that was current on
	 * entry, whether or not the page load succeeds. Any exception raised is logged at WARN level and
	 * rethrown unchanged.
	 *
	 * @param url the resource to load
	 * @return the page source, encoded as UTF-8
	 * @throws IOException declared for callers; the visible implementation does not throw it itself
	 */
	public InputStream openStream(final URL url) throws IOException
	{
		try
		{
			rateLimiter.acquire();
			final String currHandle = driver.getWindowHandle();

			final String pageSource;
			driver.switchTo().newWindow(WindowType.TAB);
			try
			{
				driver.navigate().to(url);
				pageSource = driver.getPageSource();
			}
			finally
			{
				// Always dispose of the temporary tab and restore the caller's window; otherwise a
				// failed load would leave the driver pointing at the wrong window for later calls.
				try
				{
					driver.close();
				}
				finally
				{
					driver.switchTo().window(currHandle);
				}
			}

			log.info("openStream: url={}", url);
			// Explicit charset so the produced bytes do not depend on the platform default encoding.
			return new ByteArrayInputStream(pageSource.getBytes(StandardCharsets.UTF_8));
		}
		catch (final Exception e)
		{
			log.warn("openStream: url={}", url, e);
			throw e;
		}
	}

	/*
	 * WebDriver operations without rate limitation
	 */

	/**
	 * Delegates to {@link WebDriver#findElements(By)} on the underlying driver.
	 *
	 * @param by the locator to apply
	 * @return the elements returned by the driver for {@code by}
	 */
	public List<WebElement> findElements(final By by)
	{
		return driver.findElements(by);
	}

	/**
	 * Delegates to {@link WebDriver#findElement(By)} on the underlying driver.
	 *
	 * @param by the locator to apply
	 * @return the element returned by the driver for {@code by}
	 */
	public WebElement findElement(final By by)
	{
		return driver.findElement(by);
	}
}