我有一个在 Executor.newCachedThreadPool() 的帮助下工作的解析器,并面临这样一个事实,即写入 JSON 文件的记录的主线程在子线程之前执行。结果,我们有一个空文件......我对多线程的主题了解得很差,无法理解错误。我尝试在主线程上使用Join()方法,但最终程序在接近这部分时就挂了
主程序
import model.Product;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
public class Main {
public static void main(String[] args) throws InterruptedException {
String rootUrl = "example.com";
System.out.println("Started parsing: " + rootUrl);
long m = System.currentTimeMillis();
HtmlParser htmlParser = new HtmlParser();
List<Product> productList = new CopyOnWriteArrayList<>();
htmlParser.parse(rootUrl, productList);
Printer.printToJson(productList);
System.out.println("Finish: completed in " + ((double) System.currentTimeMillis() - m) / 1000 + " seconds");
}
}
HtmlParser.java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import ua.bala.model.Product;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
public class HtmlParser {
private static AtomicInteger httpRequestsCounter = new AtomicInteger(0);
public static AtomicInteger getHttpRequestsCounter() {
return httpRequestsCounter;
}
public void parse(String url, List<Product> productList) {
try {
Document page = getPage(url);
parsePage(page, productList);
} catch (IOException e) {
e.printStackTrace();
}
}
private static Document getPage(String url) throws IOException {
Document document = Jsoup.connect(url).get();
httpRequestsCounter.getAndIncrement();
return document;
}
private void parsePage(Document page, List<Product> productList) {
Elements productElements = page.select("a.dgBQdu");
ExecutorService service = Executors.newCachedThreadPool();
for (Element element: productElements){
service.execute(() -> {
Long articleID = Long.parseLong(element.attr("id"));
String name = "NAME";
String brand = "BRAND";
BigDecimal price = new BigDecimal(BigInteger.ZERO);
Set<String> colors = new HashSet<>();
String url = "https://www.aboutyou.de" + element.attr("href");
Document innerPage;
try {
innerPage = getPage(url);
Element innerElement = innerPage.selectFirst("[data-test-id='BuyBox']");
name = innerElement.selectFirst("div.dZjUXd").text();
brand = innerElement.selectFirst("[data-test-id='BrandLogo']").attr("alt");
colors = new HashSet<>(innerElement.select("span.jlvxcb-1").eachText());
String priceStr = innerElement.selectFirst("div.dWWxvw > span").text().replace("ab ","").replace(" EUR","").replace(",", ".");
price = new BigDecimal(priceStr);
} catch (IOException e) {
e.printStackTrace();
}
Product product = new Product(articleID, name, brand, colors, price, url);
addProduct(product, productList);
});
}
service.shutdown();
}
private synchronized void addProduct(Product product, List<Product> productList){
System.out.println("Product " + product.getID() + " parsed");
System.out.print(product);
productList.add(product);
System.out.printf("Product %d added to list\n%n", product.getID());
}
}
打印机.java
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import model.Product;
import java.io.*;
import java.util.Comparator;
import java.util.List;
public class Printer {
private static final String path = "";
private static final String fileName = "productsOutput";
public static void printToJson(List<Product> products){
products.sort(Comparator.comparing(Product::getID));
System.out.println("Product list start printing to JSON");
try (final Writer writer = new FileWriter(path + fileName + ".json")) {
Gson gson = new GsonBuilder().create();
gson.toJson(products, writer);
System.out.println("Product list printed to JSON");
System.out.printf("Amount of triggered HTTP requests: %s%nAmount of extracted products: %s%n",
HtmlParser.getHttpRequestsCounter(), products.size());
} catch (IOException e) {
e.printStackTrace();
}
}
}
打印机.java
package model;
import lombok.*;
import java.math.BigDecimal;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
@NoArgsConstructor
@Getter
@Setter
public class Product {
private static AtomicLong productsCounter = new AtomicLong(1);
private Long ID;
private Long articleID;
private String name;
private String brand;
private BigDecimal price;
private Set<String> colors;
private String url;
{
ID = productsCounter.getAndIncrement();
}
public Product(Long articleID, String name, String brand, Set<String> colors, BigDecimal price, String url) {
this.articleID = articleID;
this.name = name;
this.brand = brand;
this.price = price;
this.colors = colors;
this.url = url;
}
public static AtomicLong getProductsCounter() {
return productsCounter;
}
@Override
public String toString() {
return String.format("%d\t%d\t%s\t%s\t%s\t%s\t%s\n", ID, articleID, name, brand, price, colors, url);
}
}
有几种方法可以克服这个问题。使用可观察对象,或阻塞主线程或使用接口而不阻塞主线程。对我来说,界面将是一个不错的选择。如果您熟悉 java 接口,则可以实现一个接口以打印最近解析的产品。这是一步一步的方法:
接口类:
public interface ProductsListener {
void onProductsReady(List<Product> products);
}
MainImpl 类(不是 Main 类本身):
public class MainImpl implements ProductListener {
// When product list loading is done this func will be called
void onProductsRead(List<Product> products) {
Printer.printToJson(productList);
}
}
在主类中:
public class Main {
public static void main(String[] args) throws InterruptedException {
MainImpl listener = new MainImpl();
htmlParser.setProductListener(listener);
// Rest of the code...
}
}
在 HtmlParser 类中:
public class HtmlParser {
private MainImpl productListener;
//...
public void setProductListener(MainImpl listener) {
// Alternatively you can do it in a constructor
productListener = listener;
}
//...
private void parsePage(Document page, List<Product> productList) {
Elements productElements = page.select("a.dgBQdu");
int parseCount = 0;
ExecutorService service = Executors.newCachedThreadPool();
for (Element element: productElements){
service.execute(() -> {
Long articleID = Long.parseLong(element.attr("id"));
String name = "NAME";
String brand = "BRAND";
BigDecimal price = new BigDecimal(BigInteger.ZERO);
Set<String> colors = new HashSet<>();
String url = "https://www.aboutyou.de" + element.attr("href");
Document innerPage;
try {
innerPage = getPage(url);
Element innerElement = innerPage.selectFirst("[data-test-id='BuyBox']");
name = innerElement.selectFirst("div.dZjUXd").text();
brand = innerElement.selectFirst("[data-test-id='BrandLogo']").attr("alt");
colors = new HashSet<>(innerElement.select("span.jlvxcb-1").eachText());
String priceStr = innerElement.selectFirst("div.dWWxvw > span").text().replace("ab ","").replace(" EUR","").replace(",", ".");
price = new BigDecimal(priceStr);
} catch (IOException e) {
e.printStackTrace();
}
Product product = new Product(articleID, name, brand, colors, price, url);
addProduct(product, productList);
parseCount++; // Count each element that has been parsed
// Check if all elements have been parsed
if(parseCount >= productElements.size()) {
// All products are done, notify the listener class
productListener.onProductsReady(productList);
}
});
}
}
未测试,但接口逻辑必须工作。
本文收集自互联网,转载请注明来源。
如有侵权,请联系 [email protected] 删除。
我来说两句