123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164 |
- package com.webchat.common.util.web;
import com.google.common.collect.Sets;
import com.webchat.domain.dto.UrlAnalysisResultDTO;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.HashSet;
import java.util.Set;
- @Slf4j
- public class UrlAnalysisUtil {
- /**
- * 网址解析
- *
- * @param url
- * @return
- */
- public static UrlAnalysisResultDTO analyze(String url) {
- UrlAnalysisResultDTO result = new UrlAnalysisResultDTO();
- try {
- Connection conn = Jsoup.connect(url);
- // 解析ICON
- result.setIcon(analysisIcon(conn));
- // 解析标题
- result.setTitle(analysisTitle(conn));
- // 解析关键词
- result.setKeywords(analysisKeywords(conn));
- // 解析概述
- result.setDescription(analysisDescription(conn));
- // 解析首图
- result.setFirstImage(analysisFirstImage(conn));
- } catch (Exception e) {
- log.error("URL ANALYSIS ERROR. url:{}", url, e);
- }
- return result;
- }
- /***
- * 解析标题
- * @param conn
- * @return
- */
- private static String analysisTitle(Connection conn) {
- if (conn == null) {
- return "";
- }
- try {
- return conn.get().title();
- } catch (Exception e) {
- return "";
- }
- }
- /***
- * 解析概述
- * @param conn
- * @return
- */
- private static String analysisDescription(Connection conn) {
- if (conn == null) {
- return "";
- }
- try {
- Elements elements = conn.get().getElementsByTag("meta");
- for (Element element : elements) {
- if (element.attr("name").contains("description") || element.attr("property").contains("description")) {
- return element.attr("content");
- }
- }
- } catch (Exception e) {
- return "";
- }
- return "";
- }
- /***
- * 解析ICON
- * @param conn
- * @return
- */
- private static String analysisIcon(Connection conn) {
- if (conn == null) {
- return null;
- }
- try {
- Elements elements = conn.get().getElementsByTag("link");
- for (Element element : elements) {
- if (element.attr("rel").contains("icon")) {
- if (StringUtils.isBlank(element.attr("href"))) {
- continue;
- }
- return element.attr("abs:href");
- }
- }
- } catch (Exception e) {
- return null;
- }
- return null;
- }
- /***
- * 解析关键词
- * @param conn
- * @return
- */
- private static Set<String> analysisKeywords(Connection conn) {
- Set<String> keywords = new HashSet<>();
- if (conn == null) {
- return keywords;
- }
- try {
- Elements elements = conn.get().getElementsByTag("meta");
- for (Element element : elements) {
- if (element.attr("name").contains("keywords")) {
- String keywordsStr = element.attr("content");
- if (StringUtils.isNotBlank(keywordsStr)) {
- return Sets.newHashSet(keywordsStr.split(","));
- }
- }
- }
- } catch (Exception e) {
- return keywords;
- }
- return keywords;
- }
- /***
- * 解析首图
- * @param conn
- * @return
- */
- private static String analysisFirstImage(Connection conn) {
- if (conn == null) {
- return "";
- }
- try {
- Elements elements = conn.get().getElementsByTag("meta");
- for (Element element : elements) {
- if (element.attr("name").contains("image") || element.attr("itemprop").contains("image")) {
- return element.attr("abs:content");
- }
- }
- Elements imgElements = conn.get().getElementsByTag("img");
- for (Element img : imgElements) {
- if (StringUtils.isBlank(img.attr("src"))) {
- continue;
- }
- return img.attr("abs:src");
- }
- } catch (Exception e) {
- return "";
- }
- return "";
- }
- }
|