// UrlAnalysisUtil.java (4.5 KB)

  package com.webchat.common.util.web;

  import com.google.common.collect.Sets;
  import com.webchat.domain.dto.UrlAnalysisResultDTO;
  import lombok.extern.slf4j.Slf4j;
  import org.apache.commons.lang3.StringUtils;
  import org.jsoup.Connection;
  import org.jsoup.Jsoup;
  import org.jsoup.nodes.Document;
  import org.jsoup.nodes.Element;
  import org.jsoup.select.Elements;

  import java.util.HashSet;
  import java.util.Set;
  import java.util.regex.Pattern;
  12. @Slf4j
  13. public class UrlAnalysisUtil {
  14. /**
  15. * 网址解析
  16. *
  17. * @param url
  18. * @return
  19. */
  20. public static UrlAnalysisResultDTO analyze(String url) {
  21. UrlAnalysisResultDTO result = new UrlAnalysisResultDTO();
  22. try {
  23. Connection conn = Jsoup.connect(url);
  24. // 解析ICON
  25. result.setIcon(analysisIcon(conn));
  26. // 解析标题
  27. result.setTitle(analysisTitle(conn));
  28. // 解析关键词
  29. result.setKeywords(analysisKeywords(conn));
  30. // 解析概述
  31. result.setDescription(analysisDescription(conn));
  32. // 解析首图
  33. result.setFirstImage(analysisFirstImage(conn));
  34. } catch (Exception e) {
  35. log.error("URL ANALYSIS ERROR. url:{}", url, e);
  36. }
  37. return result;
  38. }
  39. /***
  40. * 解析标题
  41. * @param conn
  42. * @return
  43. */
  44. private static String analysisTitle(Connection conn) {
  45. if (conn == null) {
  46. return "";
  47. }
  48. try {
  49. return conn.get().title();
  50. } catch (Exception e) {
  51. return "";
  52. }
  53. }
  54. /***
  55. * 解析概述
  56. * @param conn
  57. * @return
  58. */
  59. private static String analysisDescription(Connection conn) {
  60. if (conn == null) {
  61. return "";
  62. }
  63. try {
  64. Elements elements = conn.get().getElementsByTag("meta");
  65. for (Element element : elements) {
  66. if (element.attr("name").contains("description") || element.attr("property").contains("description")) {
  67. return element.attr("content");
  68. }
  69. }
  70. } catch (Exception e) {
  71. return "";
  72. }
  73. return "";
  74. }
  75. /***
  76. * 解析ICON
  77. * @param conn
  78. * @return
  79. */
  80. private static String analysisIcon(Connection conn) {
  81. if (conn == null) {
  82. return null;
  83. }
  84. try {
  85. Elements elements = conn.get().getElementsByTag("link");
  86. for (Element element : elements) {
  87. if (element.attr("rel").contains("icon")) {
  88. if (StringUtils.isBlank(element.attr("href"))) {
  89. continue;
  90. }
  91. return element.attr("abs:href");
  92. }
  93. }
  94. } catch (Exception e) {
  95. return null;
  96. }
  97. return null;
  98. }
  99. /***
  100. * 解析关键词
  101. * @param conn
  102. * @return
  103. */
  104. private static Set<String> analysisKeywords(Connection conn) {
  105. Set<String> keywords = new HashSet<>();
  106. if (conn == null) {
  107. return keywords;
  108. }
  109. try {
  110. Elements elements = conn.get().getElementsByTag("meta");
  111. for (Element element : elements) {
  112. if (element.attr("name").contains("keywords")) {
  113. String keywordsStr = element.attr("content");
  114. if (StringUtils.isNotBlank(keywordsStr)) {
  115. return Sets.newHashSet(keywordsStr.split(","));
  116. }
  117. }
  118. }
  119. } catch (Exception e) {
  120. return keywords;
  121. }
  122. return keywords;
  123. }
  124. /***
  125. * 解析首图
  126. * @param conn
  127. * @return
  128. */
  129. private static String analysisFirstImage(Connection conn) {
  130. if (conn == null) {
  131. return "";
  132. }
  133. try {
  134. Elements elements = conn.get().getElementsByTag("meta");
  135. for (Element element : elements) {
  136. if (element.attr("name").contains("image") || element.attr("itemprop").contains("image")) {
  137. return element.attr("abs:content");
  138. }
  139. }
  140. Elements imgElements = conn.get().getElementsByTag("img");
  141. for (Element img : imgElements) {
  142. if (StringUtils.isBlank(img.attr("src"))) {
  143. continue;
  144. }
  145. return img.attr("abs:src");
  146. }
  147. } catch (Exception e) {
  148. return "";
  149. }
  150. return "";
  151. }
  152. }