|
@@ -1,8 +1,54 @@
|
|
|
package com.qmth.cqb.utils;
|
|
|
|
|
|
+import java.util.regex.Matcher;
|
|
|
+import java.util.regex.Pattern;
|
|
|
+
|
|
|
+
|
|
|
/**
 * General-purpose static helpers.
 *
 * Created by songyue on 16/12/27.
 */
public final class CommonUtils {

    /** A {@code <!DOCTYPE ...>} declaration, from the opening {@code <} through the first {@code >}. */
    private static final Pattern DOCTYPE =
            Pattern.compile("<!doctype[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * A whole {@code <head>...</head>} element including its content. The {@code \b} after the
     * tag name keeps {@code <header>} from being mistaken for {@code <head>}.
     */
    private static final Pattern HEAD_ELEMENT = Pattern.compile(
            "<\\s*head\\b[^>]*>[\\s\\S]*?<\\s*/\\s*head\\s*>", Pattern.CASE_INSENSITIVE);

    /** A whole {@code <script>...</script>} element including its content. */
    private static final Pattern SCRIPT_ELEMENT = Pattern.compile(
            "<\\s*script\\b[^>]*>[\\s\\S]*?<\\s*/\\s*script\\s*>", Pattern.CASE_INSENSITIVE);

    /** A whole {@code <style>...</style>} element including its content. */
    private static final Pattern STYLE_ELEMENT = Pattern.compile(
            "<\\s*style\\b[^>]*>[\\s\\S]*?<\\s*/\\s*style\\s*>", Pattern.CASE_INSENSITIVE);

    /**
     * The {@code <} of any opening tag whose name starts with {@code div}, {@code p}, {@code h},
     * {@code br}, {@code td} or {@code th}. These are prefix matches, so e.g. {@code <pre>},
     * {@code <html>} and {@code <hr>} are included as well.
     */
    private static final Pattern SEPARATING_TAG_START =
            Pattern.compile("<(?=div|p|h|br|td|th)", Pattern.CASE_INSENSITIVE);

    /** Any remaining markup tag. */
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>");

    /**
     * Extracts the visible text from an HTML string.
     *
     * <p>Processing steps, in order:
     * <ol>
     *   <li>remove the first {@code <!DOCTYPE ...>} declaration; an unterminated declaration
     *       (no closing {@code >}) is left untouched;</li>
     *   <li>remove {@code <head>}, {@code <script>} and {@code <style>} elements together with
     *       their content;</li>
     *   <li>insert a space in front of the separating opening tags (div, p, h*, br, td, th) so
     *       that text from adjacent blocks does not run together;</li>
     *   <li>remove every remaining tag and trim the result.</li>
     * </ol>
     *
     * <p>Tag names are matched case-insensitively. Character entities such as {@code &amp;} or
     * {@code &nbsp;} are NOT decoded; they are returned exactly as they appear in the input.
     *
     * @param htmlStr the HTML markup to process; may be {@code null}
     * @return the trimmed text content, or an empty string when {@code htmlStr} is {@code null}
     *         or empty
     */
    public static String extractText(String htmlStr) {
        if (htmlStr == null || htmlStr.isEmpty()) {
            return "";
        }

        // Only the first doctype is dropped here; the pattern requires a closing '>' so a
        // truncated declaration can no longer cause the input to be duplicated.
        String text = DOCTYPE.matcher(htmlStr).replaceFirst("");

        // Elements whose content is never visible text are removed as a whole.
        text = HEAD_ELEMENT.matcher(text).replaceAll("");
        text = SCRIPT_ELEMENT.matcher(text).replaceAll("");
        text = STYLE_ELEMENT.matcher(text).replaceAll("");

        // Must run before the tags are stripped, otherwise the block boundaries are lost.
        text = SEPARATING_TAG_START.matcher(text).replaceAll(" <");

        text = ANY_TAG.matcher(text).replaceAll("");

        return text.trim();
    }

}
|