|
@@ -15,6 +15,7 @@ import java.util.regex.Matcher;
|
|
|
import java.util.regex.Pattern;
|
|
|
import java.util.stream.Stream;
|
|
|
|
|
|
+import org.apache.commons.lang3.StringEscapeUtils;
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
import org.dom4j.Attribute;
|
|
|
import org.dom4j.Document;
|
|
@@ -275,6 +276,95 @@ public final class CommonUtils {
|
|
|
BigDecimal formatNumber = new BigDecimal(number);
|
|
|
return formatNumber.setScale(2, RoundingMode.HALF_UP).doubleValue();
|
|
|
}
|
|
|
+
|
|
|
+ public static String repairHtmlStr(String htmlStr)throws Exception{
|
|
|
+ htmlStr = htmlStr.trim();
|
|
|
+ if(htmlStr.toLowerCase().contains("<!doctype html ")){
|
|
|
+ int index1 = htmlStr.toLowerCase().indexOf("<!doctype html ");
|
|
|
+ int index2 = htmlStr.indexOf('>',index1 + 1);
|
|
|
+ htmlStr = htmlStr.substring(0, index1) + htmlStr.substring(index2 + 1);
|
|
|
+ }
|
|
|
+ while(htmlStr.toLowerCase().contains("<br ")){
|
|
|
+ int index1 = htmlStr.toLowerCase().indexOf("<br ");
|
|
|
+ int index2 = htmlStr.toLowerCase().indexOf(">",index1 + 1);
|
|
|
+ htmlStr = htmlStr.substring(0, index1) + "<br/>" + htmlStr.substring(index2 + 1);
|
|
|
+ }
|
|
|
+ while(htmlStr.toLowerCase().endsWith("<br>") || htmlStr.toLowerCase().endsWith("<br/>")){
|
|
|
+ if(htmlStr.toLowerCase().endsWith("<br>")){
|
|
|
+ htmlStr = htmlStr.substring(0, htmlStr.length()-"<br>".length());
|
|
|
+ }else if(htmlStr.toLowerCase().endsWith("<br/>")){
|
|
|
+ htmlStr = htmlStr.substring(0, htmlStr.length()-"<br/>".length());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ htmlStr = htmlStr.replace("<br>", "<br/>").replace("<BR>", "<br/>");
|
|
|
+
|
|
|
+ {//补全META标签
|
|
|
+ int imgIndex = indexOfRegex(htmlStr,"<((meta)|(META)) ");
|
|
|
+ while(imgIndex > 0){
|
|
|
+ int flag = htmlStr.indexOf(">", imgIndex);
|
|
|
+ if(htmlStr.charAt(flag - 1) != '/'){
|
|
|
+ htmlStr = htmlStr.substring(0,flag) + "/" + htmlStr.substring(flag);
|
|
|
+ }
|
|
|
+ imgIndex = indexOfRegex(htmlStr,"<((meta)|(META)) ",flag);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ {//补全img标签
|
|
|
+ int imgIndex = indexOfRegex(htmlStr,"<((img)|(IMG)) ");
|
|
|
+ while(imgIndex > 0){
|
|
|
+ int flag = htmlStr.indexOf(">", imgIndex);
|
|
|
+ if(htmlStr.charAt(flag - 1) != '/'){
|
|
|
+ htmlStr = htmlStr.substring(0,flag) + "/" + htmlStr.substring(flag);
|
|
|
+ }
|
|
|
+ imgIndex = indexOfRegex(htmlStr,"<((img)|(IMG)) ",flag);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //添加body标签
|
|
|
+ if(!htmlStr.toLowerCase().contains("<body")){
|
|
|
+ htmlStr = "<body>"+htmlStr+"</body>";
|
|
|
+ }
|
|
|
+ return new String(htmlStr.getBytes("UTF-8"));
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 从指定的位置开始查找第一个匹配正则表达式的字符串的位置
|
|
|
+ * @param str
|
|
|
+ * @param regex 正则表达式
|
|
|
+ * @param fromIndex 指定的起始位置
|
|
|
+ * @return
|
|
|
+ */
|
|
|
+ public static int indexOfRegex(String str,String regex,int fromIndex){
|
|
|
+ int index = indexOfRegex(str.substring(fromIndex),regex);
|
|
|
+ if(index < 0){
|
|
|
+ return -1;
|
|
|
+ }
|
|
|
+ return fromIndex + index;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 查找第一个匹配正则表达式的字符串的位置
|
|
|
+ * @param str
|
|
|
+ * @param regex 正则表达式
|
|
|
+ * @return
|
|
|
+ */
|
|
|
+ public static int indexOfRegex(String str,String regex){
|
|
|
+ Pattern p = Pattern.compile(regex);
|
|
|
+ Matcher m = p.matcher(str);
|
|
|
+ if(m.find()){
|
|
|
+ return m.start();
|
|
|
+ }else{
|
|
|
+ return -1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ public static String formatHtml(String htmlStr)throws Exception{
|
|
|
+ if(StringUtils.isEmpty(htmlStr)){
|
|
|
+ return "";
|
|
|
+ }
|
|
|
+ htmlStr = repairHtmlStr(htmlStr);
|
|
|
+ htmlStr = StringEscapeUtils.unescapeHtml4(htmlStr);
|
|
|
+ return htmlStr;
|
|
|
+ }
|
|
|
|
|
|
|
|
|
public static void main(String[] args) {
|