问题:需要将大量已存在的word文档导入到web项目里并在网站上展现,不可能通过人工编辑录入的方式处理,因此通过程序实现。
解决思路:通过程序读取word文档并处理成html,再获取html中的富文本内容,拼接成sql,导入数据库。
要点
一、读取word文件夹时会递归读取,只要文件夹下有word文档即可,程序中有过滤word文档的代码,可根据需要修改;
二、可同时处理word2003和word2007+版本的word文档;
三、读取word2007生成的html文档内中文是unicode编码的,放到数据库或浏览器直接打开,不影响页面显示;
四、对word文档中的图片做了处理,存储到单独的文件夹,导入mysql或其它数据库后,可正确显示图片,要注意路径处理;
五、只处理word2007文档时可不生成html而直接获取富文本内容,但2003版本不可以,所以统一将doc文档都生成html页面,再用java读取html文档,获取body元素下的富文本内容。
不足之处,欢迎交流和指正。
可能用到的pom依赖如下:
<!--poi处理word文档--> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.15</version> </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.15</version> </dependency> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>fr.opensagres.xdocreport.document</artifactId> <version>2.0.1</version> </dependency> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId> <version>1.0.6</version> </dependency> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>org.apache.poi.xwpf.converter.core</artifactId> <version>1.0.6</version> </dependency> <dependency> <groupId>fr.opensagres.xdocreport</groupId> <artifactId>org.apache.poi.xwpf.converter.pdf</artifactId> <version>1.0.6</version> </dependency>
完整的java代码如下:
package test; import org.apache.commons.lang.StringUtils; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.xwpf.converter.core.FileImageExtractor; import org.apache.poi.xwpf.converter.core.FileURIResolver; import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter; import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.junit.Test; import org.w3c.dom.Document; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import java.io.*; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; /* * @Desc word转化为html * @Author ls 2019/3/19 0019 11:21 */ public class Word2Html { @Test public void handleWordToSql() { String path = "F:\\word文档\\"; List<String> fileNames = new ArrayList<>(); Map<String, String> contentsMap = new HashMap<>(); getAllFileName(path, fileNames); // fileNames.forEach(System.out::println); // System.out.println(fileNames.size()); Map<String, String> map = handleFileName(fileNames); String imagePath = "F:\\images\\"; String htmlPath = "F:\\html\\"; map.forEach((k, v) -> { String content = ""; String articleName = k.substring(0, k.lastIndexOf(".")); String htmlName = articleName + ".html"; if (k.contains(".doc") && !k.contains(".docx")) { try { content = word2003ToHtml(imagePath + generateImageName() + "\\", v, htmlPath + htmlName); } catch (TransformerException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (ParserConfigurationException e) { e.printStackTrace(); } } else if (k.contains(".docx")) { try { 
content = word2007ToHtml(imagePath + generateImageName() + "\\", v, htmlPath + htmlName); } catch (IOException e) { e.printStackTrace(); } } else { System.out.println("word文档格式不正确->" + k); } if (StringUtils.isNotBlank(content)) { contentsMap.put(articleName, content); } }); System.out.println("文章总条数: " + contentsMap.size()); // contents.forEach(System.out::println); handleInsertSql(contentsMap); } /** * 处理sql * * @param contentsMap * @return */ public String handleInsertSql(Map<String, String> contentsMap) { StringBuffer sb = new StringBuffer(); int id = 161; for (String k : contentsMap.keySet()) { sb.append("INSERT INTO `article` VALUES (" + id + ", '2019-03-19 14:24:56', '2019-03-19 10:53:56', '0', '网络文章', \"" + contentsMap.get(k).replace("\"", "\\\'") + "\", '0', '', '', null, null, null, '" + k + "', '1', null, '0', null)"); sb.append(" ;"); sb.append("\r\n"); id++; } String data = sb.toString(); data = data.replace("F:\\images\\", "/images/"); writeFile(new StringBuffer(data)); // System.out.println(data); return data; } /** * 路径处理为map key-> word文件名 value-> 全路径 * * @param list * @return */ public Map<String, String> handleFileName(List<String> list) { if (list.size() == 0) { System.out.println("没有须要处理的文件!"); } // 过滤非word文档路径 for (int i = 0; i < list.size(); i++) { String str = list.get(i); if (str.contains(".doc") || str.contains(".docx")) { } else { list.remove(str); i--; } } Map<String, String> map = new HashMap<>(); for (String path : list) { if (StringUtils.isNotBlank(path)) { String[] arr = path.split("\\\\"); for (String value : arr) { if (value.contains(".doc") || value.contains(".docx")) { // 文件名 全路径 map.put(value, path); } } } } return map; } /** * 获取全部文件夹及文件 * * @param path * @param listFileName */ public void getAllFileName(String path, List<String> listFileName) { File file = new File(path); File[] files = file.listFiles(); String[] names = file.list(); if (names != null) { String[] completNames = new String[names.length]; for (int i = 0; i < 
names.length; i++) { // if(path.contains(".doc") || path.contains(".docx")) completNames[i] = path + names[i]; } listFileName.addAll(Arrays.asList(completNames)); } for (File a : files) { if (a.isDirectory()) { //若是文件夹下有子文件夹,获取子文件夹下的全部文件全路径。 getAllFileName(a.getAbsolutePath() + "\\", listFileName); } } } /** * word2003转换 * * @param imgPath * @param fileName * @param outPutFile */ public String word2003ToHtml(String imgPath, String fileName, String outPutFile) throws TransformerException, IOException, ParserConfigurationException { HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(fileName)); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( DocumentBuilderFactory.newInstance().newDocumentBuilder() .newDocument()); wordToHtmlConverter.setPicturesManager((content, pictureType, suggestedName, widthInches, heightInches) -> suggestedName); wordToHtmlConverter.processDocument(wordDocument); //save pictures List pics = wordDocument.getPicturesTable().getAllPictures(); if (pics != null) { for (int i = 0; i < pics.size(); i++) { Picture pic = (Picture) pics.get(i); System.out.println(); try { pic.writeImageContent(new FileOutputStream(imgPath + pic.suggestFullFileName())); } catch (FileNotFoundException e) { e.printStackTrace(); } } } Document htmlDocument = wordToHtmlConverter.getDocument(); ByteArrayOutputStream out = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(out); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); out.close(); writeFile(new String(out.toByteArray()), outPutFile); String body = readHtml(new File((outPutFile))); // System.out.println(out.toString()); body = 
replaceBreak(body); // System.out.println(body); return body; } /** * word2007转html * * @throws IOException */ public String word2007ToHtml(String imgPath, String fileName, String htmlName) throws IOException { File f = new File(fileName); String content = ""; if (!f.exists()) { System.out.println("Sorry File does not Exists!"); } else { if (f.getName().endsWith(".docx") || f.getName().endsWith(".docx")) { // 1) 加载word文档生成 XWPFDocument对象 InputStream in = new FileInputStream(f); if (in.available() == 0) { return content; } XWPFDocument document = new XWPFDocument(in); // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录) File imageFolderFile = new File(imgPath); XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile)); options.setExtractor(new FileImageExtractor(imageFolderFile)); options.setIgnoreStylesIfUnused(false); options.setFragment(true); // 3) 将 XWPFDocument转换成XHTML OutputStream out = new FileOutputStream(new File(htmlName)); XHTMLConverter.getInstance().convert(document, out, options); //也可使用字符数组流获取解析的内容 ByteArrayOutputStream baos = new ByteArrayOutputStream(); XHTMLConverter.getInstance().convert(document, baos, options); if (baos.size() == 0) { return content; } content = baos.toString(); // 输出转化后的文本 // System.out.println(content); baos.close(); } else { System.out.println("Enter only MS Office 2007+ files"); } } return content; } /** * 解析html文件 得到body内容 * * @param file * @return */ public String readHtml(File file) { String body = ""; try { FileInputStream iStream = new FileInputStream(file); Reader reader = new InputStreamReader(iStream); BufferedReader htmlReader = new BufferedReader(reader); String line; boolean found = false; while (!found && (line = htmlReader.readLine()) != null) { if (line.toLowerCase().indexOf("<body") != -1) { // 在<body>的前面可能存在空格 found = true; } } found = false; while (!found && (line = htmlReader.readLine()) != null) { if (line.toLowerCase().indexOf("</body") != -1) { found = true; } else { // 
若是存在图片,则将相对路径转换为绝对路径 String lowerCaseLine = line.toLowerCase(); if (lowerCaseLine.contains("src")) { //这里是定义图片的访问路径 String directory = "D:/test"; // 若是该行存在多个<img>元素,则分行进行替代 String[] splitLines = line.split("<img\\s+"); // <img后带一个或多个空格 // 由于java中引用的问题不能使用for each for (int i = 0; i < splitLines.length; i++) { if (splitLines[i].toLowerCase().startsWith("src")) { splitLines[i] = splitLines[i].substring(0, splitLines[i].toLowerCase().indexOf("src") + 5) + directory + splitLines[i].substring(splitLines[i].toLowerCase().indexOf("src") + 5); } } // 最后进行拼接 line = ""; for (int i = 0; i < splitLines.length - 1; i++) { // 循环次数要-1,由于最后一个字符串后不须要添加<img line = line + splitLines[i] + "<img "; } line = line + splitLines[splitLines.length - 1]; } body = body + line + "\n"; } } htmlReader.close(); // System.out.println(body); } catch (Exception e) { e.printStackTrace(); } return body; } /** * 去掉换行 * * @param str * @return */ public String replaceBreak(String str) { String dest = ""; if (str != null) { Pattern p = Pattern.compile("\\t|\n"); Matcher m = p.matcher(str); dest = m.replaceAll(""); } return dest; } /** * sql写入文件 * @param content * @param path */ public static void writeFile(String content, String path) { FileOutputStream fos = null; BufferedWriter bw = null; try { File file = new File(path); fos = new FileOutputStream(file); bw = new BufferedWriter(new OutputStreamWriter(fos, "utf-8")); bw.write(content); } catch (FileNotFoundException fnfe) { fnfe.printStackTrace(); } catch (IOException ioe) { ioe.printStackTrace(); } finally { try { if (bw != null) bw.close(); if (fos != null) fos.close(); } catch (IOException ie) { } } } /** * sql写入文件 */ public static void writeFile(StringBuffer sb) { String sqlTxtFile = "F:\\articleSql.sql"; try { File writeName = new File(sqlTxtFile); // 相对路径,若是没有则要创建一个新的output.txt文件 writeName.createNewFile(); // 建立新文件,有同名的文件的话直接覆盖 try (FileWriter writer = new FileWriter(writeName); BufferedWriter out = new BufferedWriter(writer) ) { 
out.write(sb.toString()); // \r\n即为换行 out.flush(); // 把缓存区内容压入文件 } } catch (IOException e) { e.printStackTrace(); } System.out.println("sql文件写入完成"); } // 生成6位随机数 public static String randomCode(){ int num = (int)((Math.random()*9+1)*100000); return String.valueOf(num); } /** * 生成随机字符串 时间戳_6位随机数 * @return */ public static String generateImageName(){ String name = String.valueOf(System.currentTimeMillis()) + "_" + randomCode(); return name; } }
运行handleWordToSql测试方法即可!
效果图:
原word文档
apache
处理后的html
数组
对应的html图片文件夹
浏览器
生成的sql
缓存