word文档处理成富文本生成sql语句导入mysql

时间 2019-11-12

标签 word 文档处理文本生成 sql 语句导入 mysql 栏目 Microsoft Office 繁體版

原文原文链接

问题：需要将大量已存在的word文档导入到web项目里在网站展现，不可能通过人工编辑录入的方式处理，只能通过程序实现。
解决思路：通过读取word文档处理成html，再获取html富文本内容，拼接成sql，导入数据库。

要点
一、读取word文件夹时会递归读取，只要文件夹下有word文档即可，程序中有过滤word文档的代码，可根据需要修改；
二、可同时处理word2003和word2007+版本的word文档；
三、读取word2007生成的html文档内中文是unicode编码的，放到数据库或浏览器直接打开，不影响页面显示；
四、对word文档中图片做了处理，存储到单独的文件夹，导入mysql或其它数据库后，要正确显示图片，需注意路径处理；
五、只处理word2007文档时可不生成html直接获取富文本内容，但2003版本不可以，所以统一将doc文档都生成html页面，再用java读取html文档获取body元素下的富文本内容。

不足之处，欢迎交流和指正。

可能用到的pom如下：

<!-- Apache POI: Word document processing -->
		<!-- Core POI artifact. -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi</artifactId>
			<version>3.15</version>
		</dependency>
		<!-- Presumably supplies the org.apache.poi.hwpf classes (HWPFDocument, WordToHtmlConverter)
		     imported by the Java class in this article; verify against the POI documentation. -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-scratchpad</artifactId>
			<version>3.15</version>
		</dependency>
		<!-- NOTE(review): version 2.0.1 here differs from the 1.0.6 used by the converter artifacts
		     below; confirm that this combination is intended. -->
		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>fr.opensagres.xdocreport.document</artifactId>
			<version>2.0.1</version>
		</dependency>
		<!-- Matches the org.apache.poi.xwpf.converter.xhtml imports (XHTMLConverter, XHTMLOptions). -->
		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
			<version>1.0.6</version>
		</dependency>

		<!-- Matches the org.apache.poi.xwpf.converter.core imports (FileImageExtractor, FileURIResolver). -->
		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>org.apache.poi.xwpf.converter.core</artifactId>
			<version>1.0.6</version>
		</dependency>

		<!-- NOTE(review): the Java class in this article imports nothing from a PDF converter package;
		     this dependency may be unnecessary. The class does import
		     org.apache.commons.lang.StringUtils and org.junit.Test, which are not listed here. -->
		<dependency>
			<groupId>fr.opensagres.xdocreport</groupId>
			<artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
			<version>1.0.6</version>
		</dependency>

完整的java代码如下：

package test;

import org.apache.commons.lang.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

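/**
 * Converts Word documents (.doc / .docx) to HTML and builds SQL INSERT statements
 * from the extracted rich-text content.
 *
 * @author ls, 2019/3/19 11:21
 */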
public class Word2Html {


    /**
     * Entry point: converts every Word document found under the hard-coded source
     * folder to HTML and emits one SQL INSERT statement per converted document.
     *
     * <p>Steps: collect all paths under the source folder, reduce them to a
     * file-name to full-path map, convert each document with the converter that
     * matches its extension, then hand the non-blank results to
     * {@link #handleInsertSql(Map)}.
     *
     * <p>A document whose conversion throws one of the checked exceptions is
     * reported on stderr and skipped; the remaining documents are still processed.
     */
    @Test
    public void handleWordToSql() {
        String path = "F:\\word文档\\";
        String imagePath = "F:\\images\\";
        String htmlPath = "F:\\html\\";

        List<String> fileNames = new ArrayList<>();
        getAllFileName(path, fileNames);
        Map<String, String> map = handleFileName(fileNames);

        // Both converters write one HTML file per document into this folder, so make
        // sure it exists. Best effort: a failure surfaces when the file is written.
        new File(htmlPath).mkdirs();

        Map<String, String> contentsMap = new HashMap<>();
        for (Map.Entry<String, String> entry : map.entrySet()) {
            String k = entry.getKey();
            String v = entry.getValue();

            int dot = k.lastIndexOf('.');
            if (dot < 0) {
                // No extension at all: cannot be a Word document.
                System.out.println("word文档格式不正确->" + k);
                continue;
            }
            String articleName = k.substring(0, dot);
            String htmlFile = htmlPath + articleName + ".html";

            // Decide by the real extension, not by substring: "a.docx.doc" is a .doc
            // file, and extensions are matched case-insensitively.
            String lowerName = k.toLowerCase(Locale.ROOT);
            String content = "";
            try {
                if (lowerName.endsWith(".docx")) {
                    content = word2007ToHtml(imagePath + generateImageName() + "\\", v, htmlFile);
                } else if (lowerName.endsWith(".doc")) {
                    content = word2003ToHtml(imagePath + generateImageName() + "\\", v, htmlFile);
                } else {
                    System.out.println("word文档格式不正确->" + k);
                }
            } catch (TransformerException | IOException | ParserConfigurationException e) {
                e.printStackTrace();
            }

            if (StringUtils.isNotBlank(content)) {
                contentsMap.put(articleName, content);
            }
        }

        System.out.println("文章总条数: " + contentsMap.size());
        handleInsertSql(contentsMap);
    }

    /**
     * Builds one {@code INSERT INTO `article`} statement per entry, writes all
     * statements to the SQL output file via {@link #writeFile(StringBuffer)} and
     * returns them.
     *
     * <p>Row ids start at 161 and increase by one per entry, in the iteration order
     * of {@code contentsMap}. Before a value is embedded, the local image folder
     * prefix {@code F:\images\} is rewritten to the web path {@code /images/}.
     *
     * <p>Title and content are emitted as single-quoted string literals with
     * backslashes and single quotes escaped, so the stored HTML is exactly the
     * converted HTML. This assumes the MySQL server uses backslash escapes (i.e.
     * {@code NO_BACKSLASH_ESCAPES} is not set).
     *
     * @param contentsMap article title mapped to its HTML content
     * @return the generated SQL, one statement per line (CRLF terminated)
     */
    public String handleInsertSql(Map<String, String> contentsMap) {
        StringBuilder sb = new StringBuilder();
        int id = 161;
        for (Map.Entry<String, String> entry : contentsMap.entrySet()) {
            String title = escapeSqlLiteral(toWebImagePath(entry.getKey()));
            String content = escapeSqlLiteral(toWebImagePath(entry.getValue()));
            sb.append("INSERT INTO `article` VALUES (").append(id)
                    .append(", '2019-03-19 14:24:56', '2019-03-19 10:53:56', '0', '网络文章', '")
                    .append(content)
                    .append("', '0', '', '', null, null, null, '")
                    .append(title)
                    .append("', '1', null, '0', null)");
            sb.append(" ;");
            sb.append("\r\n");
            id++;
        }

        String data = sb.toString();
        writeFile(new StringBuffer(data));
        return data;
    }

    /** Rewrites the local image folder prefix to the path the web site serves images from. */
    private static String toWebImagePath(String value) {
        return value == null ? "" : value.replace("F:\\images\\", "/images/");
    }

    /** Escapes a value for use inside a single-quoted MySQL string literal. */
    private static String escapeSqlLiteral(String value) {
        // Backslashes first, otherwise the backslash added for quotes would be doubled.
        return value.replace("\\", "\\\\").replace("'", "\\'");
    }

    /**
     * Reduces a list of paths to the Word documents it contains.
     *
     * <p>A path counts as a Word document when its last segment (the text after
     * the final {@code \} or {@code /}) ends with {@code .doc} or {@code .docx},
     * compared case-insensitively. Folder names that merely contain ".doc" no
     * longer qualify. All other entries are removed from {@code list} in place,
     * as before.
     *
     * <p>If two documents in different folders share a file name, the one that
     * appears later in {@code list} replaces the earlier one in the result.
     *
     * @param list paths to inspect; modified in place; may be {@code null}
     * @return map from file name (with extension) to the full path; never {@code null}
     */
    public Map<String, String> handleFileName(List<String> list) {
        Map<String, String> map = new HashMap<>();
        if (list == null || list.isEmpty()) {
            System.out.println("没有须要处理的文件！");
            return map;
        }

        // Filter out everything that is not a Word document path.
        list.removeIf(path -> !isWordDocument(fileNameOf(path)));

        for (String path : list) {
            // file name -> full path
            map.put(fileNameOf(path), path);
        }
        return map;
    }

    /** Returns the last segment of {@code path}, or an empty string for {@code null}. */
    private static String fileNameOf(String path) {
        if (path == null) {
            return "";
        }
        int lastSeparator = Math.max(path.lastIndexOf('\\'), path.lastIndexOf('/'));
        return path.substring(lastSeparator + 1);
    }

    /** Tells whether {@code fileName} carries a .doc or .docx extension (any case). */
    private static boolean isWordDocument(String fileName) {
        String lower = fileName.toLowerCase(Locale.ROOT);
        return lower.endsWith(".doc") || lower.endsWith(".docx");
    }


    /**
     * Recursively collects the paths of all files and folders below {@code path}.
     *
     * <p>The entries of a folder are appended first (files and sub-folders alike),
     * then each sub-folder is visited. Entries of the starting folder are built
     * from {@code path} as given; entries of sub-folders use absolute paths.
     *
     * <p>If {@code path} does not exist, is not a directory, or cannot be listed,
     * nothing is added.
     *
     * @param path         folder to scan; a trailing separator is optional
     * @param listFileName list that receives the collected paths
     */
    public void getAllFileName(String path, List<String> listFileName) {
        File[] children = new File(path).listFiles();
        if (children == null) {
            // listFiles() returns null for a missing, non-directory or unreadable path.
            return;
        }

        boolean hasTrailingSeparator = path.endsWith(File.separator) || path.endsWith("/");
        String prefix = hasTrailingSeparator ? path : path + File.separator;
        for (File child : children) {
            listFileName.add(prefix + child.getName());
        }

        for (File child : children) {
            if (child.isDirectory()) {
                // Descend into sub-folders to collect the full paths of their entries.
                getAllFileName(child.getAbsolutePath() + File.separator, listFileName);
            }
        }
    }

    /**
     * Converts a Word 97-2003 ({@code .doc}) document to HTML and returns the
     * content of the HTML body.
     *
     * <p>Embedded pictures are written to {@code imgPath}, which is created when
     * needed. The full HTML page is serialized as UTF-8, written to
     * {@code outPutFile}, read back through {@link #readHtml(File)}, and stripped
     * of tabs and newlines by {@link #replaceBreak(String)}.
     *
     * @param imgPath    folder that receives the document's pictures
     * @param fileName   full path of the {@code .doc} file
     * @param outPutFile full path of the HTML file to write
     * @return the body content on a single line; empty if the HTML could not be read back
     * @throws TransformerException         if the HTML DOM cannot be serialized
     * @throws IOException                  if the document or a picture cannot be read or written
     * @throws ParserConfigurationException if no DOM builder is available
     */
    public String word2003ToHtml(String imgPath, String fileName, String outPutFile)
            throws TransformerException, IOException,
            ParserConfigurationException {
        HWPFDocument wordDocument;
        try (InputStream in = new FileInputStream(fileName)) {
            wordDocument = new HWPFDocument(in);
        }

        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument());
        // Reference each picture in the HTML by its suggested file name.
        wordToHtmlConverter.setPicturesManager((content, pictureType, suggestedName, widthInches, heightInches) -> suggestedName);
        wordToHtmlConverter.processDocument(wordDocument);

        savePictures(wordDocument, imgPath);

        Document htmlDocument = wordToHtmlConverter.getDocument();
        ByteArrayOutputStream out = new ByteArrayOutputStream();

        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

        // The serializer produced UTF-8 bytes, so decode them as UTF-8 rather than
        // with the platform default charset.
        writeFile(out.toString("UTF-8"), outPutFile);
        String body = readHtml(new File(outPutFile));
        return replaceBreak(body);
    }

    /** Writes every picture of {@code wordDocument} into {@code imgPath}, creating the folder if needed. */
    private void savePictures(HWPFDocument wordDocument, String imgPath) throws IOException {
        List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
        if (pics == null || pics.isEmpty()) {
            return;
        }

        File imageDir = new File(imgPath);
        // Without the folder every picture write would fail; if creation fails the
        // per-picture handling below reports it.
        imageDir.mkdirs();

        for (Picture pic : pics) {
            File target = new File(imageDir, pic.suggestFullFileName());
            try (OutputStream imageOut = new FileOutputStream(target)) {
                pic.writeImageContent(imageOut);
            } catch (FileNotFoundException e) {
                // Keep going: one unwritable picture should not lose the document.
                e.printStackTrace();
            }
        }
    }

    /**
     * Converts a Word 2007+ ({@code .docx}) document to an XHTML fragment.
     *
     * <p>The fragment is written to {@code htmlName} and also returned. Pictures
     * are extracted into {@code imgPath} by the converter's image extractor, and
     * image URIs are resolved against that folder.
     *
     * @param imgPath  folder that receives the document's pictures
     * @param fileName full path of the {@code .docx} file
     * @param htmlName full path of the HTML file to write
     * @return the XHTML fragment; empty if the file is missing, is not a
     *         {@code .docx} file, is empty, or converts to nothing
     * @throws IOException if the document cannot be read or the HTML file cannot be written
     */
    public String word2007ToHtml(String imgPath, String fileName, String htmlName) throws IOException {
        File f = new File(fileName);
        if (!f.exists()) {
            System.out.println("Sorry File does not Exists!");
            return "";
        }
        if (!f.getName().toLowerCase(Locale.ROOT).endsWith(".docx")) {
            System.out.println("Enter only MS Office 2007+ files");
            return "";
        }
        if (f.length() == 0) {
            // An empty file is not a valid document; skip it quietly.
            return "";
        }

        // 1) Load the Word document.
        XWPFDocument document;
        try (InputStream in = new FileInputStream(f)) {
            document = new XWPFDocument(in);
        }

        // 2) Configure the conversion; the URI resolver and the extractor both
        //    point at the folder the pictures are stored in.
        File imageFolderFile = new File(imgPath);
        XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
        options.setExtractor(new FileImageExtractor(imageFolderFile));
        options.setIgnoreStylesIfUnused(false);
        options.setFragment(true);

        // 3) Convert once into memory, then reuse the bytes for both the HTML file
        //    and the return value.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        XHTMLConverter.getInstance().convert(document, baos, options);

        try (OutputStream out = new FileOutputStream(new File(htmlName))) {
            baos.writeTo(out);
        }

        if (baos.size() == 0) {
            return "";
        }
        // NOTE(review): decoded with the platform default charset, as before; the
        // converter's output encoding is not visible here. Confirm before changing.
        return baos.toString();
    }

    /**
     * Reads an HTML file and returns the lines between the line containing
     * {@code <body} and the line containing {@code </body}, each terminated by
     * {@code \n}. Both marker lines are excluded entirely.
     *
     * <p>In every returned line that contains "src", each {@code <img} tag whose
     * first attribute is {@code src} gets the image directory {@code D:/test}
     * inserted right after {@code src="}, and the whitespace after {@code <img}
     * is normalized to one space.
     *
     * <p>The file is decoded as UTF-8, matching what
     * {@link #writeFile(String, String)} writes.
     *
     * @param file HTML file to read
     * @return the body lines; whatever was read so far (possibly empty) if the
     *         file is {@code null} or an I/O error occurs
     */
    public String readHtml(File file) {
        StringBuilder body = new StringBuilder();
        if (file == null) {
            return "";
        }

        // Access path under which the images are referenced.
        String directory = "D:/test";

        try (BufferedReader htmlReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
            String line;

            // Skip everything up to and including the line that opens the body
            // (the tag may be preceded by whitespace or carry attributes).
            while ((line = htmlReader.readLine()) != null) {
                if (line.toLowerCase(Locale.ROOT).contains("<body")) {
                    break;
                }
            }

            while ((line = htmlReader.readLine()) != null) {
                String lowerCaseLine = line.toLowerCase(Locale.ROOT);
                if (lowerCaseLine.contains("</body")) {
                    break;
                }
                if (lowerCaseLine.contains("src")) {
                    line = prefixImageSources(line, directory);
                }
                body.append(line).append("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return body.toString();
    }

    /** Inserts {@code directory} after {@code src="} in every img tag of {@code line} that starts with src. */
    private static String prefixImageSources(String line, String directory) {
        // A line may hold several <img> elements; handle each one separately.
        String[] parts = line.split("<img\\s+");
        int prefixLength = "src=\"".length();
        for (int i = 0; i < parts.length; i++) {
            boolean startsWithSrc = parts[i].toLowerCase(Locale.ROOT).startsWith("src");
            // Fragments too short to hold src=" are left untouched instead of failing.
            if (startsWithSrc && parts[i].length() >= prefixLength) {
                parts[i] = parts[i].substring(0, prefixLength)
                        + directory
                        + parts[i].substring(prefixLength);
            }
        }
        // Re-insert the tag opener that split() consumed between the fragments.
        return String.join("<img ", parts);
    }

    /**
     * Strips every tab and every line-feed character from the given text.
     *
     * @param str text to clean; may be {@code null}
     * @return the text without {@code \t} and {@code \n}; an empty string for {@code null}
     */
    public String replaceBreak(String str) {
        if (str == null) {
            return "";
        }
        String withoutTabs = str.replace("\t", "");
        return withoutTabs.replace("\n", "");
    }

    /**
     * Writes {@code content} to the file at {@code path} as UTF-8, replacing any
     * existing file. Missing parent folders are created first.
     *
     * <p>I/O failures are printed to stderr and otherwise ignored, so callers are
     * not interrupted by a single unwritable file.
     *
     * @param content text to write
     * @param path    full path of the target file
     */
    public static void writeFile(String content, String path) {
        File file = new File(path);
        File parent = file.getParentFile();
        if (parent != null) {
            // Best effort: if the folder cannot be created, opening the file below
            // fails and is reported.
            parent.mkdirs();
        }

        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file), "utf-8"))) {
            bw.write(content);
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
    }

    /**
     * Writes the generated SQL to the fixed output file {@code F:\articleSql.sql}
     * as UTF-8, replacing any existing file.
     *
     * <p>The completion message is printed only when the write succeeded; an I/O
     * failure is printed to stderr instead.
     *
     * @param sb the SQL text to write
     */
    public static void writeFile(StringBuffer sb) {
        String sqlTxtFile = "F:\\articleSql.sql";
        // Explicit UTF-8 keeps the output independent of the platform default
        // charset and consistent with the other writeFile overload.
        try (BufferedWriter out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(sqlTxtFile), "utf-8"))) {
            out.write(sb.toString());
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        System.out.println("sql文件写入完成");
    }

    /**
     * Produces a random six-digit number as text.
     *
     * @return a decimal string between 100000 and 999999 inclusive
     */
    public static String randomCode(){
        // Math.random() is in [0, 1), so the scaled value is in [100000, 1000000).
        double scaled = (Math.random() * 9 + 1) * 100000;
        return Integer.toString((int) scaled);
    }

    /**
     * Builds a name of the form {@code <current time in millis>_<six-digit random number>}.
     *
     * @return the generated name
     */
    public static String generateImageName(){
        StringBuilder name = new StringBuilder();
        name.append(System.currentTimeMillis());
        name.append("_");
        name.append(randomCode());
        return name.toString();
    }

}

运行handleWordToSql测试方法即可！

效果图：
原word文档
apache

处理后的html
数组

对应的html图片文件夹
浏览器

生成的sql
缓存