POI读取第三方下载的Word文档

从第三方下载的 Word 文档很多是由其他格式（例如 HTML）转换而来的，只是扩展名为 .doc/.docx，此时直接用 POI 读取可能会失败。这里以 HTML 为例。

依赖

<!-- parse word -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.0.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.0.1</version>
        </dependency>

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.3</version>
        </dependency>

代码片断

/**
 * Opens the Word file at {@code path} with POI and, when POI rejects it because
 * the file is actually HTML, falls back to parsing it as HTML with jsoup.
 *
 * <p>The actual extraction of .doc/.docx content is left as a placeholder; in the
 * HTML fallback the non-blank text of every {@code table > tbody > td > span}
 * is printed to standard output.
 *
 * @param path path of the file to parse; must not be null or blank
 * @return always {@code null} in this snippet (no content is collected yet)
 * @throws ParseWordException if the path is blank, the file cannot be opened,
 *                            the extension is neither DOC nor DOCX, or parsing
 *                            or HTML conversion fails
 */
private String parseWord(String path) throws ParseWordException {
        // Reject a null or blank path up front.
        if (isEmpty(path)) {
            throw new ParseWordException(Code.PARAM_EMPTY.getCode(), Code.PARAM_EMPTY.getMessage());
        }

        // Open the file; a missing or unreadable file is reported with its own error code.
        File file = new File(path);
        FileInputStream fis;
        try {
            fis = new FileInputStream(file);
        } catch (FileNotFoundException e) {
            throw new ParseWordException(Code.READER_FILE_FAILURE.getCode(), Code.READER_FILE_FAILURE.getMessage());
        }

        String upperPath = path.toUpperCase();

        // try-with-resources closes the stream before any catch clause runs, so the
        // HTML fallback below can re-open the same file without a dangling handle.
        try (FileInputStream in = fis) {
            if (upperPath.endsWith(FileType.DOC.toString())) {
                try (HWPFDocument wordDoc = new HWPFDocument(in)) {
                    // Placeholder: read the binary .doc content yourself here.
                }
            } else if (upperPath.endsWith(FileType.DOCX.toString())) {
                try (XWPFDocument wordDocx = new XWPFDocument(in)) {
                    // Placeholder: read the .docx content yourself here.
                }
            } else {
                // Unsupported file extension.
                throw new ParseWordException(Code.FILE_TYPE_ILLEGAL.getCode(), Code.FILE_TYPE_ILLEGAL.getMessage());
            }
        } catch (IllegalArgumentException ie) {
            String message = ie.getMessage();
            System.out.println(message);
            if (isEmpty(message)) {
                throw new ParseWordException(Code.PARAM_EMPTY.getCode(), Code.PARAM_EMPTY.getMessage());
            }
            if (!message.contains("The document is really a HTML file")) {
                // Any other rejection is a genuine parse failure; do not let it pass
                // silently and look like a successful (null) result.
                throw new ParseWordException(Code.PARSE_FAILURE.getCode(), Code.PARSE_FAILURE.getMessage());
            }

            // The "Word" file is really HTML: copy it to an .html file and parse that.
            try {
                String htmlPath = parseHtml(file);
                // Charset is fixed to GBK here; adjust it to match your documents.
                Document doc = Jsoup.parse(new File(htmlPath), "GBK");
                // Select every tbody inside a table; adapt the selector to your document layout.
                Elements tableBodies = doc.select("table").select("tbody");
                tableBodies.forEach(tbody -> {
                    // Print the non-blank text of every span inside a td; cells may also
                    // contain images, which are not handled here.
                    tbody.select("td").select("span").eachText().stream()
                            .filter(text -> text != null && text.trim().length() > 0)
                            .forEach(System.out::println);
                });
            } catch (IOException e) {
                throw new ParseWordException(Code.FILE_CONVERT_FAILURE.getCode(), Code.FILE_CONVERT_FAILURE.getMessage());
            }
        } catch (IOException e) {
            throw new ParseWordException(Code.PARSE_FAILURE.getCode(), Code.PARSE_FAILURE.getMessage());
        }
        return null;
    }

    /**
     * Copies {@code readerFile} byte-for-byte to a fixed temporary {@code .html}
     * file so that it can be parsed as HTML, and returns that file's path.
     *
     * @param readerFile the source file to copy
     * @return the path of the temporary HTML copy
     * @throws IOException if the source cannot be read or the copy cannot be written
     */
    private String parseHtml(File readerFile) throws IOException {
        String tempPath = "d:\\1.html"; // placeholder temp file location; change it to suit your environment

        File outFile = new File(tempPath);
        if (outFile.exists()) {
            outFile.delete(); // remove a copy left over from a previous run
        }

        // Both streams are closed even if the copy fails part-way; leaving the output
        // open would also keep the temp file locked for the next call's delete().
        try (FileInputStream source = new FileInputStream(readerFile);
             FileOutputStream target = new FileOutputStream(outFile)) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = source.read(buffer)) != -1) {
                target.write(buffer, 0, len);
            }
        }

        return tempPath;
    }

    /**
     * Demo entry point: runs {@code parseWord} on a sample document path.
     *
     * @param args unused
     * @throws IOException        declared by the original signature
     * @throws ParseWordException if the sample document cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseWordException {
        new ParseWorld().parseWord("D:\\aaa.doc");
    }

    /**
     * Tells whether a string carries no usable content.
     *
     * @param str the string to check; may be {@code null}
     * @return {@code true} if {@code str} is {@code null} or is empty after trimming
     */
    private boolean isEmpty(String str) {
        if (str == null) {
            return true;
        }
        return str.trim().isEmpty();
    }