JAVA--Reptile

简单小爬虫html

项目中技术:

  • Jsoup (解析网页:请求网址返回网页信息,Jsoup获取对应节点的数据)
  • Lucene(搜索引擎:对抓取的数据进行搜索,相比较而言,比数据库查询要快的多!)
  • 前端样式用的是Bootstrap
  • 前端使用Ajax请求数据,后台使用Servlet处理请求,前后台传递数据格式为Json
  • 数据库连接使用JDBC
  • 存储的数据库是:SqlServer

项目的环境:

  • 运行环境:apache-tomcat-7.0.94
  • 开发环境:JDK8.0
  • 开发工具:Eclipse

效果展现:

       

       

       

 

代码展现:

//请求连接,获取网页源码
/**
 * Performs an HTTP GET against {@code url} and returns the page source.
 *
 * The target site serves GBK-encoded HTML, so the response stream is decoded
 * as GBK and each line is transcoded via the project helper
 * {@code getUTF8StringFromGBKString} before being appended to the result.
 *
 * @param url the absolute URL to fetch
 * @return the concatenated page source (line breaks are dropped), or an
 *         empty string if the request fails
 */
public String sendGet(String url) {
        StringBuilder sb = new StringBuilder();
        BufferedReader in = null;
        try {
        System.out.println(url);
        URL realUrl = new URL(url);
        // Open the connection to the URL.
        URLConnection connection = realUrl.openConnection();
        // Generic request headers mimicking a desktop Chrome browser so the
        // site does not reject the crawler.
        connection.setRequestProperty("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        connection.setRequestProperty("Content-type", "text/html;charset=gbk");
        connection.setRequestProperty("upgrade-insecure-requests", "1");
        connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.9");
        connection.setRequestProperty("user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36");

        connection.connect();

        // The response body is GBK-encoded; decode it explicitly.
        in = new BufferedReader(new InputStreamReader(
        connection.getInputStream(), "GBK"));
        String line;
        while ((line = in.readLine()) != null) {
            // Transcode each line from GBK to UTF-8 (project helper).
            sb.append(getUTF8StringFromGBKString(line));
        }

        } catch (Exception e) {
        System.out.println("发送GET请求出现异常!" + e);
        e.printStackTrace();
        }
        // Always close the input stream, even on failure.
        finally {
        try {
        if (in != null) {
        in.close();
        }
        } catch (Exception e2) {
        e2.printStackTrace();
        }
        }

           return sb.toString();
    }

 

  1 //对抓取的数据存入磁盘中,进行全文搜索
  2 public class LuceneManager {
  3     
  4        public static List<shop> Select(String name,int tid,int uid) throws IOException {
  5 
  6             IndexSearcher indexSearcher =new IndexSearcher(LuceneUtils.getDirectory());
  7 
  8             //建立一个布尔查询对象
  9 
 10             BooleanQuery query = new BooleanQuery();
 11             
 12             //建立第一个查询条件
 13 if (name!=null&&name.length()!=0) {
 14     int maxEdits = 100; //相同的前缀长度
 15 //  Query query = new FuzzyQuery(term,maxEdits,prefixLength);
 16     char[] tc=name.toCharArray();
 17     for (int i = 0; i < tc.length; i++) {
 18         Term term = new Term("shopName","*"+tc[i]+"*");
 19         Query shopnameQuery=new WildcardQuery(term);
 20         query.add(shopnameQuery, Occur.MUST);
 21     }
 22 //    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
 23 //
 24 //    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
 25 //    highlighter.setTextFragmenter(new SimpleFragmenter(text.length()));
 26 
 27 }
 28 else {
 29     
 30 }
 31             
 32 
 33             Query tidQuery = new TermQuery(new Term("tid",String.valueOf(tid)));
 34             Query uidQuery = new TermQuery(new Term("uid",String.valueOf(uid)));
 35 
 36             //组合查询条件
 37 
 38         
 39 
 40             query.add(tidQuery, Occur.MUST);
 41             query.add(uidQuery, Occur.MUST);
 42             //执行查询
 43 
 44              TopDocs topDocs = indexSearcher.search(query, 100);
 45              List<shop> list=new ArrayList<shop>();
 46                 //获取符合条件的编号
 47              System.out.println(topDocs.scoreDocs.length);
 48                 for (int i = 0; i < topDocs.scoreDocs.length; i++) {
 49                     ScoreDoc scoreDoc = topDocs.scoreDocs[i];
 50                     int no = scoreDoc.doc;
 51                     //用indexSearcher对象去索引库中查询编号对应的Document对象
 52                     Document document = indexSearcher.doc(no);
 53                     //将Document对象中的全部属性取出,再封装回JavaBean对象中去
 54                    shop user = (shop) LuceneUtils.Document2JavaBean(document, shop.class);
 55                    list.add(user);
 56                   System.out.println(user.getiD()+":"+user.getShopName()+":"+user.getShopPic()+":"+user.getShopPrice()+":"+user.getShopSalesvolume()+":"+user.getStoreName()+":"+user.getTid()+":"+user.getUid());
 57                 }
 58 return list;
 59             }
 60 
 61        public static void createIndexDB(shop userShop) throws Exception {
 62                    //把数据填充到JavaBean对象中
 63            // User user = new User("1", "钟福成23", "将来的程序员3");
 64            //shop userShop=new shop("4", "小米9", "2000.0", "华为旗舰店", "1q23", "1","2","1000");
 65                 //建立Document对象【导入的是Lucene包下的Document对象】
 66                 Document document = new Document();
 67                 //将JavaBean对象全部的属性值,均放到Document对象中去,属性名能够和JavaBean相同或不一样
 68                 /**
 69                  * 向Document对象加入一个字段
 70                  * 参数一:字段的关键字
 71                  * 参数二:字符的值
 72                  * 参数三:是否要存储到原始记录表中
 73                  *      YES表示是
 74                  *      NO表示否
 75                  * 参数四:是否须要将存储的数据拆分到词汇表中
 76                  *      ANALYZED表示拆分
 77                  *      NOT_ANALYZED表示不拆分
 78                  *
 79                  * */
 80 //                document.add(new Field("id", user, Field.Store.YES, Field.Index.ANALYZED));
 81 //                document.add(new Field("userName", user.getUserName(), Field.Store.YES, Field.Index.ANALYZED));
 82 //                document.add(new Field("sal", user.getSal(), Field.Store.YES, Field.Index.ANALYZED));
 83                document.add(new Field("iD", userShop.getiD(), Field.Store.YES, Field.Index.ANALYZED));
 84                 document.add(new Field("shopName", userShop.getShopName(), Field.Store.YES, Field.Index.ANALYZED));
 85                document.add(new Field("shopPic", userShop.getShopPic(), Field.Store.YES, Field.Index.ANALYZED));
 86                document.add(new Field("shopPrice", userShop.getShopPrice(), Field.Store.YES, Field.Index.ANALYZED));
 87                document.add(new Field("shopSalesvolume", userShop.getShopSalesvolume(), Field.Store.YES, Field.Index.ANALYZED));
 88                document.add(new Field("storeName", userShop.getStoreName(), Field.Store.YES, Field.Index.ANALYZED));
 89                document.add(new Field("tid", userShop.getTid(), Field.Store.YES, Field.Index.ANALYZED));
 90                document.add(new Field("uid", userShop.getUid(), Field.Store.YES, Field.Index.ANALYZED));
 91                 //建立IndexWriter对象
 92                 //目录指定为E:/createIndexDB
 93                 Directory directory = FSDirectory.open(new File("D:/createIndexDB"));
 94 
 95                 //使用标准的分词算法对原始记录表进行拆分
 96                 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
 97 
 98                 //LIMITED默认是1W个
 99                 IndexWriter.MaxFieldLength maxFieldLength = IndexWriter.MaxFieldLength.LIMITED;
100                 /**
101                  * IndexWriter将咱们的document对象写到硬盘中
102                  *
103                  * 参数一:Directory d,写到硬盘中的目录路径是什么
104                  * 参数二:Analyzer a, 以何种算法来对document中的原始记录表数据进行拆分红词汇表
105                  * 参数三:MaxFieldLength mfl 最多将文本拆分出多少个词汇
106                  *
107                  * */
108                 IndexWriter indexWriter = new IndexWriter(directory, analyzer, maxFieldLength);
109                 Term id=new Term("iD",String.valueOf(userShop.getiD()));
110                // Term id=new Term("id",String.valueOf(user.getId()));
111                 indexWriter.updateDocument(id,document);
112                 //将Document对象经过IndexWriter对象写入索引库中
113             
114                 indexWriter.optimize();
115 
116                 //设置合并因子为3,每当有3个cfs文件,就合并
117                 indexWriter.setMergeFactor(3);
118                 //关闭IndexWriter对象
119                 indexWriter.close();
120             }
121 
122        public static void DeleteByID(int id) throws IOException {
123                  //建立Document对象【导入的是Lucene包下的Document对象】
124                 Document document = new Document();
125 
126                 //建立IndexWriter对象
127                 //目录指定为E:/createIndexDB
128                 Directory directory = FSDirectory.open(new File("D:/createIndexDB"));
129 
130                 //使用标准的分词算法对原始记录表进行拆分
131                 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
132 
133                 //LIMITED默认是1W个
134                 IndexWriter.MaxFieldLength maxFieldLength = IndexWriter.MaxFieldLength.LIMITED;
135                 /**
136                  * IndexWriter将咱们的document对象写到硬盘中
137                  *
138                  * 参数一:Directory d,写到硬盘中的目录路径是什么
139                  * 参数二:Analyzer a, 以何种算法来对document中的原始记录表数据进行拆分红词汇表
140                  * 参数三:MaxFieldLength mfl 最多将文本拆分出多少个词汇
141                  *
142                  * */
143                 IndexWriter indexWriter = new IndexWriter(directory, analyzer, maxFieldLength);
144                 indexWriter.deleteDocuments(new Term("iD", String.valueOf(id)));
145                 indexWriter.optimize();
146 
147                 //设置合并因子为3,每当有3个cfs文件,就合并
148                 indexWriter.setMergeFactor(3);
149                 //关闭IndexWriter对象
150                 indexWriter.close();
151             }
152          
153        public static void findIndexDB(String nameString) throws Exception {
154 
155                 //建立IndexSearcher对象
156                 IndexSearcher indexSearcher = new IndexSearcher(LuceneUtils.getDirectory());
157                 //建立QueryParser对象
158                 QueryParser queryParser = new QueryParser(Version.LUCENE_30, "shopName", LuceneUtils.getAnalyzer());
159                 //给出要查询的关键字
160                 String keyWords = nameString;
161                 //建立Query对象来封装关键字
162                 Query query = queryParser.parse(keyWords);
163                 //用IndexSearcher对象去索引库中查询符合条件的前100条记录,不足100条记录的以实际为准
164                 TopDocs topDocs = indexSearcher.search(query, 100);
165                 //获取符合条件的编号
166              System.out.println(topDocs.scoreDocs.length);
167                 for (int i = 0; i < topDocs.scoreDocs.length; i++) {
168                     ScoreDoc scoreDoc = topDocs.scoreDocs[i];
169                     int no = scoreDoc.doc;
170                     //用indexSearcher对象去索引库中查询编号对应的Document对象
171                     Document document = indexSearcher.doc(no);
172                     //将Document对象中的全部属性取出,再封装回JavaBean对象中去
173                     shop user = (shop) LuceneUtils.Document2JavaBean(document, shop.class);
174                        System.out.println(user.getiD()+":"+user.getShopName()+":"+user.getShopPic()+":"+user.getShopPrice()+":"+user.getShopSalesvolume()+":"+user.getStore15:49:00Name()+":"+user.getTid()+":"+user.getUid());
177                 }
178             }
179 
180 }

 需要源码的,可以私信我!QQ:2748434806

相关文章
相关标签/搜索
本站公众号
   欢迎关注本站公众号,获取更多信息