简单小爬虫(HTML 抓取)
//请求连接,获取网页源码 public String sendGet(String url) { String result = ""; StringBuffer sb = new StringBuffer(); BufferedReader in = null; try { String urlNameString = url; System.out.println(urlNameString); URL realUrl = new URL(urlNameString); // 打开和URL之间的链接 URLConnection connection = realUrl.openConnection(); // 设置通用的请求属性 connection.setRequestProperty("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); connection.setRequestProperty("Content-type", "text/html;charset=gbk"); connection.setRequestProperty("upgrade-insecure-requests", "1"); connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.9"); connection.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"); // connection connection.connect(); // 获取全部响应头字段 Map<String, List<String>> map = connection.getHeaderFields(); in = new BufferedReader(new InputStreamReader( connection.getInputStream(),"GBK")); String lines; while ((lines = in.readLine()) != null) { lines=getUTF8StringFromGBKString(lines); sb.append(lines); //System.out.println(line); } } catch (Exception e) { System.out.println("发送GET请求出现异常!" + e); e.printStackTrace(); } // 使用finally块来关闭输入流 finally { try { if (in != null) { in.close(); } } catch (Exception e2) { e2.printStackTrace(); } } return sb.toString(); }
1 //对抓取的数据存入磁盘中,进行全文搜索 2 public class LuceneManager { 3 4 public static List<shop> Select(String name,int tid,int uid) throws IOException { 5 6 IndexSearcher indexSearcher =new IndexSearcher(LuceneUtils.getDirectory()); 7 8 //建立一个布尔查询对象 9 10 BooleanQuery query = new BooleanQuery(); 11 12 //建立第一个查询条件 13 if (name!=null&&name.length()!=0) { 14 int maxEdits = 100; //相同的前缀长度 15 // Query query = new FuzzyQuery(term,maxEdits,prefixLength); 16 char[] tc=name.toCharArray(); 17 for (int i = 0; i < tc.length; i++) { 18 Term term = new Term("shopName","*"+tc[i]+"*"); 19 Query shopnameQuery=new WildcardQuery(term); 20 query.add(shopnameQuery, Occur.MUST); 21 } 22 // SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>"); 23 // 24 // Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query)); 25 // highlighter.setTextFragmenter(new SimpleFragmenter(text.length())); 26 27 } 28 else { 29 30 } 31 32 33 Query tidQuery = new TermQuery(new Term("tid",String.valueOf(tid))); 34 Query uidQuery = new TermQuery(new Term("uid",String.valueOf(uid))); 35 36 //组合查询条件 37 38 39 40 query.add(tidQuery, Occur.MUST); 41 query.add(uidQuery, Occur.MUST); 42 //执行查询 43 44 TopDocs topDocs = indexSearcher.search(query, 100); 45 List<shop> list=new ArrayList<shop>(); 46 //获取符合条件的编号 47 System.out.println(topDocs.scoreDocs.length); 48 for (int i = 0; i < topDocs.scoreDocs.length; i++) { 49 ScoreDoc scoreDoc = topDocs.scoreDocs[i]; 50 int no = scoreDoc.doc; 51 //用indexSearcher对象去索引库中查询编号对应的Document对象 52 Document document = indexSearcher.doc(no); 53 //将Document对象中的全部属性取出,再封装回JavaBean对象中去 54 shop user = (shop) LuceneUtils.Document2JavaBean(document, shop.class); 55 list.add(user); 56 System.out.println(user.getiD()+":"+user.getShopName()+":"+user.getShopPic()+":"+user.getShopPrice()+":"+user.getShopSalesvolume()+":"+user.getStoreName()+":"+user.getTid()+":"+user.getUid()); 57 } 58 return list; 59 } 60 61 public static void createIndexDB(shop userShop) throws 
Exception { 62 //把数据填充到JavaBean对象中 63 // User user = new User("1", "钟福成23", "将来的程序员3"); 64 //shop userShop=new shop("4", "小米9", "2000.0", "华为旗舰店", "1q23", "1","2","1000"); 65 //建立Document对象【导入的是Lucene包下的Document对象】 66 Document document = new Document(); 67 //将JavaBean对象全部的属性值,均放到Document对象中去,属性名能够和JavaBean相同或不一样 68 /** 69 * 向Document对象加入一个字段 70 * 参数一:字段的关键字 71 * 参数二:字符的值 72 * 参数三:是否要存储到原始记录表中 73 * YES表示是 74 * NO表示否 75 * 参数四:是否须要将存储的数据拆分到词汇表中 76 * ANALYZED表示拆分 77 * NOT_ANALYZED表示不拆分 78 * 79 * */ 80 // document.add(new Field("id", user, Field.Store.YES, Field.Index.ANALYZED)); 81 // document.add(new Field("userName", user.getUserName(), Field.Store.YES, Field.Index.ANALYZED)); 82 // document.add(new Field("sal", user.getSal(), Field.Store.YES, Field.Index.ANALYZED)); 83 document.add(new Field("iD", userShop.getiD(), Field.Store.YES, Field.Index.ANALYZED)); 84 document.add(new Field("shopName", userShop.getShopName(), Field.Store.YES, Field.Index.ANALYZED)); 85 document.add(new Field("shopPic", userShop.getShopPic(), Field.Store.YES, Field.Index.ANALYZED)); 86 document.add(new Field("shopPrice", userShop.getShopPrice(), Field.Store.YES, Field.Index.ANALYZED)); 87 document.add(new Field("shopSalesvolume", userShop.getShopSalesvolume(), Field.Store.YES, Field.Index.ANALYZED)); 88 document.add(new Field("storeName", userShop.getStoreName(), Field.Store.YES, Field.Index.ANALYZED)); 89 document.add(new Field("tid", userShop.getTid(), Field.Store.YES, Field.Index.ANALYZED)); 90 document.add(new Field("uid", userShop.getUid(), Field.Store.YES, Field.Index.ANALYZED)); 91 //建立IndexWriter对象 92 //目录指定为E:/createIndexDB 93 Directory directory = FSDirectory.open(new File("D:/createIndexDB")); 94 95 //使用标准的分词算法对原始记录表进行拆分 96 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); 97 98 //LIMITED默认是1W个 99 IndexWriter.MaxFieldLength maxFieldLength = IndexWriter.MaxFieldLength.LIMITED; 100 /** 101 * IndexWriter将咱们的document对象写到硬盘中 102 * 103 * 参数一:Directory d,写到硬盘中的目录路径是什么 104 * 
参数二:Analyzer a, 以何种算法来对document中的原始记录表数据进行拆分红词汇表 105 * 参数三:MaxFieldLength mfl 最多将文本拆分出多少个词汇 106 * 107 * */ 108 IndexWriter indexWriter = new IndexWriter(directory, analyzer, maxFieldLength); 109 Term id=new Term("iD",String.valueOf(userShop.getiD())); 110 // Term id=new Term("id",String.valueOf(user.getId())); 111 indexWriter.updateDocument(id,document); 112 //将Document对象经过IndexWriter对象写入索引库中 113 114 indexWriter.optimize(); 115 116 //设置合并因子为3,每当有3个cfs文件,就合并 117 indexWriter.setMergeFactor(3); 118 //关闭IndexWriter对象 119 indexWriter.close(); 120 } 121 122 public static void DeleteByID(int id) throws IOException { 123 //建立Document对象【导入的是Lucene包下的Document对象】 124 Document document = new Document(); 125 126 //建立IndexWriter对象 127 //目录指定为E:/createIndexDB 128 Directory directory = FSDirectory.open(new File("D:/createIndexDB")); 129 130 //使用标准的分词算法对原始记录表进行拆分 131 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); 132 133 //LIMITED默认是1W个 134 IndexWriter.MaxFieldLength maxFieldLength = IndexWriter.MaxFieldLength.LIMITED; 135 /** 136 * IndexWriter将咱们的document对象写到硬盘中 137 * 138 * 参数一:Directory d,写到硬盘中的目录路径是什么 139 * 参数二:Analyzer a, 以何种算法来对document中的原始记录表数据进行拆分红词汇表 140 * 参数三:MaxFieldLength mfl 最多将文本拆分出多少个词汇 141 * 142 * */ 143 IndexWriter indexWriter = new IndexWriter(directory, analyzer, maxFieldLength); 144 indexWriter.deleteDocuments(new Term("iD", String.valueOf(id))); 145 indexWriter.optimize(); 146 147 //设置合并因子为3,每当有3个cfs文件,就合并 148 indexWriter.setMergeFactor(3); 149 //关闭IndexWriter对象 150 indexWriter.close(); 151 } 152 153 public static void findIndexDB(String nameString) throws Exception { 154 155 //建立IndexSearcher对象 156 IndexSearcher indexSearcher = new IndexSearcher(LuceneUtils.getDirectory()); 157 //建立QueryParser对象 158 QueryParser queryParser = new QueryParser(Version.LUCENE_30, "shopName", LuceneUtils.getAnalyzer()); 159 //给出要查询的关键字 160 String keyWords = nameString; 161 //建立Query对象来封装关键字 162 Query query = queryParser.parse(keyWords); 163 
//用IndexSearcher对象去索引库中查询符合条件的前100条记录,不足100条记录的以实际为准 164 TopDocs topDocs = indexSearcher.search(query, 100); 165 //获取符合条件的编号 166 System.out.println(topDocs.scoreDocs.length); 167 for (int i = 0; i < topDocs.scoreDocs.length; i++) { 168 ScoreDoc scoreDoc = topDocs.scoreDocs[i]; 169 int no = scoreDoc.doc; 170 //用indexSearcher对象去索引库中查询编号对应的Document对象 171 Document document = indexSearcher.doc(no); 172 //将Document对象中的全部属性取出,再封装回JavaBean对象中去 173 shop user = (shop) LuceneUtils.Document2JavaBean(document, shop.class); 174 System.out.println(user.getiD()+":"+user.getShopName()+":"+user.getShopPic()+":"+user.getShopPrice()+":"+user.getShopSalesvolume()+":"+user.getStore15:49:00Name()+":"+user.getTid()+":"+user.getUid()); 177 } 178 } 179 180 }
需要源码的,可以私信我!QQ:2748434806