在爬取某些网站时,有些需要登录才能获取访问权限。如果仅仅只是需要登录,这里可以给大家推荐一个很好用的工具。
在火狐浏览器中有个插件firebug(需要安装),通过这个插件可以详细地查看网站的访问过程(链接的跳转和访问先后顺序),以及每次链接的请求头信息、响应头信息,同时也可以查看post提交的数据。当然在IE和谷歌浏览器中也有些开发工具,F12直接唤出,但是我个人感觉火狐的firebug比较好用,IE的和谷歌的,我也偶尔使用。
通过上面介绍的工具可以获取需要模拟的详细过程,然后模拟登录,都是很容易的事。
这里我要介绍的是:登录如果需要验证码,就有些麻烦了。我这里想到一种比较常用的解决办法,就是弹出验证码,让用户手动输入。
实现如下,模拟登录:
public class LoginByCode { public static void main(String[] args) { CloseableHttpClient httpClient = HttpClientBuilder.create().build(); SimpleDateFormat format = new SimpleDateFormat("yyyyMMddhhmmss"); String path = "d:/img/tmp/" + format.format(new Date()) + ".jpg"; try { String imgurl = "http://www.shanghaiip.cn/wasWeb/login/Random.jsp"; HttpUriRequest get = new HttpGet(imgurl); HttpResponse res = httpClient.execute(get); res.setHeader("Content-Type", "image/gif"); byte[] img = EntityUtils.toByteArray(res.getEntity());//下载验证码图片 saveFile(path, img); String code = new ImgDialog().showDialog(null, path);//弹出验证码,获取填写验证码 String login = "http://www.shanghaiip.cn/wasWeb/login/loginServer.jsp"; HttpPost post = new HttpPost(login); List<NameValuePair> data = new ArrayList<NameValuePair>(); data.add(new BasicNameValuePair("username", "zhpatent")); data.add(new BasicNameValuePair("password", "5ca072839350b0733a2a456cc4004371"));//火狐里面用firebug能够查看密码是加密后的 data.add(new BasicNameValuePair("newrandom", code)); post.setEntity(new UrlEncodedFormEntity(data)); res = httpClient.execute(post); Header[] headers = res.getHeaders("Location");//获取跳转连接 get = new HttpGet(headers[0].getValue()); res = httpClient.execute(get); String body = EntityUtils.toString(res.getEntity()); if (body.contains("zhpatent")) { System.out.println("模拟登陆成功:" + body.substring(body.indexOf("zhpatent") - 40, body.indexOf("zhpatent") + 40)); } } catch (Exception e) { System.out.println("异常:" + e.getMessage()); } finally { File file = new File(path); if (file.exists()) { file.delete(); } try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } private static void saveFile(String path, byte[] data) { int size = 0; byte[] buffer = new byte[10240]; try (BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(path)); ByteArrayInputStream is = new ByteArrayInputStream(data)) { while ((size = is.read(buffer)) != -1) { bos.write(buffer, 0, size); } } catch (IOException e) { 
e.printStackTrace(); } } }
验证码工具类:
/**
 * Modal dialog that displays an image (a CAPTCHA) next to a text field and returns what the
 * user typed.
 */
public class ImgDialog {
    /** Neither read nor written by this class; retained because it is a public field. */
    public String message = null;
    private JButton confirm;
    private JDialog dialog = null;
    private TextField field;
    /** Text taken from the input field when the user confirms; empty until then. */
    String result = "";

    /**
     * Shows the dialog and blocks until it is closed.
     *
     * @param father owner frame of the dialog; may be {@code null}
     * @param path   path of the image file to display
     * @return the text entered when the user confirmed, or an empty string if the dialog was
     *         closed without confirming
     */
    public String showDialog(JFrame father, String path) {
        // Start clean so a reused instance never hands back the answer of an earlier call.
        result = "";

        JLabel label = new JLabel();
        label.setBorder(new EtchedBorder(EtchedBorder.LOWERED, null, null));
        label.setBounds(10, 10, 125, 51);
        label.setIcon(new ImageIcon(path));

        field = new TextField();
        field.setBounds(145, 10, 65, 20);

        confirm = new JButton("确定");
        confirm.setBounds(145, 40, 65, 20);

        // Shared by the button and the text field so that pressing Enter also submits.
        ActionListener submit = new ActionListener() {
            @Override
            public void actionPerformed(ActionEvent e) {
                result = field.getText();
                ImgDialog.this.dialog.dispose();
            }
        };
        confirm.addActionListener(submit);
        field.addActionListener(submit);

        dialog = new JDialog(father, true);
        // The default close operation only hides the window. Disposing it releases its native
        // resources, so an undisposed dialog cannot keep the AWT event thread alive.
        dialog.setDefaultCloseOperation(JDialog.DISPOSE_ON_CLOSE);
        dialog.setTitle("请输入图片中的验证码");

        // Components are positioned with absolute bounds, hence the null layout.
        Container pane = dialog.getContentPane();
        pane.setLayout(null);
        pane.add(label);
        pane.add(field);
        pane.add(confirm);

        dialog.pack();
        dialog.setSize(new Dimension(235, 110));
        dialog.setLocation(750, 430);
        // The dialog is modal, so this call returns only after the dialog has been closed.
        dialog.setVisible(true);
        return result;
    }
}
实验效果如下:
运行后会下载验证码并弹出对话框。
输入验证码,在登录后跳转的页面中获取到我的用户信息。
我这里是使用httpclient模拟登录的,使用httpclient时不用自己管理cookies,所以用起来方便,不会出现验证码对不上号的问题。
如果是使用Jsoup模拟登录就稍微麻烦点,得自己管理cookies,在访问验证码页面的时候得同时下载验证码和拿到cookies,然后在模拟登录的时候需要带上cookies。