【整理】用Python+beautifulsoup抓取股票数据

时间 2019-12-11

标签整理 python+beautifulsoup python beautifulsoup 抓取股票数据栏目 Python 繁體版

原文原文链接

【刚开始写总结，读者如果对我的内容有任何建议，欢迎留言反馈，或直接加QQ1172617666，期待交流】

先贴上代码，再详细地写一下在写这些代码的过程中遇到的问题和解决的方法。

这些代码完成的任务是：访问 http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/600000.phtml ，把该股票代码全部季度的历史信息抓取下来，保存成.json格式（可以用记事本打开）的文件。我是存放在了C:\Users\ZSH\Desktop\Python\DATA下面，你可以把这个路径替换为你自己的路径。

 
       #coding:utf-8 
      
 
       ''' 
      
 
       Created on 2014年3月20日 
      
 
               
      
 
       @author: ZSH 
      
 
       ''' 
      
 
       import json
       import os
       import urllib.request

       from bs4 import BeautifulSoup
      
 
               
      
 
               
      
 
       def get_year_range(code):
           """Return the years for which the history page offers data.

           Downloads the Sina market-history page for ``code`` and reads the
           ``<option>`` entries of its ``<select name="year">`` drop-down.

           Args:
               code: Stock code (int or str); interpolated into the page URL.

           Returns:
               A list of plain ``str`` option labels, in page order. Options
               without a single text child are left out.

           Raises:
               IndexError: The page contains no ``<select name="year">``.
               Whatever ``urllib.request.urlopen`` raises on a failed download.
           """
           url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/%s.phtml' % code
           # Close the HTTP response deterministically instead of leaking it.
           with urllib.request.urlopen(url) as response:
               content = response.read()
           # Name the parser explicitly so the result does not depend on which
           # optional parsers happen to be installed.
           soup = BeautifulSoup(content, 'html.parser')
           year_select = soup.find('select', attrs={'name': 'year'})
           if year_select is None:
               # IndexError is kept so callers that relied on the failure of
               # indexing an empty result list still see the same type.
               raise IndexError('no <select name="year"> found at %s' % url)
           # Convert to plain str: a NavigableString keeps a reference to the
           # whole parse tree, and ``.string`` may be None.
           return [str(option.string).strip()
                   for option in year_select.findAll('option')
                   if option.string is not None]
      
 
               
      
 
       def get_data(code, output_dir=r'C:\Users\ZSH\Desktop\Python\DATA'):
           """Save every available quarter of a stock's price history as JSON.

           For each year returned by ``get_year_range`` and each quarter 1-4,
           fetches the Sina market-history page, reads the table whose id is
           ``FundHoldSharesTable`` and writes one ``.json`` file holding a
           mapping from column header to the list of cell texts in that column.
           A quarter that cannot be downloaded, parsed or written is reported
           on stdout and skipped.

           Args:
               code: Stock code (int or str).
               output_dir: Existing directory that receives the JSON files.

           Returns:
               None.
           """
           codestr = str(code)
           for year in get_year_range(code):
               for season in range(1, 5):
                   jidu = str(season)
                   try:
                       url = ('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/'
                              + codestr + '.phtml?year=' + year + '&jidu=' + jidu)
                       # Close the HTTP response as soon as the body is read.
                       with urllib.request.urlopen(url) as rsp:
                           html = rsp.read()
                       soup = BeautifulSoup(html, 'html.parser', from_encoding='GB2312')
                       table = soup.find_all('table', attrs={'id': 'FundHoldSharesTable'})[0]
                       rows = table.findAll('tr')
                       # rows[0] is used as the title, rows[1] as the column
                       # headers; everything after that is treated as data.
                       title = rows[0].get_text(strip=True)
                       headers = [cell.get_text() for cell in rows[1].findAll('td')[:7]]
                       d1 = {}
                       for row in rows[2:]:
                           cells = row.findAll('td')
                           if len(cells) < len(headers):
                               # Skip short rows so the columns stay aligned.
                               continue
                           for header, cell in zip(headers, cells):
                               d1.setdefault(header, []).append(cell.get_text(strip=True))
                       filename = title + year + r'年' + jidu + r'季度.json'
                       # os.path.join avoids the stray space the old literal
                       # 'DATA\ ' put in front of every file name.
                       path = os.path.join(output_dir, filename)
                       # The with-block closes the file; UTF-8 is explicit so
                       # the output does not depend on the locale encoding.
                       with open(path, 'w', encoding='utf-8') as encodejson:
                           encodejson.write(json.dumps(d1, ensure_ascii=False))
                       print('已完成' + filename)
                   except Exception as err:
                       # Exception (not a bare except) lets Ctrl-C stop the run;
                       # the details show which quarter failed and why.
                       print('出现了错误', year, jidu, repr(err))
                       continue
           print('抓取完成！')
      
 
               
      
 
       # Script entry point: run the scraper for stock code 600000.
       get_data(600000)
      

1，Windows下Python环境的搭建：我的环境是myeclipse+pydev，参考的教程帖子是《Python环境搭建》。我个人认为myeclipse是个很强大的集成开发环境（IDE），上手较容易。关于Python函数、for语句等基本语法，我推荐两个文档：一是“Python简明教程”（中文），内容通俗易懂；另一个就是位于C:\Python34\Doc的说明文档。

2，这个脚本用到的第三方模块——beautifulsoup4，也就是from bs4 import BeautifulSoup 这一句代码牵扯到的。这个模块用于从html代码中分析出表格区域，进一步解析出数据。关于beautifulsoup的安装，我参考的是《Windows平台安装Beautiful Soup》。

3，关于用urllib.request模块实现整个功能的部分，我从这位大哥的博客里学到了好多，他的博客真是超级详细易懂，体贴初学者。博客地址

4，Python字符串“格式化”——也即替换句子中的某一个字符串。Python中与字符串相关的各种操作，《Python基础教程笔记——使用字符串》中讲得很详细。

5，Python2到Python3的转换：因为字符编码的问题（中文print出来是ascii码），有人建议换到Python3，因为Python3默认使用utf-8。《Python3.x和Python2.x的区别》这个链接讲了Python2和Python3的区别。

来自为知笔记(Wiz)