url = "http://m.weather.com.cn/data/%s.html"
path = "./conf/codeAPI.txt"


def main():
    """Download the weather feed for every city code listed in ``path``.

    Each line of the file is scanned for runs of digits; every run is
    substituted into ``url``, the response body is read and handed to
    ``weatherAnalysis.xmlBuilder``. A code whose download or processing
    raises is skipped, so one bad entry does not abort the whole crawl.
    """
    # ``with`` closes the code list even if the loop is interrupted.
    with open(path, "r") as codes:
        for line in codes:
            for code in re.findall(r'[\d]+', line):
                try:
                    stream = urllib2.urlopen(url % code.strip())
                    try:
                        weatherAnalysis.xmlBuilder(stream.read())
                    finally:
                        # Release the HTTP response instead of leaking it.
                        stream.close()
                except Exception:
                    # Deliberate best-effort: ignore this code and move on.
                    continue
root = etree.Element("weatherinfos")
data_xml = "data/weatherinfo%s.xml"
splitor = '~'
subffixlow = 'L'
subffixhigh = 'H'


def xmlBuilder(f):
    """Append one city's forecast to the shared tree and rewrite the XML file.

    Args:
        f: JSON text whose top-level object has a "weatherinfo" member.

    The module-level ``root`` gains one <weatherinfo> element per call, and
    the whole tree is serialised again on every call to a file whose name
    carries the current date and hour.
    """
    js = json.loads(f)
    root.append(jsonAnalyser(js["weatherinfo"]))
    file_name = data_xml % datetime.datetime.now().strftime("%Y-%m-%d-%Hh")
    # tostring(encoding='utf-8') produces encoded bytes, hence binary mode;
    # ``with`` guarantees the data is flushed and the handle closed.
    with open(file_name, "wb") as out:
        out.write(etree.tostring(root, pretty_print=True, encoding='utf-8'))


def jsonAnalyser(js):
    """Build a <weatherinfo> element from one decoded "weatherinfo" object.

    Args:
        js: mapping with the keys "city", "city_en", "date", "week" and, for
            each n in 1..5, "temp<n>", "tempF<n>", "weather<n>", "wind<n>".
            Each temperature value is split on ``splitor`` and must yield at
            least two parts.

    Returns:
        The new element, carrying five <time_range> children whose <time>
        is the current hour plus 0, 4, 8, 12 and 16 hours.
    """
    element = etree.Element("weatherinfo", city=js["city"],
                            city_en=js["city_en"], date=js["date"],
                            week=js["week"])
    # Read the clock once so all five slots are offsets from the same instant.
    now = datetime.datetime.now()
    for x in range(1, 6):
        slot = str(x)
        start = now + datetime.timedelta(hours=4 * (x - 1))
        time_range = etree.SubElement(element, "time_range")
        etree.SubElement(time_range, "time").text = start.strftime("%H")
        # Same handling for the Celsius ("temp") and Fahrenheit ("tempF") keys.
        for key, tag in (("temp", "tempC"), ("tempF", "tempF")):
            readings = js[key + slot].split(splitor)
            etree.SubElement(time_range, tag + "L").text = readings[0] + subffixlow
            etree.SubElement(time_range, tag + "H").text = readings[1] + subffixhigh
        etree.SubElement(time_range, "weather").text = js["weather" + slot]
        etree.SubElement(time_range, "wind").text = js["wind" + slot]
    return element


# Data for one city after parsing:
<weatherinfo city="北京" city_en="beijing" date="" week="星期日"> <time_range> <time>22</time> <tempCL>16℃L</tempCL> <tempCH>30℃H</tempCH> <tempFL>60.8℉L</tempFL> <tempFH>86℉H</tempFH> <weather>晴</weather> <wind>微风</wind> </time_range> <time_range> <time>02</time> <tempCL>17℃L</tempCL> <tempCH>29℃H</tempCH> <tempFL>62.6℉L</tempFL> <tempFH>84.2℉H</tempFH> <weather>多云</weather> <wind>微风</wind> </time_range> <time_range> <time>06</time> <tempCL>17℃L</tempCL> <tempCH>25℃H</tempCH> <tempFL>62.6℉L</tempFL> <tempFH>77℉H</tempFH> <weather>多云转小雨</weather> <wind>微风</wind> </time_range> <time_range> <time>10</time> <tempCL>15℃L</tempCL> <tempCH>26℃H</tempCH> <tempFL>59℉L</tempFL> <tempFH>78.8℉H</tempFH> <weather>小雨转阴</weather> <wind>微风转北风3-4级</wind> </time_range> <time_range> <time>14</time> <tempCL>15℃L</tempCL> <tempCH>30℃H</tempCH> <tempFL>59℉L</tempFL> <tempFH>86℉H</tempFH> <weather>晴</weather> <wind>微风</wind> </time_range> </weatherinfo>
Apriori 算法(Apriori algorithm)是关联规则挖掘中的一项基本算法,由 Rakesh Agrawal 和 Ramakrishnan Srikant 两位博士于 1994 年提出。关联规则的目的是在一个数据集中找出项与项之间的关系,也被称为购物篮分析(Market Basket Analysis),因为“购物篮分析”很贴切地表达了该算法适用情景中的一个子集。关于这个算法有一个很有名的故事:“尿布和啤酒”。故事是这样的:美国的妇女们常常会嘱咐她们的丈夫下班后为孩子买尿布,而丈夫在买完尿布后又会顺手买回自己爱喝的啤酒,所以啤酒和尿布被一起购买的机会很多。超市发现这一规律后,把尿布和啤酒摆放在一起出售,这个举措使尿布和啤酒的销量双双增长,并一直为众商家所津津乐道。
def returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet):
    """Collect the items whose support exceeds the minimum and return them.

    NOTE(review): the body is not part of this excerpt; only the signature
    and the one-line description above were given.
    """
    raise NotImplementedError(
        "returnItemsWithMinSupport: body omitted in the source excerpt")


def joinSet(itemSet, length):
    """Cartesian product of a set with itself, with duplicates removed.

    NOTE(review): the body is not part of this excerpt.
    """
    raise NotImplementedError("joinSet: body omitted in the source excerpt")


def getItemSetTransactionList(data_iterator):
    """Convert the data into a set of items and a list of sets.

    NOTE(review): the body is not part of this excerpt.
    """
    raise NotImplementedError(
        "getItemSetTransactionList: body omitted in the source excerpt")


def runApriori(data_iter, minSupport, minConfidence):
    """Run the Apriori algorithm over the records parsed from ``data_iter``.

    Args:
        data_iter: handed to ``xmlAnalysis.parseWeatherXML``; the parsed
            result is fed to ``getItemSetTransactionList``.
        minSupport: threshold forwarded to ``returnItemsWithMinSupport``.
        minConfidence: lowest confidence for a rule to be reported.

    Returns:
        A pair ``(items, rules)``:
         - items: list of ``(tuple(itemset), support)``
         - rules: list of ``((pretuple, posttuple), confidence)``

    NOTE(review): ``getSupport`` and ``subsets`` are not defined in this
    excerpt and must be provided elsewhere.
    """
    itemSet, transactionList = getItemSetTransactionList(
        xmlAnalysis.parseWeatherXML(data_iter))

    freqSet = defaultdict(int)
    # key = itemset size n, value = the n-itemsets that satisfy minSupport
    largeSet = dict()

    currentLSet = returnItemsWithMinSupport(
        itemSet, transactionList, minSupport, freqSet)
    k = 2
    while currentLSet:
        largeSet[k - 1] = currentLSet
        candidates = joinSet(currentLSet, k)
        currentLSet = returnItemsWithMinSupport(
            candidates, transactionList, minSupport, freqSet)
        k = k + 1

    toRetItems = []
    for value in largeSet.values():
        toRetItems.extend([(tuple(item), getSupport(item)) for item in value])

    toRetRules = []
    for key, value in largeSet.items():
        # Itemsets of size 1 cannot be split into a non-empty antecedent and
        # consequent. Select them by key: slicing items() relied on dict
        # ordering and is not possible on a Python 3 dict view.
        if key < 2:
            continue
        for item in value:
            for element in map(frozenset, subsets(item)):
                remain = item.difference(element)
                if len(remain) > 0:
                    # float() avoids integer division under Python 2 should
                    # getSupport return an int.
                    confidence = float(getSupport(item)) / getSupport(element)
                    if confidence >= minConfidence:
                        toRetRules.append(
                            ((tuple(element), tuple(remain)), confidence))
    return toRetItems, toRetRules