TED Talks 的视频(www.ted.com/talks)云集了曾登上 TED 讲坛、闻名中外的思想家、艺术家和科技专家。在 TED.com 网站上,我们可以免费下载这些视频。视频附带可交互的英文讲稿以及 80 多个语种的字幕。
本次的爬取场景是:把某个演讲视频的英语和匈牙利语字幕稿抽取出来,逐段一一对应后写入文件,再利用 Selenium 随机点击下一个视频,不断重复上述操作。
英语字幕稿:示例链接:https://www.ted.com/talks/fabio_pacucci_could_the_earth_be_swallowed_by_a_black_hole/transcript
匈牙利语字幕稿:示例链接 = 英文链接 + ?language=hu
点击下一个:每个视频右侧都有一列推荐视频,只要用 Selenium 随机点击其中一个,即可进入下一个视频。
web
# Ask Chrome to run headless (without a visible browser window).
# NOTE(review): `options` is defined outside this excerpt - presumably a
# selenium ChromeOptions instance; confirm against the full script.
options.add_argument('--headless')
''' Imports go here. '''
# NOTE(review): the import statements are not part of this listing. The code
# below uses `threading`, `random` and selenium's `webdriver`, plus the
# module-level names `text_path`, `start_urls`, `behind`, `text`, `click` and
# `options`, none of which are defined in the visible source.
__author = 'cyy'

# Per-thread counter of processed videos. Attributes of a threading.local are
# only visible in the thread that set them, so the assignment below initialises
# the counter for the importing thread only; other threads are handled lazily
# in Get_Align._bump_id.
local = threading.local()
id = 0
local.id = id


class Get_Align(object):
    '''Collect aligned English/Hungarian transcript pairs and append them to a file.

    Parameters:
        num: 1-based index used to pick this worker's output path from
            ``text_path`` and its start URL from ``start_urls``.
        frequecy: upper bound of the follow-up loop; ``get_align_continue``
            iterates over ``range(2, frequecy)``.

    Attributes:
        driver: the selenium driver, created in ``get_align`` (``None`` before).
        hu_behind: URL suffix taken from the module-level ``behind``
            (presumably ``'?language=hu'`` - confirm against the full script).
        en_texts / hu_texts: texts scraped from the most recent page pair.
    '''

    def __init__(self, num, frequecy):
        self.__num = num
        self.__path = text_path[self.__num - 1]
        self.frequecy = frequecy
        self.driver = None
        self.hu_behind = behind
        self.en_texts = []
        self.hu_texts = []
        self.start_url = start_urls[self.__num - 1]

    @property
    def num(self):
        '''Index this instance was created with.'''
        return self.__num

    @num.setter
    def num(self, value):
        # Only the stored index changes; the output path and start URL chosen
        # in __init__ are not recomputed.
        self.__num = value

    def __call__(self, *args, **kwargs):
        '''Run ``get_align``; lets an instance be used as a plain callable.'''
        return self.get_align()

    @staticmethod
    def _bump_id(delta):
        '''Add *delta* to the calling thread's counter and return the new value.'''
        # A thread that never initialised ``local.id`` starts from 0 instead
        # of failing with AttributeError.
        local.id = getattr(local, 'id', 0) + delta
        return local.id

    @staticmethod
    def _write_pairs(path, en_texts, hu_texts):
        '''Append aligned pairs to *path* and return how many were written.

        Each pair is written as the English line, the Hungarian line and a
        blank line. Pairing stops at the shorter of the two lists.
        '''
        pairs = list(zip(en_texts, hu_texts))
        # The context manager closes the file even if a write fails.
        with open(path, 'a', encoding='utf-8') as out:
            out.writelines(
                '{}\n{}\n\n'.format(en, hu) for en, hu in pairs
            )
        return len(pairs)

    def _page_texts(self):
        '''Return the text of every element matching ``text`` on the current page.'''
        return [
            element.text
            for element in self.driver.find_elements_by_css_selector(text)
        ]

    def get_align(self):
        '''Scrape the start video, then continue with ``get_align_continue``.

        Opens a Chrome session, reads the Hungarian page
        (``start_url + hu_behind``) and the English page (``start_url``),
        appends the aligned pairs to the output file and bumps the per-thread
        counter. The browser is shut down if anything in this first step
        fails; otherwise ``get_align_continue`` takes over and shuts it down.
        '''
        # Start from empty lists so a repeated call does not re-write the
        # pairs gathered by an earlier one.
        self.en_texts, self.hu_texts = [], []
        self.driver = webdriver.Chrome(chrome_options=options)
        try:
            self.driver.maximize_window()
            self.driver.get(self.start_url + self.hu_behind)
            self.hu_texts = self._page_texts()
            self.driver.get(self.start_url)
            self.en_texts = self._page_texts()
            self._write_pairs(self.__path, self.en_texts, self.hu_texts)
            print(self._bump_id(1))
        except BaseException:
            # Do not leave a browser process behind when the first page fails.
            self.driver.quit()
            raise
        self.get_align_continue()

    def check_hu(self):
        '''Navigate away until the current page exposes a ``click`` link.

        While no element matches the ``click`` XPath, the counter is
        decremented, the browser goes back one page and a random ``click``
        link on that page is followed.

        NOTE(review): the original docstring describes this as "check whether
        the video has Hungarian subtitles"; presumably a missing Hungarian
        transcript yields a page without recommended-video links - confirm.

        Raises:
            IndexError: if the previous page has no ``click`` link either
                (``random.choice`` on an empty list).
        '''
        while not self.driver.find_elements_by_xpath(click):
            self._bump_id(-1)
            self.driver.back()
            candidates = self.driver.find_elements_by_xpath(click)
            random.choice(candidates).click()

    def get_align_continue(self):
        '''Scrape follow-up videos for ``range(2, frequecy)`` and quit the browser.

        Each round derives the transcript URLs from the current URL, collects
        the Hungarian and English texts, appends the aligned pairs, bumps the
        counter and follows a randomly chosen ``click`` link. The browser is
        shut down when the loop ends or an exception escapes.
        '''
        try:
            # Leave the start page through the first matching link.
            self.driver.find_element_by_xpath(click).click()
            for _ in range(2, self.frequecy):
                en_url = self.driver.current_url
                self.driver.get(en_url + '/transcript' + self.hu_behind)
                # NOTE(review): if check_hu navigates to another video,
                # en_url still points at the previous one - verify that this
                # round then writes nothing.
                self.check_hu()
                self.hu_texts = self._page_texts()
                self.driver.get(en_url + '/transcript')
                self.en_texts = self._page_texts()

                written = self._write_pairs(
                    self.__path, self.en_texts, self.hu_texts
                )
                if written < len(self.hu_texts):
                    print(
                        'transcript length mismatch: {} en vs {} hu'.format(
                            len(self.en_texts), len(self.hu_texts)
                        )
                    )
                print(self._bump_id(1))

                candidates = self.driver.find_elements_by_xpath(click)
                try:
                    random.choice(candidates).click()
                except IndexError:
                    # No candidates: fall back to the single-element lookup,
                    # which raises selenium's own error if nothing matches.
                    self.driver.find_element_by_xpath(click).click()
        finally:
            # quit() rather than close(): ends the whole driver session, and
            # the finally clause runs even when scraping fails.
            self.driver.quit()