这个是帮朋友做的,难点就是他们有一个反爬虫机制,用 requests 一直不行,后面我就用 selenium 直接把网页 copy 下来,然后再来解析本地的 html 文件,就没有问题啦。
现在看来,写得有点傻,多包涵。
# -*- coding:utf-8 -*-
"""Scrape the DrugBank SNP adverse-drug-reaction table into a CSV file.

The site has an anti-crawler mechanism, so each page is rendered in a real
browser through Selenium, saved to a local HTML file, and the table is then
extracted from that local copy with lxml XPath queries.

The code runs on both Python 2.7 and Python 3.
"""
import codecs
import csv
import datetime
import os
import sys
import time

from lxml import etree
from selenium import webdriver

# Today's date as a string; used as the prefix of the saved HTML file names.
today = datetime.date.today()
today_string = today.strftime('%Y-%m-%d')

_PY2 = sys.version_info[0] == 2
_TEXT_TYPE = type(u'')

# Placeholder written when a table cell has no value (the trailing space is
# kept so the output matches files produced by earlier runs).
_MISSING = 'Not Available '

_CSV_HEADERS = ['drug', 'inter', 'snp_rs_id', 'Allele_name',
                'Defining_change', 'Adverse_Reaction', 'ref', 'href',
                'original_title']

_BASE_URL = 'http://www.drugbank.ca/genobrowse/snp-adr?page='


def html_getter(site, file_name, wait_seconds=5):
    """Render *site* in Firefox and save the resulting HTML to *file_name*.

    Args:
        site: URL of the page to load.
        file_name: Path of the UTF-8 file that receives the page markup.
        wait_seconds: How long to wait after loading so the browser can
            finish reacting before the markup is captured.
    """
    # To drive Chrome instead, build the driver with
    # webdriver.Chrome(<path to chromedriver>).
    driver = webdriver.Firefox()
    try:
        driver.get(site)
        driver.maximize_window()
        time.sleep(wait_seconds)
        # outerHTML of the root element, i.e. the whole rendered document.
        source_code = driver.execute_script(
            'return document.documentElement.outerHTML;')
    finally:
        # Always close the browser, otherwise every page leaves a Firefox
        # process behind (also when loading the page fails).
        driver.quit()

    with codecs.open(file_name, 'w', 'utf8') as f:
        f.write(source_code)


def file_html(file_name):
    """Return the text of a locally saved HTML file.

    The file is decoded as UTF-8, the encoding `html_getter` writes it in,
    instead of relying on the platform default encoding.
    """
    with codecs.open(file_name, 'r', 'utf8') as f:
        return f.read()


def _csv_cell(value):
    """Return *value* in a form the running Python's csv writer accepts."""
    # Python 2's csv module writes byte strings and cannot encode non-ASCII
    # unicode objects by itself.
    if _PY2 and isinstance(value, _TEXT_TYPE):
        return value.encode('utf-8')
    return value


def csv_writer(ll, path='drugbank.csv'):
    """Append the rows in *ll* to the CSV file at *path*.

    The header row is written only when the file does not exist yet or is
    empty, so repeated runs do not scatter duplicate headers through the data.

    Args:
        ll: Iterable of row tuples in the column order of the header.
        path: Destination CSV file; defaults to ``drugbank.csv``.
    """
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    # The csv module handles line endings itself: it wants a binary file on
    # Python 2 and newline='' on Python 3. Plain text mode yields blank rows
    # on Windows.
    if _PY2:
        f = open(path, 'ab')
    else:
        f = open(path, 'a', newline='', encoding='utf-8')
    with f:
        f_csv = csv.writer(f)
        if needs_header:
            f_csv.writerow(_CSV_HEADERS)
        f_csv.writerows([_csv_cell(cell) for cell in row] for row in ll)


def _first(values, default=_MISSING):
    """Return the first XPath match in *values*, or *default* if none."""
    return values[0] if values else default


def data_get(html, rows=None):
    """Extract the table rows from a saved page.

    Args:
        html: Markup of one saved page.
        rows: Optional list to append to; a new list is created when omitted.

    Returns:
        The list of row tuples ``(drug, inter, snp_rs_id, Allele_name,
        Defining_change, Adverse_Reaction, ref, href, original_title)``.
        Cells without a value hold ``'Not Available '``.
    """
    if rows is None:
        rows = []
    selector = etree.HTML(html)
    for row in selector.xpath('/html/body/main/table/tbody/tr'):
        # 1. Drug: name and the link text next to it, joined by a space.
        drug = (_first(row.xpath('td[1]/strong/text()')) + ' ' +
                _first(row.xpath('td[1]/a/text()')))
        # 2. Interacting gene/enzyme: full text content of the cell.
        #    string() of an empty node-set is '', so a missing cell is safe.
        inter = row.xpath('string(td[2])')
        # 3. SNP RS ID
        snp_rs_id = _first(row.xpath('td[3]/a/text()'))
        # 4. Allele name
        allele_name = _first(row.xpath('td[4]/text()'))
        # 5. Defining change
        defining_change = _first(row.xpath('td[5]/text()'))
        # 6. Adverse reaction
        adverse_reaction = _first(row.xpath('td[6]/text()'))
        # 7. Reference(s): link text, target and tooltip title.
        ref = _first(row.xpath('td[7]/span/a/text()'))
        href = _first(row.xpath('td[7]/span/a/@href'))
        original_title = _first(row.xpath('td[7]/span/a/@data-original-title'))

        rows.append((drug, inter, snp_rs_id, allele_name, defining_change,
                     adverse_reaction, ref, href, original_title))
    return rows


def main(first_page=1, last_page=4):
    """Download pages *first_page*..*last_page* and write their rows to CSV."""
    rows = []
    for page_num in range(first_page, last_page + 1):
        site = _BASE_URL + str(page_num)
        # Fetch the page through the browser, then parse the local copy.
        file_name = u'{0}drugbank_{1}.html'.format(today_string, page_num)
        html_getter(site, file_name)
        data_get(file_html(file_name), rows)
    csv_writer(rows)


if __name__ == '__main__':
    main()