因为马上就要大四实习了,博主实在懒得在学校官网上一个个翻,直接用爬虫将所有数据都爬下来
放在表格里,这样感觉简单多了,可惜还没找到工作,so sad
总共选择了三个学校:湖南大学,中南大学,湘潭大学
三个项目代码分别如下(新手代码,惨不忍睹):
湘潭大学:
#!/usr/bin/python3
# coding=utf-8
# Scrape Xiangtan University (XTU) career-talk listings for every day of
# October 2018 and write them into one spreadsheet row per talk.
import requests
import json
import logging
import os
import xlwt

basic_url = 'http://jobs.xtu.edu.cn/index/getdaycareers?day=2018-10-'

logging.basicConfig(level=logging.DEBUG, format='')

workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list1')
sheet1.write(0, 0, '时间')
sheet1.write(0, 1, '地点')
sheet1.write(0, 2, '公司名称')
sheet1.write(0, 3, '专业要求')
sheet1.write(0, 5, '详细信息')

count = 1  # next spreadsheet row to fill (row 0 holds the headers)
for day in range(1, 32):  # was `i`, which the inner loop shadowed
    url = basic_url + str(day)
    logging.debug('the clawer web site is:' + url)
    response = requests.get(url)
    payload = response.json()  # parse the body once instead of four times
    logging.debug(type(response))
    logging.debug(payload)
    logging.debug(payload['data'])
    logging.debug(type(payload['data']))
    data_list = payload['data']  # the useful data
    for talk in data_list:
        sheet1.write(count, 0, talk['meet_day'])
        sheet1.write(count, 1, talk['address'])
        sheet1.write(count, 2, talk['meet_name'])
        sheet1.write(count, 3, talk['professionals'])
        # str() guards against the id arriving as an int in the JSON.
        sheet1.write(count, 5,
                     'http://jobs.xtu.edu.cn/detail/career?id='
                     + str(talk['career_talk_id']))
        count = count + 1

# xlwt emits legacy BIFF (.xls); saving under a .xlsx name makes Excel
# reject the file as corrupted, so use the matching extension.
workbook.save('湘潭大学十月份招聘信息.xls')
中南大学:
这个最坑,花了我一个多小时
#!/usr/bin/python3
# coding=utf-8
# Scrape Central South University (CSU) career-talk announcements:
# POST once for the month's listing, then fetch and parse each detail page.
import requests
import xlwt
import json
import logging
import bs4
from bs4 import BeautifulSoup

# Initialise the log file path and message format.
logging.basicConfig(filename='log.txt', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.getLogger('requests').setLevel(logging.WARNING)  # silence requests' own logging

# Initialise the spreadsheet headers.
workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list')
sheet1.write(0, 0, '时间')
sheet1.write(0, 1, '地点')
sheet1.write(0, 2, '公司名称')
sheet1.write(0, 3, '职位名称')
sheet1.write(0, 4, '教育水平')
sheet1.write(0, 5, '专业要求')
sheet1.write(0, 6, '空缺数量')
sheet1.write(0, 7, '详细信息')

json_all_url = 'http://jobsky.csu.edu.cn/Home/SearchDateAllMonth'
dt1 = {'Date': '2018-09-04'}
post_data = requests.post(json_all_url, data=dt1)
json_data = post_data.json()
logging.debug(type(json_data))

basic_html_url = 'http://jobsky.csu.edu.cn/Home/ArticleDetails/'

# Maps the index of a <td> in the requirements table to the spreadsheet
# column it belongs in: cells alternate label/value, so the values sit at
# the odd indices 1, 3, 5, 7.
TD_INDEX_TO_COLUMN = {1: 3, 3: 4, 5: 5, 7: 6}

counter_all = 1  # next spreadsheet row to fill
for data in json_data:
    # str() guards against NewsID arriving as an int in the JSON.
    company_id = str(data['NewsID'])
    html_url = basic_html_url + company_id
    html_txt = requests.get(html_url)
    bs = BeautifulSoup(html_txt.text, 'lxml')

    # Company name — AttributeError means find() returned None (page layout
    # differs); narrow except instead of a bare one that swallows everything.
    try:
        name_node = bs.find('h1', attrs={'class': 'text-center title'})
        sheet1.write(counter_all, 2, name_node.getText())
    except AttributeError:
        logging.debug('the url ' + html_url + ' has some problem')

    # Time and place of the talk.
    try:
        place_and_time = bs.find('div', attrs={'id': 'placeAndTime'})
        advertise_time = place_and_time.find(
            'p', attrs={'class': 'text-center time'}).getText()
        advertise_place = place_and_time.find(
            'p', attrs={'class': 'text-center place'}).getText()
        sheet1.write(counter_all, 0, advertise_time)
        sheet1.write(counter_all, 1, advertise_place)
    except AttributeError:
        logging.debug('the url ' + html_url + ' has some problem')

    # Job title / education / majors / openings from the requirements table.
    try:
        demand_table = bs.find('table', attrs={'class': 'table table-bordered'})
        for td_index, td in enumerate(demand_table.find_all('td')):
            column = TD_INDEX_TO_COLUMN.get(td_index)
            if column is not None:
                sheet1.write(counter_all, column, td.getText())
    except AttributeError:
        logging.debug('the url ' + html_url + ' has some problem')

    sheet1.write(counter_all, 7, html_url)
    # Advance the row unconditionally: previously it was only bumped when the
    # table parsed, so a failing page let the next company overwrite this row.
    counter_all += 1

# xlwt emits legacy BIFF (.xls); use the matching extension so Excel
# does not flag the file as corrupted.
workbook.save('中南大学招聘信息.xls')
最后是湖南大学,不知道为什么,湖南大学招聘信息少的可怜
#!/usr/bin/python3
# coding=utf-8
# Scrape Hunan University (HNU) career-talk listings from the JSON endpoint
# and write them into one spreadsheet row per talk.
import requests
import json
import logging
import os
import xlwt

json_url = ('https://hnu.bysjy.com.cn/module/getcareers?start_page=1'
            '&keyword=&type=inner&day=&count=15&start=1&_=1536044186160')

logging.basicConfig(level=logging.DEBUG, format='')

json_data = requests.get(json_url)

workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list1')
sheet1.write(0, 0, '时间')
sheet1.write(0, 1, '地点')
sheet1.write(0, 2, '公司名称')
sheet1.write(0, 3, '招聘会')
sheet1.write(0, 4, '专业要求')
# The detail URL is written to column 5 below, so the header belongs in
# column 5 too (it was previously written to column 6, leaving the URL
# column headerless and the header over an empty column).
sheet1.write(0, 5, '详细信息')

count = 1  # next spreadsheet row to fill (row 0 holds the headers)
data_list = json_data.json()['data']  # the useful data
for talk in data_list:
    sheet1.write(count, 0, talk['meet_day'] + talk['meet_time'])
    sheet1.write(count, 1, talk['address'])
    sheet1.write(count, 2, talk['company_name'])
    sheet1.write(count, 3, talk['meet_name'])
    sheet1.write(count, 4, talk['professionals'])
    # str() guards against the id arriving as an int in the JSON.
    sheet1.write(count, 5,
                 'https://hnu.bysjy.com.cn/detail/career?id='
                 + str(talk['career_talk_id']))
    count = count + 1

# xlwt emits legacy BIFF (.xls); saving under a .xlsx name makes Excel
# reject the file as corrupted, so use the matching extension.
workbook.save('湖南大学招聘信息.xls')