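# Source: https://blog.csdn.net/qq_38900565/article/details/102650951
# Weather module.
# Scrapes the current weather and the forecast for the next 7 days, using
# regular expressions and lxml + XPath respectively.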
import re
import time
from urllib import request
from urllib.parse import urljoin

import requests
from lxml import etree
from xpinyin import Pinyin
class WeatherParse(object):
    """Scrape current conditions and the daily forecast from tianqi.com.

    The current-conditions data is extracted with regular expressions and
    the forecast with lxml/XPath.  A city page is addressed by joining the
    pinyin transliteration of the city name onto ``Url``.
    """

    # Browser-like User-Agent sent with every request.
    Headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    # Site root onto which the city's pinyin name is joined.
    Url = 'https://www.tianqi.com/'
    # Seconds to wait for the server; without a timeout requests.get() can
    # block indefinitely on a stalled connection.
    Timeout = 10

    @classmethod
    def _fetch(cls, url, headers):
        """Download *url* with *headers* and return the decoded body text."""
        return requests.get(url, headers=headers, timeout=cls.Timeout).text

    @staticmethod
    def _first(pattern, text, flags=re.DOTALL):
        """Return the captured text of the first match of *pattern* in *text*.

        A pattern with one group yields that group's string; a pattern with
        several groups yields a tuple of strings.

        Raises:
            IndexError: if the pattern does not match.  The exception type
                matches what indexing an empty ``re.findall`` result raises,
                but the message names the pattern that failed.
        """
        match = re.search(pattern, text, flags)
        if match is None:
            raise IndexError('pattern not found in page: %s' % pattern)
        groups = match.groups()
        return groups[0] if len(groups) == 1 else groups

    @classmethod
    def parse_now_html(cls, text):
        """Extract the current conditions from already-downloaded page HTML.

        Kept separate from the download so the parsing can be used on cached
        pages.

        Args:
            text: HTML source of a city page.

        Returns:
            dict with the keys ``city``, ``week``, ``date``, ``datetime``,
            ``quality``, ``temperature``, ``weather``, ``wind_direct``,
            ``humidity``, ``ultraviolet_rays``, ``PM`` and ``img_url``.
            All values are strings.  ``datetime`` is the local wall-clock
            time of the call (``HH:MM:SS``), not a value read from the page.

        Raises:
            IndexError: if an expected element is missing from *text*.
        """
        first = cls._first
        now_time = first(r'<dd class="week">(.*?)</dd>', text)
        # Each <b> holds "label:value"; only the value part is kept.
        humidity, wind_direct, ultraviolet_rays = [
            item.split(':')[1]
            for item in first(
                r'<dd class="shidu"><b>(.*?)</b><b>(.*?)</b><b>(.*?)</b>', text)
        ]
        return {
            'city': first(r'<dd class="name"><h2>(.*?)</h2>', text),
            # The captures stop before the literal characters, so they are
            # put back when building the value.
            'week': '星期' + first(r'星期(.*?)\s', now_time, 0),
            'date': first(r'(.*?)日', now_time, 0) + '日',
            'datetime': time.strftime('%H:%M:%S'),
            'quality': first(r'<dd class="kongqi">.*?:(.*?)</h5>', text),
            'temperature': first(r'<p class="now"><b>(.*?)</b>', text),
            'weather': first(r'<span><b>(.*?)</b>', text),
            'wind_direct': wind_direct,
            'humidity': humidity,
            'ultraviolet_rays': ultraviolet_rays,
            'PM': first(
                r'<dd class="kongqi">.*?</h5><h6>PM:(.*?)</h6>', text).strip(),
            # The pattern consumes the leading "//" of the protocol-relative
            # src, so only the scheme needs to be prepended.
            'img_url': 'https://' + first(
                r'<dd class="weather">\n<i><img src="//(.*?)">', text),
        }

    @classmethod
    def parse_weather_now(cls, url, headers):
        """Download *url* and return its current conditions.

        Args:
            url: address of the city page.
            headers: HTTP headers to send with the request.

        Returns:
            The dict described in :meth:`parse_now_html`.
        """
        return cls.parse_now_html(cls._fetch(url, headers))

    @classmethod
    def parse_future_html(cls, text, city, base_url=None):
        """Extract the daily forecast from already-downloaded page HTML.

        Args:
            text: HTML source of a city page.
            city: key under which the forecast list is returned.
            base_url: URL the page was loaded from, used to resolve the icon
                ``src`` attributes; defaults to ``Url``.

        Returns:
            ``{city: [day, ...]}`` where each day is a dict with the keys
            ``datetime``, ``week``, ``weather``, ``max_temperature``,
            ``min_temperature`` and ``img_url``.

        Raises:
            IndexError: if a day entry lacks its date, weekday or icon, or
                the temperature chart is missing.
        """
        base = cls.Url if base_url is None else base_url
        page = etree.HTML(text)

        days = []
        for entry in page.xpath("//ul[@class='week']/li"):
            days.append((
                entry.xpath("./b/text()")[0],
                entry.xpath("./span/text()")[0],
                # Resolve against the page URL so a protocol-relative src
                # gets the page's scheme instead of a hard-coded "http:".
                urljoin(base, entry.xpath("./img/@src")[0]),
            ))

        weathers = page.xpath("//ul[@class='txt txt2']/li/text()")
        chart = page.xpath("//div[@class='zxt_shuju']/ul")[0]
        highs = chart.xpath("./li/span/text()")
        lows = chart.xpath("./li/b/text()")

        # zip() stops at the shortest column, so a page where one column has
        # fewer entries yields fewer days rather than an IndexError.
        forecast = [
            {
                'datetime': day,
                'week': week,
                'weather': weather,
                'max_temperature': high,
                'min_temperature': low,
                'img_url': img_url,
            }
            for (day, week, img_url), weather, high, low
            in zip(days, weathers, highs, lows)
        ]
        return {city: forecast}

    @classmethod
    def parse_weather_7(cls, url, headers, city):
        """Download *url* and return its daily forecast keyed by *city*.

        Args:
            url: address of the city page.
            headers: HTTP headers to send with the request.
            city: key under which the forecast list is returned.

        Returns:
            The dict described in :meth:`parse_future_html`.
        """
        return cls.parse_future_html(
            cls._fetch(url, headers), city, base_url=url)

    def _city_url(self, city):
        """Return the page URL for *city* (its pinyin name joined onto Url)."""
        return urljoin(self.Url, Pinyin().get_pinyin(city, ''))

    def get_weather_now(self, city):
        """Return the current conditions for *city* (a Chinese city name)."""
        return self.parse_weather_now(self._city_url(city), self.Headers)

    def get_future_weather(self, city):
        """Return the daily forecast for *city*, keyed by the city name."""
        return self.parse_weather_7(
            self._city_url(city), self.Headers, city)
if __name__ == '__main__':
    # Demo: fetch and print the current weather for Beijing and the forecast
    # for Shanghai.  Guarded so that importing this module to reuse
    # WeatherParse does not trigger network requests or output.
    w = WeatherParse()
    now = w.get_weather_now('北京')
    future = w.get_future_weather('上海')
    print(now)
    print(future)