豆瓣作为一款集书影音评论的网站，却还提供了各种其他有趣的服务，这更像是一个社区。往往这样的网站中会有非常多有趣的数据，今天就来尝试爬一下，做一个数据集出来。起初就在这个网站看到有一个专门的 Top250 榜单，官方说是根据书/电影/音乐所看过或听过的人数，以及该项目所得的评价等综合数据，通过算法分析产生的。那今天的主要目的就是爬一下这些 Top250。代码写得很水，希望能有人不吝赐教。
爬取目标
- 豆瓣电影 Top 250 >> 电影名,导演,年份,制片地区,电影分类,豆瓣评分,评论人数,描述 <<
- 豆瓣图书 Top 250 >> 书名,作者,售价,豆瓣评分,评论人数,描述 <<
- 豆瓣音乐 Top 250 >> 名称,作者,年份,类型,介质,风格,豆瓣评分,评论人数 <<
具体实现
爬取豆瓣电影 Top 250
""" @author: Yu @contact: kinomu@sina.com @file: DoubanMovieSpider.py @time: 2018/9/13 9:27 @desc: 豆瓣电影 Top 250 爬虫 """
import requests import csv from lxml import etree
filmsdata = [] root = 10000 count = 0
for i in range(10): url = 'https://movie.douban.com/top250?start=' + str(25*i) data = requests.get(url).text html = etree.HTML(data) films = html.xpath('//*[@id="content"]/div/div[1]/ol/li') for film in films: _title = film.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0] film_info = film.xpath('./div/div[2]/div[2]/p/text()') _director = film_info[0].strip('\n').strip('\xa0').strip(' ').split('\xa0')[0].split(':')[1] detailed = film_info[1].strip('\n').strip('\xa0').strip(' ') _year = detailed.split('/')[0].strip('\xa0') _area = detailed.split('/')[1].strip('\xa0') _type = detailed.split('/')[2].strip('\xa0').strip('\n')
_score = '\t'+film.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0] _count = film.xpath('./div/div[2]/div[2]/div/span[4]/text()')[0].split('人')[0] _content = film.xpath('./div/div[2]/div[2]/p[2]/span/text()')[0] count += 1 _index = count + root _all = [_index, _title, _director, _year, _area, _type, _score, _count, _content] filmsdata.append(_all)
csv_file = 'film_top250.csv' with open(csv_file, 'w', encoding='utf-8-sig', newline='') as _file: writer = csv.writer(_file) header = ['序号', '电影名', '导演', '年份', '制片地区', '电影分类', '豆瓣评分', '评论人数', '描述'] writer.writerow(header) for row in filmsdata: print(row) writer.writerow(row)
|
爬取豆瓣图书 Top 250
""" @author: Yu @contact: kinomu@sina.com @file: BookSpider.py @time: 2018/9/13 10:01 @desc: 豆瓣图书 Top 250 """
import requests import csv from lxml import etree
bookdata = [] root = 20000 count = 0 for i in range(10): url = 'https://book.douban.com/top250?start=' + str(25*i) data = requests.get(url).text html = etree.HTML(data) books = html.xpath('//*[@id="content"]/div/div[1]/div/table') for book in books: _name = book.xpath('./tr/td[2]/div/a/@title')[0] alias = book.xpath('./tr/td[2]/div/span[1]/text()') if len(alias) == 0: alias.append('(无)') _othername = alias[0] info = book.xpath('./tr/td[2]/p[1]/text()')[0].split('/') _author = info[0].strip().replace(' 口述', '').replace('著', '') if ']' in _author: _author = _author.split(']')[1].strip().replace(' ', '') if _author[0] == '(': _author = _author.split(')')[1].replace(' ', '') if '】' in _author: _author = _author.split('】')[1].replace(' ', '') _price = info[-1].replace('元', '').replace('RMB', '').replace('CNY','').replace(' ', '') _score = '\t'+book.xpath('./tr/td[2]/div[2]/span[2]/text()')[0] _count = book.xpath('./tr/td[2]/div[2]/span[3]/text()')[0]\ .replace('(', '').replace(')', '').replace('\n', '').strip().split('人')[0] _content = book.xpath('./tr/td[2]/p[2]/span/text()') if len(_content) == 0: _content.append('(无)') _content = _content[0] count += 1 _index = root + count bookdata.append([_index, _name, _author, _price, _score, _count, _content])
csv_file = 'book_top250.csv' with open(csv_file, 'w', encoding='utf-8-sig', newline='') as _file: writer = csv.writer(_file) header = ['序号', '书名', '作者', '售价', '豆瓣评分', '评论人数', '描述'] writer.writerow(header) for row in bookdata: print(row) writer.writerow(row)
|
爬取豆瓣音乐 Top 250
""" @author: Lin Yu @contact: kinomu@sina.com @file: MusicSpider.py @time: 2018/9/13 10:22 @desc: 豆瓣音乐 Top 250 爬虫 """ import requests import csv from lxml import etree
musicdata = [] root = 3000 count = 0
for i in range(10): url = 'https://music.douban.com/top250?start=' + str(25 * i) data = requests.get(url).text html = etree.HTML(data) music_set = html.xpath('//*[@id="content"]/div/div[1]/div/table') for music in music_set: title_author = music.xpath('./tr/td[1]/a/@title')[0].split('-') _title = title_author[1] _author = title_author[0] info = music.xpath('./tr/td[2]/div/p[1]/text()')[0].split(' / ') _date = '\t'+info[1]\ .replace('年', '-')\ .replace('月', '-')\ .replace('日', '-')\ .replace('/', '-')\ .strip('-') _type = info[2] if len(info) == 5: _media = info[3] _style = info[4] elif len(info) == 4: _media = info[-1] _style = '(未知)' _score = '\t'+music.xpath('./tr/td[2]/div/div[1]/span[2]/text()')[0] target = './tr/td[2]/div/div[1]/span[3]/text()' _count = music.xpath(target)[0].replace(')', '').replace('(', '').strip().split('人')[0] count += 1 _index = root + count musicdata.append([_index, _title, _author, _date, _type, _media, _style, _score, _count])
csv_file = 'music_top250.csv' with open(csv_file, 'w', encoding='utf-8-sig', newline='') as _file: writer = csv.writer(_file) header = ['序号', '名称', '作者', '年份', '类型', '介质', '风格', '豆瓣评分', '评论人数'] writer.writerow(header) for row in musicdata: print(row) writer.writerow(row)
|