Python如何爬取猫咪网站交易数据-柠檬ai自媒体

本期，边肖将给大家带来Python如何抓取猫网站交易数据的信息。文章内容丰富，从专业角度进行分析和叙述。看完这篇文章，希望你能有所收获。

00-1010当你看到一个可爱的猫咪表情包，总会忍不住去收藏。晒干部分如下：

Python如何爬取猫咪网站交易数据

从这个网站，我们抓取了猫咪品种介绍数据和20W猫咪交易数据，从而了解可爱猫咪。

00-1010打开猫和猫交易网，先抓取猫品种数据，打开页面看到猫和猫品种列表：

Python如何爬取猫咪网站交易数据

通过查看网页，我们可以发现网页的结构简单，易于分析和提取数据。爬虫代码如下：

导入请求

进口

导入csv

fromlxmlimportetree

fromtqdmimporttqdm

from fake _ user agentimportuser agent

#随机生成请求头。

ua=UserAgent(verify_ssl=False，路径='fake_useragent.json ')

Defrandom_ua():#用于随机切换请求头。

标题={ 0

接受-编码' :'gzip '，

接受-语言' :'zh-CN '，

连接' : '保持活动状态'，

主持人' :'www.maomijiaoyi.com，

用户代理' :ua.random

}

返回标题

Defcreate_csv():#创建csv以保存数据。

withopen('。/data/cat_kind.csv '，' w '，换行=' '，编码='utf-8')asf:

wr=csv.writer(f)

wr . writerrow(['品种'，'参考价'，'中文学名'，'别名'，'祖先'，'分布区'，

原点'，'体型'，'原始用途'，'今天用途'，'分组'，'高度'，

'体重', '寿命', '整体', '毛发', '颜色', '头部', '眼睛',
                     '耳朵', '鼻子', '尾巴', '胸部', '颈部', '前驱', '后驱',
                     '基本信息', 'FCI标准', '性格特点', '生活习性', '优点/缺点',
                     '喂养方法', '鉴别挑选'])
def scrape_page(url1):      # 获取HTML网页源代码返回文本
    response = requests.get(url1, headers=random_ua())
    # print(response.status_code)
    response.encoding = 'utf-8'
    return response.text
def get_cat_urls(html1):    # 获取每个品种猫咪详情页url
    dom = etree.HTML(html1)
    lis = dom.xpath('//div[@class="pinzhong_left"]/a')
    cat_urls = []
    for li in lis:
        cat_url = li.xpath('./@href')[0]
        cat_url = 'http://www.maomijiaoyi.com' + cat_url
        cat_urls.append(cat_url)
    return cat_urls
def get_info(html2):    # 爬取每个品种猫咪详情页里的有关信息
    # 品种
    kind = re.findall('div class="line1">.*?<div class="name">(.*?)<span>', html2, re.S)[0]
    kind = kind.replace('\r','').replace('\n','').replace('\t','')
    # 参考价格
    price = re.findall('<div>参考价格：</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    price = price.replace('\r', '').replace('\n', '').replace('\t', '')
    # 中文学名
    chinese_name = re.findall('<div>中文学名:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    chinese_name = chinese_name.replace('\r', '').replace('\n', '').replace('\t', '')
    # 别名
    other_name = re.findall('<div>别名:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    other_name = other_name.replace('\r', '').replace('\n', '').replace('\t', '')
    # 祖先
    ancestor = re.findall('<div>祖先:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    ancestor = ancestor.replace('\r', '').replace('\n', '').replace('\t', '')
    # 分布区域
    area = re.findall('<div>分布区域:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    area = area.replace('\r', '').replace('\n', '').replace('\t', '')
    # 原产地
    source_area = re.findall('<div>原产地:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    source_area = source_area.replace('\r', '').replace('\n', '').replace('\t', '')
    # 体型
    body_size = re.findall('<div>体型:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    body_size = body_size.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 原始用途
    source_use = re.findall('<div>原始用途:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    source_use = source_use.replace('\r', '').replace('\n', '').replace('\t', '')
    # 今日用途
    today_use = re.findall('<div>今日用途:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    today_use = today_use.replace('\r', '').replace('\n', '').replace('\t', '')
    # 分组
    group = re.findall('<div>分组:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    group = group.replace('\r', '').replace('\n', '').replace('\t', '')
    # 身高
    height = re.findall('<div>身高:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    height = height.replace('\r', '').replace('\n', '').replace('\t', '')
    # 体重
    weight = re.findall('<div>体重:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    weight = weight.replace('\r', '').replace('\n', '').replace('\t', '')
    # 寿命
    lifetime = re.findall('<div>寿命:</div>.*?<div>(.*?)</div>', html2, re.S)[0]
    lifetime = lifetime.replace('\r', '').replace('\n', '').replace('\t', '')
    # 整体
    entirety = re.findall('<div>整体</div>.*?.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    entirety = entirety.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 毛发
    hair = re.findall('<div>毛发</div>.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    hair = hair.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 颜色
    color = re.findall('<div>颜色</div>.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    color = color.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 头部
    head = re.findall('<div>头部</div>.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    head = head.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 眼睛
    eye = re.findall('<div>眼睛</div>.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    eye = eye.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 耳朵
    ear = re.findall('<div>耳朵</div>.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    ear = ear.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 鼻子
    nose = re.findall('<div>鼻子</div>.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    nose = nose.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 尾巴
    tail = re.findall('<div>尾巴</div>.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    tail = tail.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 胸部
    chest = re.findall('<div>胸部</div>.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    chest = chest.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 颈部
    neck = re.findall('<div>颈部</div>.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    neck = neck.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 前驱
    font_foot = re.findall('<div>前驱</div>.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    font_foot = font_foot.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 后驱
    rear_foot = re.findall('<div>前驱</div>.*?<div></div>.*?<div>(.*?)</div>', html2, re.S)[0]
    rear_foot = rear_foot.replace('\r', '').replace('\n', '').replace('\t', '').strip()
    # 保存前面猫猫的各种有关信息
    cat = [kind, price, chinese_name, other_name, ancestor, area, source_area,
           body_size, source_use, today_use, group, height, weight, lifetime,
           entirety, hair, color, head, eye, ear, nose, tail, chest, neck, font_foot, rear_foot]
    # 提取标签栏信息（基本信息-FCI标准-性格特点-生活习性-优缺点-喂养方法-鉴别挑选）
    html2 = etree.HTML(html2)
    labs = html2.xpath('//div[@class="property_list"]/div')
    for lab in labs:
        text1 = lab.xpath('string(.)')
        text1 = text1.replace('\n','').replace('\t','').replace('\r','').replace(' ','')
        cat.append(text1)
    return cat
def write_to_csv(data):     # 保存数据  追加写入
    with open('./data/cat_kind.csv', 'a+', newline='', encoding='utf-8') as fn:
        wr = csv.writer(fn)
        wr.writerow(data)
if __name__ == '__main__':
    # 创建保存数据的csv
    create_csv()
    # 猫咪品种页面url
    base_url = 'http://www.maomijiaoyi.com/index.php?/pinzhongdaquan_5.html'
    # 获取品种页面中的所有url
    html = scrape_page(base_url)
    urls = get_cat_urls(html)
    # 进度条可视化运行情况    就不打印东西来看了
    pbar = tqdm(urls)
    # 开始爬取
    for url in pbar:
        text = scrape_page(url)
        info = get_info(text)
        write_to_csv(info)

运行效果如下：

Python如何爬取猫咪网站交易数据

爬取更详细的数据需要进入详情页，包含商家信息、猫咪品种、猫龄、价格、标题、在售只数、预防等信息。

Python如何爬取猫咪网站交易数据

看各种猫咪的体型分布

Python如何爬取猫咪网站交易数据

橘猫是世界各地都有的，不愧是我大橘猫。俗话说 "十个橘猫九个胖还有一个压塌炕"。橘猫比起其他花色的猫咪更喜欢吃东西，它们的食欲很好，能更好地生存，可能这也是橘猫在世界范围都有的原因吧。可它却是小型猫，橘猫小时候颜值一般挺高，看起来小小的一只，又嫩又可爱的，但等橘猫长大以后，才真正地意识到什么是 "橘足轻重"。

Python如何爬取猫咪网站交易数据

橘猫的交易数量最多呀，之前也提到橘猫世界各地都有，从这里也可以看到橘猫数量最多。其次是咖啡猫，布偶猫，英短蓝白猫等。

Python如何爬取猫咪网站交易数据

售卖的猫咪猫龄主要在1-6个月，都是刚出生还未满半岁的小猫咪呀。这时候的小猫咪应该很可爱吧，等待有缘的主人把它带回家。

最后来看一下网站里价格最贵的猫咪和浏览次数最多的猫咪

import pandas as pd
df = pd.read_excel('处理后数据.xlsx')
print(df.info())
df1 = df.sort_values(by='浏览次数', ascending=False)
print(df1.iloc[:3, ::].values)
print('----------------------------------------------------------')
df2 = df.sort_values(by='价格', ascending=False)
print(df2.iloc[:3, ::].values)

# 浏览次数最多的
http://www.maomijiaoyi.com/index.php?/chanpinxiangqing_441879.html
http://www.maomijiaoyi.com/index.php?/chanpinxiangqing_462431.html
http://www.maomijiaoyi.com/index.php?/chanpinxiangqing_455366.html

Python如何爬取猫咪网站交易数据

反观浏览次数排第二、第三的，价格便宜不少，预防都打了3针疫苗，在售只数还比较充裕，还比第一可爱好多（个人感觉）。

# 价格最贵的如下
http://www.maomijiaoyi.com/index.php?/chanpinxiangqing_265770.html
http://www.maomijiaoyi.com/index.php?/chanpinxiangqing_281910.html
http://www.maomijiaoyi.com/index.php?/chanpinxiangqing_230417.html

Python如何爬取猫咪网站交易数据

价格最贵的发现均为 3000 元的布偶猫。查阅资料发现，布偶猫，大型猫咪，不仅购买的时候价格高昂，饲养成本也比较高，因为食量和运动量都比较大，而且美容等相关费用也会高一些。

上述就是小编为大家分享的Python如何爬取猫咪网站交易数据了，如果刚好有类似的疑惑，不妨参照上述分析进行理解。如果想知道更多相关知识，欢迎关注行业资讯频道。

内容来源网络，如有侵权，联系删除，本文地址：https://www.230890.com/zhan/49731.html

Python如何爬取猫咪网站交易数据

相关推荐

怎么利用Outlook来创建基于电子邮件的持久化后门

关于应用程序连接Oracle 12C ORA-01017问题是怎么解决的

Windows Java代码如何远程访问HBase

焦虑症状,抑郁症焦虑症都有些什么症状

怎么使用ImageModifier

大戴礼记,什么是辟谷

分享到：