Python和BeautifulSoup:如何将JSON转换为CSV

戴森

我正在使用Python和BS4构建我的第一个网络爬虫。我想调查2018年KONA铁人三项世界锦标赛的计时数据。将JSON转换为CSV的最佳方法是什么?

from bs4 import BeautifulSoup, Comment
from collections import defaultdict
import json
import requests

sauce = 
'http://m.ironman.com/triathlon/events/americas/ironman/world- 
championship/results.aspx'


r = requests.get(sauce)
data = r.text

soup = BeautifulSoup(data, 'html.parser')


def parse_table(soup):
    result = defaultdict(list)
    my_table = soup.find('tbody')

    for node in my_table.children:
        if isinstance(node, Comment):
            # Get content and strip comment "<!--" and "-->"
            # Wrap the rows in "table" tags as well.
            data = '<table>{}</table>'.format(node[4:-3])
            break

    table = BeautifulSoup(data, 'html.parser')

    for row in table.find_all('tr'):
        name, _, swim, bike, run, div_rank, gender_rank, 
overall_rank = [col.text.strip() for col in row.find_all('td')[1:]]

        result[name].append({
            'div_rank': div_rank,
            'gender_rank': gender_rank,
            'overall_rank': overall_rank,
            'swim': swim,
            'bike': bike,
            'run': run,
        })

    return result

with open('data.json', 'w') as jsonfile:
    json.dump(parse_table(soup), jsonfile)

print(json.dumps(parse_table(soup), indent=3))

JSON输出包含运动员的姓名,其分工,性别和总体排名以及游泳,骑自行车和跑步时间:

{
    "Avila, Anthony 2470": [ {
        "div_rank": "138", "gender_rank": "1243", "overall_rank": "1565", "swim": "01:20:11", "bike": "05:27:59", "run": "04:31:56"
    }
    ],
    "Lindgren, Mikael 1050": [ {
        "div_rank": "151", "gender_rank": "872", "overall_rank": "983", "swim": "01:09:06", "bike": "05:17:51", "run": "03:49:20"
    }
    ],
    "Umezawa, Kazuyoshi 1870": [ {
        "div_rank": "229", "gender_rank": "1589", "overall_rank": "2186", "swim": "01:17:22", "bike": "06:14:45", "run": "07:16:21"
    }
    ],
    "Maric, Bojan 917": [ {
        "div_rank": "162", "gender_rank": "923", "overall_rank": "1065", "swim": "01:03:22", "bike": "05:13:56", "run": "04:01:45"
    }
    ],
    "Nishioka, Maki 2340": [ {
        "div_rank": "6", "gender_rank": "52", "overall_rank": "700", "swim": "00:58:40", "bike": "05:19:10", "run": "03:33:58"
    }...
}
chitown88

您可以考虑以下几件事。你可以和熊猫一起做.read_json()或者我所做的只是遍历键,您拥有的值,然后将其扔入数据框。一旦有了数据框,就可以编写csv。

from bs4 import BeautifulSoup, Comment
from collections import defaultdict
import json
import requests
import pandas as pd


sauce = 'http://m.ironman.com/triathlon/events/americas/ironman/world-championship/results.aspx'


r = requests.get(sauce)
data = r.text

soup = BeautifulSoup(data, 'html.parser')


def parse_table(soup):
    result = defaultdict(list)
    my_table = soup.find('tbody')

    for node in my_table.children:
        if isinstance(node, Comment):
            # Get content and strip comment "<!--" and "-->"
            # Wrap the rows in "table" tags as well.
            data = '<table>{}</table>'.format(node[4:-3])
            break

    table = BeautifulSoup(data, 'html.parser')

    for row in table.find_all('tr'):
        name, _, swim, bike, run, div_rank, gender_rank, overall_rank = [col.text.strip() for col in row.find_all('td')[1:]]

        result[name].append({
            'div_rank': div_rank,
            'gender_rank': gender_rank,
            'overall_rank': overall_rank,
            'swim': swim,
            'bike': bike,
            'run': run,
        })

    return result

jsonObj = parse_table(soup)

result = pd.DataFrame()
for k, v in jsonObj.items():

    temp_df = pd.DataFrame.from_dict(v)
    temp_df['name'] = k
    result = result.append(temp_df)

result = result.reset_index(drop=True)
result.to_csv('path/to/filename.csv', index=False)

输出:

print (result)
          bike               ...                                            name
0     05:27:59               ...                             Avila, Anthony 2470
1     05:17:51               ...                           Lindgren, Mikael 1050
2     06:14:45               ...                         Umezawa, Kazuyoshi 1870
3     05:13:56               ...                                Maric, Bojan 917
4     05:19:10               ...                             Nishioka, Maki 2340
5     04:32:26               ...                                   Rana, Ivan 18
6     04:49:08               ...                             Spalding, Joel 1006
7     04:50:10               ...                               Samuel, Mark 2479
8     06:45:57               ...                              Long, Felicia 1226
9     05:24:33               ...                         Mccarroll, Charles 1355
10    06:36:36               ...                              Freeman, Roger 154
11    --:--:--               ...                             Solis, Eduardo 1159
12    04:55:29               ...                         Schlohmann, Thomas 1696
13    05:39:18               ...                             Swinson, Renee 1568
14    04:40:41               ...                            Mechin, Antoine 2226
15    05:23:18               ...                            Hammond, Serena 1548
16    05:15:10               ...                               Hassel, Diana 810
17    06:15:59               ...                          Netto, Laurie-Ann 1559
18    --:--:--               ...                              Mazur, Maksym 1412
19    07:11:19               ...                        Weiskopf-Larson, Sue 870
20    05:49:02               ...                      Sosnowska, Aleksandra 1921
21    06:45:48               ...                              Wendel, Sandra 262
22    04:39:46               ...                            Oosterdijk, Tom 2306
23    06:03:01               ...                                 Moss, Julie 358
24    06:24:58               ...                          Borgio, Alessandro 726
25    05:07:42               ...                             Newsome, Jason 1058
26    04:44:46               ...                                Wild, David 2008
27    04:46:06               ...                               Weitz, Matti 2239
28    04:41:05               ...                                  Gyde, Sam 1288
29    05:27:55               ...                              Yamauchi, Akio 452
       ...               ...                                             ...
2442  04:38:36               ...                                  Lunn, Paul 916
2443  05:27:27               ...                            Van Soest, John 1169
2444  06:07:56               ...                                Austin, John 194
2445  05:20:26               ...                             Mcgrath, Scott 1131
2446  04:53:27               ...                                Pike, Chris 1743
2447  05:23:20               ...                                Ball, Duncan 722
2448  05:33:26               ...                 Fauske Haferkamp, Cathrine 1222
2449  05:17:34               ...                              Vocking, Peter 641
2450  05:15:30               ...                              Temme, Travis 1010
2451  07:14:14               ...                                 Sun, Shiyi 2342
2452  04:52:14               ...                             Li, Peng Cheng 2232
2453  06:26:26               ...                               Lloyd, Graham 148
2454  04:44:42               ...                           Bartelle, Daniel 1441
2455  04:51:58               ...                      Overmars, Koen Pieter 1502
2456  05:23:24               ...                                Muroya, Koji 439
2457  05:45:42               ...                         Brown, Ani De Leon 1579
2458  06:42:16               ...                               Peters, Nancy 370
2459  06:43:07               ...                          Albarote, Lezette 1575
2460  04:50:45               ...                               Mohr, Robert 1990
2461  07:17:40               ...                                Hirose, Roen 497
2462  05:12:10               ...                              Girardi, Bruno 312
2463  04:59:44               ...                              Cowan, Anthony 966
2464  06:03:59               ...                               Hoskens, Rudy 433
2465  04:32:20               ...                              Baker, Edward 1798
2466  05:11:07               ...                          Svetana, Ushakova 2343
2467  05:56:06               ...                              Peterson, Greg 779
2468  05:22:15               ...                            Wallace, Patrick 287
2469  05:53:14               ...                                   Lott, Jon 914
2470  05:00:29               ...                             Goodlad, Martin 977
2471  04:34:59               ...                                Maley, Joel 1840

[2472 rows x 7 columns]

额外

还要指出,数据已经作为json结构返回给您。唯一的区别是您需要计算查询参数以遍历页面,这比您的代码要慢得多,因此需要权衡取舍。除非您查看查询参数以一次返回每页30个结果,否则返回所有2460个结果。但这也是获取json结构的一种选择。

但是您可以采用json结构并将其规范化为数据框,然后另存为csv。

import requests
from pandas.io.json import json_normalize
import pandas as pd


request_url = 'http://m.ironman.com/Handlers/EventLiveResultsMobile.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}


page = ''
params = {
'year': '2018',
'race': 'worldchampionship',
'q': '',
'p': page,
'so': 'orank',
'sd': ''}

response = requests.get(request_url, headers=headers, params=params)
jsonObj = response.json()

lastPage = jsonObj['lastPage']

result = pd.DataFrame()
for page in range(1, lastPage):

    page = str(page)
    print ('Processed Page: '+ page)
    response = requests.get(request_url, headers=headers, params=params)
    jsonObj = response.json()

    temp_df = json_normalize(jsonObj['records'])
    result = result.append(temp_df)

result = result.reset_index(drop=True)
result.to_csv('path/to/filename.csv', index=False)

本文收集自互联网,转载请注明来源。

如有侵权,请联系 [email protected] 删除。

编辑于
0

我来说两句

0 条评论
登录 后参与评论

相关文章