我刚开始使用漂亮的汤和 Python,想从本地电子商务网站获取数据。我已经成功地获取了标题、时间和 URL,但我对价格有困难。我要抓取的文本是<div class="amount--3NTpl">GH₵ 1,300,000</div>
. 我试过了,price = soup.find("div", class_="amount--3NTpl").text
但是当我尝试导出到 csv 时出现错误,但是当我完全删除价格时脚本可以工作。
这是我的脚本;
import requests
from bs4 import BeautifulSoup
import csv
def get_page(url):
response = requests.get(url)
if not response.ok:
print("Server responded:", response.status_code)
else:
soup = BeautifulSoup(response.text, "html.parser")
return soup
def get_detail_data(soup):
try:
title = soup.find("h1", class_="title--3s1R8").text
except:
title = ""
try:
date = soup.find("h3", class_="sub-title--37mkY").text
except:
date = ""
try:
price = soup.find("div", class_="amount--3NTpl").text
except:
price = ""
data = {
"title": title,
"date": date,
"price": price,
}
return data
def get_index_data(soup):
try:
links = soup.findAll("a", class_="card-link--3ssYv gtm-ad-item")
except:
links = []
urls = ["https://tonaton.com"+item.get("href") for item in links]
return urls
def write_csv(data, url):
with open("tonatonoutput.csv", "a") as csvfile:
writer = csv.writer(csvfile)
row = [data["title"], data["date"], data["price"], url]
writer.writerow(row)
def main():
url = "https://tonaton.com/en/ads/ghana/property?page=1"
products = get_index_data(get_page(url))
for link in products:
data = get_detail_data(get_page(link))
write_csv(data, link)
if __name__ == '__main__':
main()
错误信息;
Traceback (most recent call last):
File "C:/Users/MUG/PycharmProjects/ReProject/tonatonscrapper.py", line 72, in <module>
main()
File "C:/Users/MUG/PycharmProjects/ReProject/tonatonscrapper.py", line 68, in main
write_csv(data, link)
File "C:/Users/MUG/PycharmProjects/ReProject/tonatonscrapper.py", line 58, in write_csv
writer.writerow(row)
File "C:\Users\MUG\AppData\Local\Programs\Python\Python38\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\x82' in position 76: character maps to <undefined>
您可以尝试GH₵
从价格中剥离。
将获取价格值的代码更改为:
...
try:
price = soup.find("div", class_="amount--3NTpl").text
price = price.split(maxsplit=1)[-1]
except:
price = ""
...
本文收集自互联网,转载请注明来源。
如有侵权,请联系 [email protected] 删除。
我来说两句