我正在尝试从网站获取特定名称列表,以最终将它们输出到文件中。我正在解析的网站是这个
代码当然不完美,但令我困惑的是:为什么每组名称之间的输出中会出现空行?
from urllib.request import urlopen
from bs4 import BeautifulSoup as bS
import re
# get the internals links
def get_internals():
    """Return the hrefs of the per-letter pagination links on the home page.

    Fetches the site root, then collects the ``href`` of every ``<a>``
    inside ``<nav class="page-nav">`` whose href starts with "p".

    Returns:
        list[str]: relative URLs of the alphabet pages.
    """
    hrefs = []
    html = urlopen("http://127.0.0.1/www.prenom-marocain.com")
    soup = bS(html, "lxml")
    # Anchors whose href begins with "p" are the per-letter name pages.
    azlinks = soup.find("nav", {"class": "page-nav"}).findAll(
        "a", {"href": re.compile("^p.*$")}
    )
    for link in azlinks:
        hrefs.append(link.attrs["href"])
    return hrefs
# The function for fetching the names
def fetch_name(url):
    """Return the text of every ``<ul class="arrow">`` on the given page.

    Args:
        url: relative URL (as returned by ``get_internals``) appended to
            the site base URL.

    Returns:
        list[str]: one entry per ``<ul class="arrow">`` element. Pages
        without names yield empty strings — they are NOT filtered here,
        which is what produces blank lines when the caller prints them.
    """
    names = []
    html = urlopen("http://127.0.0.1/www.prenom-marocain.com/" + url)
    soup = bS(html, "lxml")
    for child in soup.findAll("ul", {"class": "arrow"}):
        names.append(child.getText())
    return names
# Collect every per-letter page, then print all names found on them.
alpha_array = get_internals()
first_names = []
for link in alpha_array:
    # BUG FIX: the original wrote `first_name += ...` (an undefined name,
    # raising NameError); the accumulator is `first_names`.
    first_names += fetch_name(link)
for name in first_names:
    print(name)
我怎样才能去掉输出中的空行?这是 array.append 方法的正常行为吗?把所有结果存储到文件中的最佳方式是什么?提前致谢!
在您的 fetch_name() 函数中,您把空字符串也添加到了数组里(在没有任何名称的页面上,<ul class="arrow"> 的文本就是空字符串)。只要把这些空字符串过滤掉即可。下面的代码会打印所有名称,且不再有空行:
from urllib.request import urlopen
from bs4 import BeautifulSoup as bS
import re
# get the internals links
def get_internals():
    """Return the hrefs of the per-letter pagination links on the home page.

    Fetches the site root, then collects the ``href`` of every ``<a>``
    inside ``<nav class="page-nav">`` whose href starts with "p".

    Returns:
        list[str]: relative URLs of the alphabet pages.
    """
    hrefs = []
    html = urlopen("http://www.prenom-marocain.com")
    soup = bS(html, "lxml")
    # Anchors whose href begins with "p" are the per-letter name pages.
    azlinks = soup.find("nav", {"class": "page-nav"}).findAll(
        "a", {"href": re.compile("^p.*$")}
    )
    for link in azlinks:
        hrefs.append(link.attrs["href"])
    return hrefs
# The function for fetching the names
def fetch_name(url):
    """Return the non-empty, stripped name texts found on the given page.

    Args:
        url: relative URL (as returned by ``get_internals``) appended to
            the site base URL.

    Returns:
        list[str]: stripped text of each ``<ul class="arrow">`` element,
        up to (and excluding) the first empty one.
    """
    names = []
    html = urlopen("http://www.prenom-marocain.com/" + url)
    soup = bS(html, "lxml")
    for child in soup.findAll("ul", {"class": "arrow"}):
        text = child.text.strip()
        if not text:
            # NOTE(review): `break` assumes empty <ul>s only appear after
            # all real names — use `continue` if empties can be interleaved.
            break
        names.append(text)
    return names
# Collect every per-letter page, then print all names found on them.
alpha_array = get_internals()
first_names = []
for link in alpha_array:
    first_names += fetch_name(link)
for name in first_names:
    print(name)
本文收集自互联网,转载请注明来源。
如有侵权,请联系 [email protected] 删除。
我来说两句