I am scraping Google for PDF files (white papers) and want to save them as files instead of just listing them in the console.
Here is the code I have so far:
import requests, re
from docopt import docopt
from bs4 import BeautifulSoup
from time import time as timer
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin

def get_urls(search_string, start):
    # Empty temp list to store the URLs
    temp = []
    url = 'https://www.google.com/search'
    payload = { 'q' : search_string, 'start' : start }
    my_headers = { 'User-agent' : 'Mozilla/11.0' }
    r = requests.get( url, params = payload, headers = my_headers )
    soup = BeautifulSoup( r.text, 'html.parser' )
    h3tags = soup.find_all( 'h3', class_='r' )
    for h3 in h3tags:
        try:
            temp.append( re.search(r'url\?q=(.+?)&sa', h3.a['href']).group(1) )
        except AttributeError:
            continue
    return temp
def main():
    start = timer()
    # Empty list to store the URLs
    result = []
    arguments = docopt( __doc__, version='MakMan Google Scrapper & Mass Exploiter' )
    search = arguments['<search>']
    pages = arguments['<pages>']
    # Calling the function [pages] times.
    for page in range( 0, int(pages) ):
        # Getting the URLs in the list
        result.extend( get_urls( search, str(page*10) ) )
    # Removing duplicate URLs
    result = list( set( result ) )
    print( *result, sep = '\n' )
    print( '\nTotal URLs Scraped : %s ' % str( len( result ) ) )
    print( 'Script Execution Time : %s ' % ( timer() - start, ) )

if __name__ == '__main__':
    main()
#End
I tried adding:
with open('file.txt', 'w') as f:
    print( *result, file=f )
at the end to write the results to a file, but I am sure there is a simpler way to download the PDF files directly, without saving the links to a file first.
If you have the URL of a PDF file, you can use urllib.request.urlretrieve() (in Python 3; in Python 2 it was urllib.urlretrieve())
like this. This downloads the file to the current working directory and keeps its original name. Of course, you can specify any target path you want instead.
from os import path
from urllib.parse import urlparse
from urllib.request import urlretrieve
src_url = 'http://path/to/document.pdf'
tgt_path = path.split(urlparse(src_url).path)[-1]
urlretrieve(src_url, tgt_path)
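Putting the two together, you could feed the scraped URL list straight into urlretrieve() instead of writing it to file.txt. Below is a minimal sketch: download_pdfs() and pdf_name() are hypothetical helper names (not from the original post), and the assumption is that `result` is the de-duplicated list built in main() above. Links whose path does not end in .pdf are skipped, and dead links are ignored rather than aborting the whole batch.

```python
from os import path
from urllib.parse import urlparse
from urllib.request import urlretrieve

def pdf_name(url):
    """Return the file's basename if the URL path ends in .pdf, else None."""
    name = path.split(urlparse(url).path)[-1]
    return name if name.lower().endswith('.pdf') else None

def download_pdfs(urls, dest_dir='.'):
    """Download every PDF link into dest_dir, keeping the original names.

    Returns the list of paths that were actually saved.
    """
    saved = []
    for url in urls:
        name = pdf_name(url)
        if name is None:
            continue  # not a PDF link, skip it
        target = path.join(dest_dir, name)
        try:
            urlretrieve(url, target)
            saved.append(target)
        except OSError:
            continue  # dead or blocked link; move on to the next one
    return saved
```

In main() you would then call `download_pdfs(result)` right after the duplicates are removed, replacing the `print(*result, ...)` step if you only care about the files.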