51学通信论坛2017新版
标题:
基本html数据提取:
[打印本页]
作者:
admin
时间:
2019-6-23 14:59
标题:
基本html数据提取:
基本html数据提取:[attach]5803[/attach]
[attach]5802[/attach]
# Example_8_1.py
def getHTMLlines(htmlpath):
    """Read a UTF-8 encoded text file and return its lines.

    Args:
        htmlpath: Path of the HTML file to read.

    Returns:
        A list with one string per line. Each string keeps its line
        terminator, exactly as ``readlines()`` produces it.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the file even when reading or decoding fails.
    with open(htmlpath, "r", encoding='utf-8') as f:
        return f.readlines()
def extractImageUrls(htmllist):
    """Collect the ``src`` URLs of ``<img>`` tags found in lines of HTML.

    Every ``<img ... src="...">`` tag on a line is examined, so a line holding
    several images contributes several URLs. Both double- and single-quoted
    attribute values are accepted. Lines that merely contain the text "img"
    without a quoted ``src`` attribute are skipped instead of raising
    ``IndexError``.

    Args:
        htmllist: Iterable of strings, one per line of the HTML document.

    Returns:
        A list of the ``src`` values that contain "http", in document order.
        Relative paths and other values without "http" are left out.

    Note:
        Matching is done line by line; an ``<img>`` tag whose ``src``
        attribute sits on a different line than ``<img`` is not detected.
    """
    import re  # local import: the script has no module-level import section

    # "<img", any attributes, then whitespace + src = quoted value.
    # Requiring whitespace directly before "src" keeps look-alike attributes
    # such as "data-src" from being mistaken for the real one; the
    # backreference \1 makes the closing quote match the opening quote.
    img_src = re.compile(r'<img\b[^>]*?\ssrc\s*=\s*(["\'])(.*?)\1',
                         re.IGNORECASE)

    urls = []
    for line in htmllist:
        for match in img_src.finditer(line):
            url = match.group(2)
            if 'http' in url:
                urls.append(url)
    return urls
def showResults(urls):
    """Print every URL on its own line, prefixed with its running index.

    Args:
        urls: Iterable of URL strings to display.

    The index starts at 0 and is right-aligned in a field of width 2,
    so the printed lines stay aligned for up to 100 entries.
    """
    # enumerate supplies the counter; numbering stays 0-based as before.
    for count, url in enumerate(urls):
        print('第{:2}个URL:{}'.format(count, url))
def saveResults(filepath, urls):
    """Write the URLs to a text file, one URL per line.

    An existing file at ``filepath`` is overwritten.

    Args:
        filepath: Path of the text file to create.
        urls: Iterable of URL strings to store.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # UTF-8 matches the encoding used when the HTML is read, so the result
    # no longer depends on the platform's default encoding. The context
    # manager closes the file even if a write fails part-way through.
    with open(filepath, "w", encoding='utf-8') as f:
        f.writelines(url + "\n" for url in urls)
def main(inputfile='nationalgeographic.html',
         outputfile='nationalgeographic-urls.txt'):
    """Extract image URLs from an HTML file, print them and save them.

    Args:
        inputfile: Path of the HTML file to scan. Defaults to the sample
            page used by the example.
        outputfile: Path of the text file that receives the URLs, one per
            line. Defaults to the example's output name.

    Calling ``main()`` without arguments behaves as it did when the two
    file names were hard-coded.
    """
    htmlLines = getHTMLlines(inputfile)
    imageUrls = extractImageUrls(htmlLines)
    showResults(imageUrls)
    saveResults(outputfile, imageUrls)
# Run the extraction only when this file is executed as a script, so that
# importing it (e.g. to reuse the helper functions) has no side effects.
if __name__ == '__main__':
    main()
复制代码
运行结果:程序生成一个txt文件,其中每行保存一个从HTML页面的img标签中提取出的、包含“http”的图片URL。
[attach]5804[/attach]
欢迎光临 51学通信论坛2017新版 (http://bbs.51xuetongxin.com/)
Powered by Discuz! X3